webgui/lib/WebGUI/HTML.pm
Colin Kuskie 4376a6770e Update POD in HTML.pm, 'html' is not an option for format content.
Fix Comment formatting in the AssetAspect.
2009-07-10 23:40:57 +00:00

500 lines
13 KiB
Perl

package WebGUI::HTML;
=head1 LEGAL
-------------------------------------------------------------------
WebGUI is Copyright 2001-2009 Plain Black Corporation.
-------------------------------------------------------------------
Please read the legal notices (docs/legal.txt) and the license
(docs/license.txt) that came with this distribution before using
this software.
-------------------------------------------------------------------
http://www.plainblack.com info@plainblack.com
-------------------------------------------------------------------
=cut
use HTML::TokeParser;
use HTML::TagFilter;
use strict;
use WebGUI::Macro;
use HTML::Parser;
=head1 NAME
Package WebGUI::HTML
=head1 DESCRIPTION
A package for manipulating and massaging HTML.
=head1 SYNOPSIS
use WebGUI::HTML;
$html = WebGUI::HTML::cleanSegment($html);
$html = WebGUI::HTML::filter($html);
$html = WebGUI::HTML::format($content, $contentType);
$html = WebGUI::HTML::html2text($html);
$html = WebGUI::HTML::makeAbsolute($session, $html);
$html = WebGUI::HTML::processReplacements($session, $html);
$html = WebGUI::HTML::splitTag([$tag,]$html[,$count]); # defaults to ( 'p', $html, 1 )
=head1 METHODS
These methods are available from this package:
=cut
#-------------------------------------------------------------------
=head2 cleanSegment ( html , preserveStyleScript )
Returns an HTML segment that has been stripped of the <BODY> tag and anything before it, as well as the </BODY> tag and anything after it. It's main purpose is to get rid of META tags and other garbage from an HTML page that will be used as a segment inside of another page.
=head3 html
The HTML segment you want cleaned.
=head3 preserveStyleScript
With this option set, <style> and <script> tags will be preserved in the output.
=cut
sub cleanSegment {
my $html = shift;
my $preserveStyleScript = shift;
my $headers = "";
if ($html =~ s{^(.*)<body\b[^>]*>}{}is && $preserveStyleScript) {
my $head = $1;
# extract every link tag
while ( $head =~ m{(<link\b[^>]+>)}isg ) {
$headers .= $1;
}
# extract every script or style tag
while ($head =~ m{(<(script|style)\b.*?</\2>)}isg) {
$headers .= $1;
}
}
$html =~ s{</body>.*}{}is;
# remove windows carriage returns
$html =~ s/\r\n/\n/g;
$html =~ s/\r/\n/g;
return $headers . $html;
}
#-------------------------------------------------------------------
=head2 filter ( html [, filter ] )
Returns HTML with unwanted tags filtered out.
=head3 html
The HTML content you want filtered.
=head3 filter
Choose from "all", "none", "macros", "javascript", or "most". Defaults to "most". "all" removes all HTML tags and macros; "none" removes no HTML tags; "javascript" removes all references to javacript and macros; "macros" removes all macros, but nothing else; and "most" removes all but simple formatting tags like bold and italics.
=cut
sub filter {
my $html = shift;
my $type = shift;
if ($type eq "all") {
#Hash used to keep track of depth within tags
my %html_parser_inside_tag;
#String containing text output from HTML::Parser
my $html_parser_text = "" ;
#Hash containing HTML tags (as keys) that create whitespace when rendered by the browser
my %html_parser_whitespace_tags = ('p'=>1, 'br'=>1, 'hr'=>1, 'td'=>1, 'th'=>1,
'tr'=>1, 'table'=>1, 'ul'=>1, 'li'=>1, 'div'=>1) ;
#HTML::Parser event handler called at the start and end of each HTML tag, adds whitespace (if necessary)
#to output if the tag creates whitespace. This was done to keep text from running together inappropriately.
my $html_parser_tag_sub = sub {
my($tag, $num) = @_;
$html_parser_inside_tag{$tag} += $num;
if ($html_parser_whitespace_tags{$tag} &&
($html_parser_text =~ /\S$/)) { #add space only if no preceeding space
$html_parser_text .= " " ;
}
} ;
#HTML::Parser event handler called with non-tag text (no tags)
my $html_parser_text_sub = sub {
return undef if $html_parser_inside_tag{script} || $html_parser_inside_tag{style}; # do not output text
$html_parser_text .= $_[0] ;
} ;
my $parser = HTML::Parser->new(api_version => 3,
handlers => [start => [$html_parser_tag_sub,
"tagname, '+1'"],
end => [$html_parser_tag_sub,
"tagname, '-1'"],
text => [$html_parser_text_sub,
"text"]
],
marked_sections => 1,
) ;
$parser->parse($html) ;
$parser->eof() ;
$html = $html_parser_text ;
$html =~ s/&nbsp;/ /ixsg ;
WebGUI::Macro::negate(\$html);
} elsif ($type eq "javascript") {
$html =~ s/\<\s*script.*?\/script\s*\>//ixsg;
$html =~ s/(href="??)javascript\:.*?\)/$1removed/ixsg;
$html =~ s/onClick/removed/ixsg;
$html =~ s/onDblClick/removed/ixsg;
$html =~ s/onLoad/removed/ixsg;
$html =~ s/onMouseOver/removed/ixsg;
$html =~ s/onMouseOut/removed/ixsg;
$html =~ s/onMouseMove/removed/ixsg;
$html =~ s/onMouseUp/removed/ixsg;
$html =~ s/onMouseDown/removed/ixsg;
$html =~ s/onKeyPress/removed/ixsg;
$html =~ s/onKeyUp/removed/ixsg;
$html =~ s/onKeyDown/removed/ixsg;
$html =~ s/onSubmit/removed/ixsg;
$html =~ s/onReset/removed/ixsg;
WebGUI::Macro::negate(\$html);
} elsif ($type eq "macros") {
WebGUI::Macro::negate(\$html);
} elsif ($type eq "none") {
# do nothing
} else {
my $filter = HTML::TagFilter->new; # defaultly strips almost everything
$html = $filter->filter($html);
WebGUI::Macro::filter(\$html);
}
return $html;
}
#-------------------------------------------------------------------
=head2 format ( content [ , contentType ] )
Formats various text types into HTML.
=head3 content
The text content to be formatted.
=head3 contentType
The content type to use as formatting. Valid types are 'text', 'code', and 'mixed'. The default contentType is 'mixed'.
See also the contentType method in WebGUI::Form, WebGUI::HTMLForm, and WebGUI::FormProcessor.
=cut
sub format {
my ($content, $contentType) = @_;
$contentType = 'mixed' unless ($contentType);
if ($contentType eq "text" || $contentType eq "code") {
$content =~ s/&/&amp;/g;
$content =~ s/\</&lt;/g;
$content =~ s/\>/&gt;/g;
$content =~ s/\n/\<br \/\>\n/g;
$content =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
}
if ($contentType eq "mixed") {
unless ($content =~ /\<div/ig || $content =~ /\<br/ig || $content =~ /\<p/ig) {
$content =~ s/\n/\<br \/\>\n/g;
}
} elsif ($contentType eq "text") {
$content =~ s/ / &nbsp;/g;
} elsif ($contentType eq "code") {
$content =~ s/ /&nbsp;/g;
$content = '<div style="font-family: monospace;">'.$content.'</div>';
}
return $content;
}
#-------------------------------------------------------------------
=head2 html2text ( html )
Converts html to text. It currently handles only text, so tables
or forms are not converted.
=head3 html
The html segment you want to convert to text.
=cut
# for recursive function
my $text = "";
my $inside = {};
sub html2text {
my $html = shift() . " ";
$text = "";
$inside = {};
my $tagHandler = sub {
my($tag, $num) = @_;
$inside->{$tag} += $num;
if($tag eq "br" || $tag eq "p") {
$text .= "\n";
}
};
my $textHandler = sub {
return undef if $inside->{script} || $inside->{style};
if ($_[0] =~ /\S+/) {
$text .= $_[0];
}
};
HTML::Parser->new(api_version => 3,
handlers => [start => [$tagHandler, "tagname, '+1'"],
end => [$tagHandler, "tagname, '-1'"],
text => [$textHandler, "dtext"],
],
marked_sections => 1,
)->parse($html);
return $text;
}
#-------------------------------------------------------------------
=head2 makeAbsolute ( session, html , [ baseURL ] )
Returns html with all relative links converted to absolute.
=head3 session
A reference to the current session.
=head3 html
The html to be made absolute.
=head3 baseURL
The base URL to use. Defaults to current page's url.
=cut
my $absolute = "";
sub makeAbsolute {
my $session = shift;
my $html = shift;
my $baseURL = shift;
$absolute = "";
my $linkParser = sub {
my ($tagname, $attr, $text) = @_;
my %linkElements = # from HTML::Element
(
body => 'background',
base => 'href',
a => 'href',
img => [qw(src lowsrc usemap)], # lowsrc is a Netscape invention
form => 'action',
input => 'src',
'link' => 'href', # need quoting since link is a perl builtin
frame => 'src',
iframe => 'src',
applet => 'codebase',
area => 'href',
script => 'src',
iframe => 'src',
);
if(not exists $linkElements{$tagname}) { # no need to touch this tag
$absolute .= $text;
return undef;
}
# Build a hash with tag attributes
my %tag_attr;
for my $tag (keys %linkElements) {
my $tagval = $linkElements{$tag};
for my $attr (ref $tagval ? @$tagval : $tagval) {
$tag_attr{"$tag $attr"}++;
}
}
$absolute .= "<".$tagname;
my $foundClosingSlash;
foreach (keys %$attr) {
if($_ eq '/') {
$foundClosingSlash = '1';
next;
}
if ($tag_attr{"$tagname $_"}) { # make this absolute
$attr->{$_} = $session->url->makeAbsolute($attr->{$_}, $baseURL);
}
$absolute .= qq' $_="$attr->{$_}"';
}
$absolute .= '/' if ($foundClosingSlash);
$absolute .= '>';
};
HTML::Parser->new(
default_h => [ sub { $absolute .= shift }, 'text' ],
start_h => [ $linkParser , 'tagname, attr, text' ],
)->parse($html);
return $absolute;
}
#-------------------------------------------------------------------
=head2 makeParameterSafe ( text )
Encodes text to make it safe to embed in a macro by HTML encoding commas and quotes.
=head3 html
A reference to the text to be encoded.
=cut
sub makeParameterSafe {
my $text = shift;
${ $text } =~ s/,/&#44;/g;
${ $text } =~ s/'/&#39;/g;
return undef;
}
#-------------------------------------------------------------------
=head2 processReplacements ( session, content )
Processes text using the WebGUI replacements system.
=head3 session
A reference to the current session.
=head3 content
The content to be processed through the replacements filter.
=cut
sub processReplacements {
my $session = shift;
my ($content) = @_;
my $replacements = $session->stow->get("replacements");
if (defined $replacements) {
foreach my $searchFor (keys %{$replacements}) {
my $replaceWith = $replacements->{$searchFor};
$content =~ s/\Q$searchFor/$replaceWith/gs;
}
} else {
my $sth = $session->dbSlave->read("select searchFor,replaceWith from replacements");
while (my ($searchFor,$replaceWith) = $sth->array) {
$replacements->{$searchFor} = $replaceWith;
$content =~ s/\Q$searchFor/$replaceWith/gs;
}
$sth->finish;
$session->stow->set("replacements",$replacements);
}
return $content;
}
#-------------------------------------------------------------------
=head2 WebGUI::HTML::splitTag([$tag,]$html[,$count]);
splits an block of HTML into an array based on the contents of a single tag
=head3 tag
The HTML tag top extract from the text. this defaults to 'p' giving a list of paragraphs
=head3 html
The block of HTML text that will be disected
=head3 count
How many items do we want? defaults to 1; returns 1 non-blank item; -1 returns all items
=cut
sub splitTag {
my $tag = shift;
my $html = shift;
my $count = shift || 1;
if( not defined $html or $html =~ /^(-?\d+)$/ ) {
$count = $html if $1;
$html = $tag;
$tag = 'p'; # the default tag is 'p' -- grabs a paragraph
}
my @result;
my $p = HTML::TokeParser->new(\$html);
while (my $token = $p->get_tag($tag)) {
my $text = $p->get_trimmed_text("/$tag");
utf8::upgrade($text); ##PATCH to work around HTML::Entities and DBD::mysql
next if $text =~ /^([[:space:]]|[[:^print:]])*$/; # skip whitespace
push @result, $text; # add the text between the tags to the result array
last if @result == $count; # if we have a full count then quit
}
return @result if wantarray;
return $result[0];
}
#-------------------------------------------------------------------
=head2 WebGUI::HTML::splitHeadBody($html);
splits an block of HTML into a HEAD and a BODY section
=head3 html
The block of HTML text that will be disected
=cut
sub splitHeadBody {
my $html = shift;
my $parser = HTML::Parser->new(api_version => 3);
my $head = '';
my $body = '';
my $accum;
$parser->handler(start => sub {
my ($tag, $text) = @_;
if ($tag eq 'head') {
$accum = \$head;
}
elsif ($tag eq 'body') {
$accum = \$body;
}
elsif ($accum) {
$$accum .= $text;
}
}, 'tagname, text');
$parser->handler(end => sub {
my ($tag, $text) = @_;
if ($tag eq 'head' || $tag eq 'body') {
$accum = undef;
}
elsif ($accum) {
$$accum .= $text;
}
}, 'tagname, text');
$parser->handler(default => sub {
my ($tag, $text) = @_;
if ($accum) {
$$accum .= $text;
}
}, 'tagname, text');
$parser->parse($html);
return ($head, $body);
}
1;