#-------------------------------------------------------------------

=head2 html2text ( html )

Converts html to plain text. Only running text is handled, so tables
and forms are not laid out in any special way. A newline is emitted for
every start and end tag of br and p elements, text inside script and
style elements is discarded, and text chunks consisting solely of
whitespace are skipped.

=over

=item html

The html segment you want to convert to text. An undefined value yields
an empty string.

=back

=cut

sub html2text {
    my $html = shift;
    return "" unless defined $html;

    # Parser state is kept in lexicals captured by the handlers rather than
    # in a shared global, so every call starts clean and nothing leaks out.
    my $text   = "";
    my %inside;    # tag name => current nesting depth

    my $tagHandler = sub {
        my ($tag, $num) = @_;
        $inside{$tag} += $num;
        # Called for both start and end tags, so <p>...</p> yields a
        # newline before and after the paragraph.
        if ($tag eq "br" || $tag eq "p") {
            $text .= "\n";
        }
    };

    my $textHandler = sub {
        # Compare against zero instead of testing truth: an unmatched
        # closing tag drives the counter negative, and a negative count
        # must not be mistaken for "inside the element".
        return if ($inside{script} || 0) > 0;
        return if ($inside{style}  || 0) > 0;
        $text .= $_[0] if $_[0] =~ /\S/;
    };

    my $parser = HTML::Parser->new(
        api_version => 3,
        handlers    => [
            start => [$tagHandler,  "tagname, '+1'"],
            end   => [$tagHandler,  "tagname, '-1'"],
            text  => [$textHandler, "dtext"],
        ],
        marked_sections => 1,
    );
    $parser->parse($html);
    # parse() may hold back trailing text while waiting for another chunk;
    # eof() tells the parser the document is complete and flushes it.
    $parser->eof;

    return $text;
}
#-------------------------------------------------------------------

=head2 makeAbsolute ( html , [ baseURL ] )

Returns html with all relative links converted to absolute. Start tags
that can carry a link are rebuilt with their attributes in the original
order and their values re-escaped; everything else is passed through
unchanged.

=over

=item html

The html to be made absolute. An undefined value yields an empty string.

=item baseURL

The base URL to use. Defaults to current page's url. The value is handed
to WebGUI::URL::makeAbsolute as is, including when it is omitted.

=back

=cut

sub makeAbsolute {
    my $html    = shift;
    my $baseURL = shift;
    return "" unless defined $html;

    # Tags and the attribute(s) of each that hold a URL (from HTML::Element.pm).
    my %linkElements = (
        body   => 'background',
        base   => 'href',
        a      => 'href',
        img    => [qw(src lowsrc usemap)],    # lowsrc is a Netscape invention
        form   => 'action',
        input  => 'src',
        'link' => 'href',    # need quoting since link is a perl builtin
        frame  => 'src',
        iframe => 'src',
        applet => 'codebase',
        area   => 'href',
        script => 'src',
    );

    # Flatten the table once into "tag attribute" keys for cheap lookups,
    # instead of rebuilding it for every start tag encountered.
    my %isLinkAttr;
    for my $tag (keys %linkElements) {
        my $names = $linkElements{$tag};
        for my $name (ref $names ? @$names : $names) {
            $isLinkAttr{"$tag $name"} = 1;
        }
    }

    my $absolute = "";

    my $linkParser = sub {
        my ($tagname, $attr, $attrseq, $text) = @_;

        if (not exists $linkElements{$tagname}) {    # no need to touch this tag
            $absolute .= $text;
            return;
        }

        $absolute .= "<" . $tagname;

        my $selfClosing = 0;
        my %seen;
        # Walk attrseq rather than keys %$attr so the attributes come out
        # in the order they were written in the source tag.
        for my $name (@$attrseq) {
            if ($name eq '/') {
                # XHTML style "<br />": remember it and emit the slash
                # at the very end of the tag, never in the middle.
                $selfClosing = 1;
                next;
            }
            next if $seen{$name}++;    # a repeated attribute is emitted once

            my $value = $attr->{$name};
            if ($isLinkAttr{"$tagname $name"}) {     # make this absolute
                $value = WebGUI::URL::makeAbsolute($value, $baseURL);
            }
            $value = "" unless defined $value;

            # The parser hands over entity-decoded values, so they have to
            # be escaped again before being placed inside double quotes.
            $value =~ s/&/&amp;/g;
            $value =~ s/"/&quot;/g;
            $value =~ s/</&lt;/g;
            $value =~ s/>/&gt;/g;

            $absolute .= qq' $name="$value"';
        }

        $absolute .= $selfClosing ? ' />' : '>';
    };

    my $parser = HTML::Parser->new(
        api_version => 3,
        default_h   => [ sub { $absolute .= shift }, 'text' ],
        start_h     => [ $linkParser, 'tagname, attr, attrseq, text' ],
    );
    $parser->parse($html);
    # parse() may hold back trailing text while waiting for another chunk;
    # eof() tells the parser the document is complete and flushes it.
    $parser->eof;

    return $absolute;
}