webgui/lib/WebGUI/HTML.pm
2004-07-17 20:25:59 +00:00

344 lines
9.8 KiB
Perl

package WebGUI::HTML;
=head1 LEGAL
-------------------------------------------------------------------
WebGUI is Copyright 2001-2004 Plain Black LLC.
-------------------------------------------------------------------
Please read the legal notices (docs/legal.txt) and the license
(docs/license.txt) that came with this distribution before using
this software.
-------------------------------------------------------------------
http://www.plainblack.com info@plainblack.com
-------------------------------------------------------------------
=cut
use HTML::TagFilter;
use strict;
use WebGUI::Macro;
use WebGUI::Session;
use WebGUI::SQL;
use HTML::Parser;
use WebGUI::URL;
=head1 NAME
Package WebGUI::HTML
=head1 DESCRIPTION
A package for manipulating and massaging HTML.
=head1 SYNOPSIS
use WebGUI::HTML;
$html = WebGUI::HTML::cleanSegment($html);
$html = WebGUI::HTML::filter($html);
$html = WebGUI::HTML::format($content, $contentType);
$html = WebGUI::HTML::html2text($html);
$html = WebGUI::HTML::makeAbsolute($html);
$html = WebGUI::HTML::processReplacements($html);
=head1 METHODS
These methods are available from this package:
=cut
#-------------------------------------------------------------------
=head2 cleanSegment ( html )
Returns an HTML segment that has been stripped of the <BODY> tag and anything before it, as well as the </BODY> tag and anything after it. It's main purpose is to get rid of META tags and other garbage from an HTML page that will be used as a segment inside of another page.
NOTE: This filter does have one exception, it leaves anything before the <BODY> tag that is enclosed in <STYLE></STYLE> or <SCRIPT></SCRIPT> tags.
=over
=item html
The HTML segment you want cleaned.
=back
=cut
sub cleanSegment {
my $value = $_[0];
if ($value =~ s/\r/\n/g) {
$value =~ s/\n\n/\n/g
}
$value =~ m/(\<style.*?\/style\>)/ixsg;
my $style = $1;
$value =~ m/(\<script.*?\/script\>)/ixsg;
my $script = $1;
$value =~ s/\A.*?\<body.*?\>(.*?)/$1/ixsg;
$value =~ s/(.*?)\<\/body\>.*?\z/$1/ixsg;
return $script.$style.$value;
}
#-------------------------------------------------------------------
=head2 filter ( html [, filter ] )
Returns HTML with unwanted tags filtered out.
=over
=item html
The HTML content you want filtered.
=item filter
Choose from "all", "none", "macros", "javascript", or "most". Defaults to "most". "all" removes all HTML tags and macros; "none" removes no HTML tags; "javascript" removes all references to javacript and macros; "macros" removes all macros, but nothing else; and "most" removes all but simple formatting tags like bold and italics.
=back
=cut
sub filter {
my ($filter, $html, $type);
$type = $_[1];
if ($type eq "all") {
$filter = HTML::TagFilter->new(allow=>{'none'},strip_comments=>1);
$html = $filter->filter($_[0]);
return WebGUI::Macro::negate($html);
} elsif ($type eq "javascript") {
$html = $_[0];
$html =~ s/\<script.*?\/script\>//ixsg;
$html =~ s/(href="??)javascript\:.*?\)/$1removed/ixsg;
$html =~ s/onClick/removed/ixsg;
$html =~ s/onDblClick/removed/ixsg;
$html =~ s/onLoad/removed/ixsg;
$html =~ s/onMouseOver/removed/ixsg;
$html =~ s/onMouseOut/removed/ixsg;
$html =~ s/onMouseMove/removed/ixsg;
$html =~ s/onMouseUp/removed/ixsg;
$html =~ s/onMouseDown/removed/ixsg;
$html =~ s/onKeyPress/removed/ixsg;
$html =~ s/onKeyUp/removed/ixsg;
$html =~ s/onKeyDown/removed/ixsg;
$html =~ s/onSubmit/removed/ixsg;
$html =~ s/onReset/removed/ixsg;
$html = WebGUI::Macro::negate($html);
} elsif ($type eq "macros") {
return WebGUI::Macro::negate($_[0]);
} elsif ($type eq "none") {
return $_[0];
} else {
$filter = HTML::TagFilter->new; # defaultly strips almost everything
$html = $filter->filter($_[0]);
return WebGUI::Macro::filter($html);
}
}
#-------------------------------------------------------------------
=head2 format ( content [ , contentType ] )
Formats various text types into HTML.
=over
=item content
The text content to be formatted.
=item contentType
The content type to use as formatting. Valid types are 'html', 'text', 'code', and 'mixed'. Defaults to mixed. See also the contentType method in WebGUI::Form, WebGUI::HTMLForm, and WebGUI::FormProcessor.
=back
=cut
sub format {
my ($content, $contentType) = @_;
$contentType = 'mixed' unless ($contentType);
if ($contentType eq "mixed") {
unless ($content =~ /\<div/ig || $content =~ /\<br/ig || $content =~ /\<p/ig) {
$content =~ s/\n/\<br \/\>/g;
}
} elsif ($contentType eq "text") {
$content =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
$content =~ s/\n/\<br \/\>/g;
} elsif ($contentType eq "code") {
$content =~ s/&/&amp;/g;
$content =~ s/\</&lt;/g;
$content =~ s/\>/&gt;/g;
$content =~ s/\n/\<br \/\>/g;
$content =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
$content =~ s/ /&nbsp;/g;
$content = '<div style="font-family: monospace;">'.$content.'</div>';
}
return $content;
}
#-------------------------------------------------------------------
=head2 html2text ( html )
Converts html to text. It currently handles only text, so tables
or forms are not converted.
=over
=item html
The html segment you want to convert to text.
=back
=cut
sub html2text {
my $html = shift;
$session{temp}{html2text}{text} = "";
delete($session{temp}{html2text}{inside});
my $tagHandler = sub {
my($tag, $num) = @_;
$session{temp}{html2text}{inside}{$tag} += $num;
if($tag eq "br" || $tag eq "p") {
$session{temp}{html2text}{text} .= "\n";
}
};
my $textHandler = sub {
return if $session{temp}{html2text}{inside}{script} || $session{temp}{html2text}{inside}{style};
if ($_[0] =~ /\S+/) {
$session{temp}{html2text}{text} .= $_[0];
}
};
HTML::Parser->new(api_version => 3,
handlers => [start => [$tagHandler, "tagname, '+1'"],
end => [$tagHandler, "tagname, '-1'"],
text => [$textHandler, "dtext"],
],
marked_sections => 1,
)->parse($html);
return $session{temp}{html2text}{text};
}
#-------------------------------------------------------------------
=head2 makeAbsolute ( html , [ baseURL ] )
Returns html with all relative links converted to absolute.
=over
=item html
The html to be made absolute.
=item baseURL
The base URL to use. Defaults to current page's url.
=back
=cut
sub makeAbsolute {
my $html = shift;
my $baseURL = shift;
$session{temp}{makeAbsolute}{html} = "";
my $linkParser = sub {
my ($tagname, $attr, $text) = @_;
my %linkElements = # from HTML::Element.pm
(
body => 'background',
base => 'href',
a => 'href',
img => [qw(src lowsrc usemap)], # lowsrc is a Netscape invention
form => 'action',
input => 'src',
'link' => 'href', # need quoting since link is a perl builtin
frame => 'src',
iframe => 'src',
applet => 'codebase',
area => 'href',
script => 'src',
iframe => 'src',
);
if(not exists $linkElements{$tagname}) { # no need to touch this tag
$session{temp}{makeAbsolute}{html} .= $text;
return;
}
# Build a hash with tag attributes
my %tag_attr;
for my $tag (keys %linkElements) {
my $tagval = $linkElements{$tag};
for my $attr (ref $tagval ? @$tagval : $tagval) {
$tag_attr{"$tag $attr"}++;
}
}
$session{temp}{makeAbsolute}{html} .= "<".$tagname;
foreach (keys %$attr) {
if($_ eq '/') {
$session{temp}{makeAbsolute}{html} .= '/';
next;
}
if ($tag_attr{"$tagname $_"}) { # make this absolute
$attr->{$_} = WebGUI::URL::makeAbsolute($attr->{$_}, $baseURL);
}
$session{temp}{makeAbsolute}{html} .= qq' $_="$attr->{$_}"';
}
$session{temp}{makeAbsolute}{html} .= '>';
};
HTML::Parser->new(
default_h => [ sub { $session{temp}{makeAbsolute}{html} .= shift }, 'text' ],
start_h => [ $linkParser , 'tagname, attr, text' ],
)->parse($html);
return $session{temp}{makeAbsolute}{html};
}
#-------------------------------------------------------------------
=head2 processReplacements ( content )
Processes text using the WebGUI replacements system.
=over
=item content
The content to be processed through the replacements filter.
=back
=cut
sub processReplacements {
my ($content) = @_;
if (exists $session{replacements}) {
my $replacements = $session{replacements};
foreach my $searchFor (keys %{$replacements}) {
my $replaceWith = $replacements->{$searchFor};
$content =~ s/\Q$searchFor/$replaceWith/gs;
}
} else {
my $sth = WebGUI::SQL->read("select searchFor,replaceWith from replacements",WebGUI::SQL->getSlave);
while (my ($searchFor,$replaceWith) = $sth->array) {
$session{replacements}{$searchFor} = $replaceWith;
$content =~ s/\Q$searchFor/$replaceWith/gs;
}
$sth->finish;
}
return $content;
}
1;