added a new function to WebGUI::HTML to parse html text and extract a list of elements based on a tag

spec. used to extract paragraphs for the synopsis; convert FeedPP properties to scalar context so that
blank fields are rendered properly.
This commit is contained in:
David Delikat 2009-01-23 03:41:49 +00:00
parent 2893944837
commit 16ce54bf4f
4 changed files with 61 additions and 8 deletions

View file

@ -14,6 +14,7 @@ package WebGUI::HTML;
=cut
use HTML::TokeParser;
use HTML::TagFilter;
use strict;
use WebGUI::Macro;
@ -36,6 +37,7 @@ A package for manipulating and massaging HTML.
$html = WebGUI::HTML::html2text($html);
$html = WebGUI::HTML::makeAbsolute($session, $html);
$html = WebGUI::HTML::processReplacements($session, $html);
@items = WebGUI::HTML::splitTag([$tag,]$html[,$count]); # defaults to ( 'p', $html, 1 ); returns a list
=head1 METHODS
@ -396,5 +398,51 @@ sub processReplacements {
return $content;
}
#-------------------------------------------------------------------
=head2 WebGUI::HTML::splitTag([$tag,]$html[,$count]);

Splits a block of HTML into a list holding the text found inside each
occurrence of a single tag. Items that are empty, or that contain nothing but
whitespace and non-printable characters, are skipped.

Returns the list of extracted text items, in document order. In scalar context
the call evaluates to the number of items found, not to the first item.

=head3 tag

The HTML tag to extract from the text. Optional; defaults to 'p', giving a list
of paragraphs. The tag name is matched case-insensitively.

=head3 html

The block of HTML text that will be dissected. Note that when only two
arguments are passed and the second consists solely of digits, the call is read
as C<splitTag($html, $count)>.

=head3 count

How many non-blank items to return. Optional; defaults to 1. A count of 0 is
treated as the default of 1.

=cut

sub splitTag {
    my $tag   = shift;
    my $html  = shift;
    my $count = shift || 1;
    my @result;

    # Short call forms: splitTag($html) and splitTag($html, $count). The
    # first argument is then the HTML and the tag falls back to 'p'.
    if ( !defined $html ) {
        $html = $tag;
        $tag  = 'p';
    }
    elsif ( $html =~ /^(\d+)$/ ) {
        # Read the capture only right after its own successful match; a
        # stale $1 left over from the caller must never decide the count.
        $count = $1 || $count;
        $html  = $tag;
        $tag   = 'p';
    }

    # Nothing to parse: hand back the empty list (0 in scalar context).
    return @result unless defined $html;

    # HTML::TokeParser reports tag names in lower case, so normalise ours.
    $tag = 'p' unless defined $tag && length $tag;
    $tag = lc $tag;

    # Drop non-breaking-space entities so that items made up only of them
    # count as blank. $html is a private copy; the caller's data is untouched.
    $html =~ s/&nbsp;//g;

    my $parser = HTML::TokeParser->new( \$html );
    while ( $parser->get_tag($tag) ) {
        my $text = $parser->get_trimmed_text("/$tag");

        # Skip blank items. POSIX classes only work inside a bracketed
        # character class; written bare they match literal letters.
        next if $text =~ /\A[[:space:][:^print:]]*\z/;

        push @result, $text;
        last if @result == $count;    # we have a full count, so quit
    }

    return @result;
}
1;