From 16ce54bf4f4a5ee70845bd31ab961b1f821c0e8b Mon Sep 17 00:00:00 2001 From: David Delikat Date: Fri, 23 Jan 2009 03:41:49 +0000 Subject: [PATCH] added a new function to WebGUI::HTML to parse html text and extract a list of elements based on a tag spec. used to extract paragraphs for the synopsis; convert FeedPP properties to scalar context so that blank fields are rendered properly. --- docs/changelog/7.x.x.txt | 1 + lib/WebGUI/Asset/Post.pm | 12 +++-- lib/WebGUI/Asset/Wobject/SyndicatedContent.pm | 8 ++-- lib/WebGUI/HTML.pm | 48 +++++++++++++++++++ 4 files changed, 61 insertions(+), 8 deletions(-) diff --git a/docs/changelog/7.x.x.txt b/docs/changelog/7.x.x.txt index ae18a4283..5a490c178 100644 --- a/docs/changelog/7.x.x.txt +++ b/docs/changelog/7.x.x.txt @@ -1,4 +1,5 @@ 7.6.10 + - fixed #9455: fixed synopsis to pick out html paragraphs, fixed FeedPP fields to scalar when blank - fixed: With autocommit and no comments on, making a shortcut of an asset takes you to that asset's view. 7.6.9 diff --git a/lib/WebGUI/Asset/Post.pm b/lib/WebGUI/Asset/Post.pm index 79d3770a7..3332d4e71 100644 --- a/lib/WebGUI/Asset/Post.pm +++ b/lib/WebGUI/Asset/Post.pm @@ -548,11 +548,15 @@ sub getSynopsisAndContent { my $synopsis = shift; my $body = shift; unless ($synopsis) { - $body =~ s/\n/\^\-\;/ unless ($body =~ m/\^\-\;/); - my @content = split(/\^\-\;/,$body); - $synopsis = WebGUI::HTML::filter($content[0],"all"); + my @content; + if( $body =~ /<p>/ ) { + @content = WebGUI::HTML::splitTag($body); + } else { + @content = split("\n",$body); + } + shift @content if $content[0] =~ /^\s*$/; + $synopsis = WebGUI::HTML::filter($content[0],"all"); } - $body =~ s/\^\-\;/\n/; return ($synopsis,$body); } diff --git a/lib/WebGUI/Asset/Wobject/SyndicatedContent.pm b/lib/WebGUI/Asset/Wobject/SyndicatedContent.pm index cf0a8fc20..0de8d7730 100644 --- a/lib/WebGUI/Asset/Wobject/SyndicatedContent.pm +++ b/lib/WebGUI/Asset/Wobject/SyndicatedContent.pm @@ -196,9 +196,9 @@ sub getTemplateVariables { my @items = $feed->get_item; my %var; $var{channel_title} = WebGUI::HTML::filter($feed->title, 'javascript'); - $var{channel_description} = WebGUI::HTML::filter($feed->description, 'javascript'); - $var{channel_date} = WebGUI::HTML::filter($feed->get_pubDate_epoch, 'javascript'); - $var{channel_copyright} = WebGUI::HTML::filter($feed->copyright, 'javascript'); + $var{channel_description} = WebGUI::HTML::filter(scalar($feed->description), 'javascript'); + $var{channel_date} = WebGUI::HTML::filter(scalar($feed->get_pubDate_epoch), 'javascript'); + $var{channel_copyright} = WebGUI::HTML::filter(scalar($feed->copyright), 'javascript'); $var{channel_link} = WebGUI::HTML::filter($feed->link, 'javascript'); my @image = $feed->image; $var{channel_image_url} = WebGUI::HTML::filter($image[0], 'javascript'); @@ -215,7 +215,7 @@ sub getTemplateVariables { $item{author} = WebGUI::HTML::filter($object->author, 'javascript'); $item{guid} = WebGUI::HTML::filter($object->guid, 'javascript'); $item{link} = WebGUI::HTML::filter($object->link, 'javascript'); - $item{description} = WebGUI::HTML::filter($object->description, 'javascript'); + $item{description} = WebGUI::HTML::filter(scalar($object->description), 'javascript'); $item{descriptionFirst100words} = $item{description}; $item{descriptionFirst100words} =~ s/(((\S+)\s+){100}).*/$1/s; $item{descriptionFirst75words} = $item{descriptionFirst100words}; diff --git a/lib/WebGUI/HTML.pm 
b/lib/WebGUI/HTML.pm index 67e4f8068..32d94ea7d 100644 --- a/lib/WebGUI/HTML.pm +++ b/lib/WebGUI/HTML.pm @@ -14,6 +14,7 @@ package WebGUI::HTML; =cut +use HTML::TokeParser; use HTML::TagFilter; use strict; use WebGUI::Macro; @@ -36,6 +37,7 @@ A package for manipulating and massaging HTML. $html = WebGUI::HTML::html2text($html); $html = WebGUI::HTML::makeAbsolute($session, $html); $html = WebGUI::HTML::processReplacements($session, $html); + $html = WebGUI::HTML::splitTag([$tag,]$html[,$count]); # defaults to ( 'p', $html, 1 ) =head1 METHODS @@ -396,5 +398,51 @@ sub processReplacements { return $content; } +#------------------------------------------------------------------- + +=head2 WebGUI::HTML::splitTag([$tag,]$html[,$count]); + +splits a block of HTML into an array based on the contents of a single tag + +=head3 tag + +The HTML tag to extract from the text. This defaults to 'p' giving a list of paragraphs + +=head3 html + +The block of HTML text that will be dissected + +=head3 count + +How many items do we want? defaults to 1; returns 1 non-blank item + +=cut + +sub splitTag { + + my $tag = shift; + my $html = shift; + my $count = shift || 1; + if( not defined $html or $html =~ /^(\d+)$/ ) { + $count = $html if $1; + $html = $tag; + $tag = 'p'; # the default tag is 'p' -- grabs a paragraph + } + my @result; + + $html =~ s/\&nbsp;//g; # get rid of all non-breaking spaces + + my $p = HTML::TokeParser->new(\$html); + + while (my $token = $p->get_tag($tag)) { + my $text = $p->get_trimmed_text("/$tag"); + next if $text =~ /^([:space:]|[:^print:])*$/; # skip whitespace + push @result, $text; # add the text between the tags to the result array + last if @result == $count; # if we have a full count then quit + } + + return @result; +} + 1;