added a new function to WebGUI::HTML to parse html text and extract a list of elements based on a tag

spec. used to extract paragraphs for the synopsis; convert FeedPP properties to scalar context so that blank fields are rendered properly.
2009-01-23 03:41:49 +00:00 · 2009-01-23 03:41:49 +00:00 · 16ce54bf4f
commit 16ce54bf4f
parent 2893944837
4 changed files with 61 additions and 8 deletions
--- a/docs/changelog/7.x.x.txt
+++ b/docs/changelog/7.x.x.txt
@ -1,4 +1,5 @@
 7.6.10
+ - fixed #9455: fixed synopsis to pick out html paragraphs, fixed FeedPP fields to scalar when blank
 - fixed: With autocommit and no comments on, making a shortcut of an asset takes you to that asset's view.

 7.6.9
--- a/lib/WebGUI/Asset/Post.pm
+++ b/lib/WebGUI/Asset/Post.pm
@ -548,11 +548,15 @@ sub getSynopsisAndContent {
 	my $synopsis = shift;
 	my $body = shift;
 	unless ($synopsis) {
-        	$body =~ s/\n/\^\-\;/ unless ($body =~ m/\^\-\;/);
-       	 	my @content = split(/\^\-\;/,$body);
-		$synopsis = WebGUI::HTML::filter($content[0],"all");
+           my @content;
+           if( $body =~ /<p>/ ) {
+               @content = WebGUI::HTML::splitTag($body);
+           } else {
+       	       @content = split("\n",$body);
+           }
+           shift @content if $content[0] =~ /^\s*$/;
+           $synopsis = WebGUI::HTML::filter($content[0],"all");
 	}
-	$body =~ s/\^\-\;/\n/;
 	return ($synopsis,$body);
 }

--- a/lib/WebGUI/Asset/Wobject/SyndicatedContent.pm
+++ b/lib/WebGUI/Asset/Wobject/SyndicatedContent.pm
@ -196,9 +196,9 @@ sub getTemplateVariables {
 	my @items = $feed->get_item;
 	my %var;
 	$var{channel_title} = WebGUI::HTML::filter($feed->title, 'javascript');
-	$var{channel_description} = WebGUI::HTML::filter($feed->description, 'javascript');
-	$var{channel_date} = WebGUI::HTML::filter($feed->get_pubDate_epoch, 'javascript');
-	$var{channel_copyright} = WebGUI::HTML::filter($feed->copyright, 'javascript');
+	$var{channel_description} = WebGUI::HTML::filter(scalar($feed->description), 'javascript');
+	$var{channel_date} = WebGUI::HTML::filter(scalar($feed->get_pubDate_epoch), 'javascript');
+	$var{channel_copyright} = WebGUI::HTML::filter(scalar($feed->copyright), 'javascript');
 	$var{channel_link} = WebGUI::HTML::filter($feed->link, 'javascript');
 	my @image = $feed->image;
 	$var{channel_image_url} = WebGUI::HTML::filter($image[0], 'javascript');
@ -215,7 +215,7 @@ sub getTemplateVariables {
        $item{author} = WebGUI::HTML::filter($object->author, 'javascript');
        $item{guid} = WebGUI::HTML::filter($object->guid, 'javascript');
        $item{link} = WebGUI::HTML::filter($object->link, 'javascript');
-        $item{description} = WebGUI::HTML::filter($object->description, 'javascript');
+        $item{description} = WebGUI::HTML::filter(scalar($object->description), 'javascript');
        $item{descriptionFirst100words} = $item{description};
        $item{descriptionFirst100words} =~ s/(((\S+)\s+){100}).*/$1/s;
        $item{descriptionFirst75words} = $item{descriptionFirst100words};
--- a/lib/WebGUI/HTML.pm
+++ b/lib/WebGUI/HTML.pm
@ -14,6 +14,7 @@ package WebGUI::HTML;

 =cut

+use HTML::TokeParser;
 use HTML::TagFilter;
 use strict;
 use WebGUI::Macro;
@ -36,6 +37,7 @@ A package for manipulating and massaging HTML.
 $html = WebGUI::HTML::html2text($html);
 $html = WebGUI::HTML::makeAbsolute($session, $html);
 $html = WebGUI::HTML::processReplacements($session, $html);
+ @items = WebGUI::HTML::splitTag([$tag,]$html[,$count]);   # defaults to ( 'p', $html, 1 )

 =head1 METHODS

@ -396,5 +398,51 @@ sub processReplacements {
 	return $content;
 }

+#-------------------------------------------------------------------
+
+=head2 WebGUI::HTML::splitTag([$tag,]$html[,$count]);
+
+Splits a block of HTML into a list containing the text found inside each occurrence of a single tag.
+
+=head3 tag
+
+The HTML tag to extract from the text.  This defaults to 'p', giving a list of paragraphs.
+
+=head3 html
+
+The block of HTML text that will be dissected.
+
+=head3 count
+
+How many items do we want?  Defaults to 1, which returns only the first non-blank item.
+
+=cut
+
+sub splitTag {
+    # Accepts ([$tag,] $html [,$count]): $tag defaults to 'p', $count defaults to 1.
+    my $tag = shift;
+    my $html = shift;
+    my $count = shift || 1;
+    if( not defined $html or $html =~ /^\d+$/ ) {   # tag omitted: args were ($html [,$count])
+        $count = $html if $html;    # test $html itself; $1 may be stale from an earlier match
+        $html = $tag;
+        $tag = 'p';                 # the default tag is 'p' -- grabs a paragraph
+    }
+    my @result;
+    return @result unless defined $html;    # called without any HTML: nothing to parse
+    $html =~ s/\&nbsp;//g;   # get rid of all non-breaking spaces
+
+    my $p = HTML::TokeParser->new(\$html);
+    while (my $token = $p->get_tag($tag)) {
+        my $text = $p->get_trimmed_text("/$tag");
+        # POSIX classes only work inside a bracketed class; a bare [:space:] matches
+        # the literal characters ':', 's', 'p', 'a', 'c', 'e' and drops real paragraphs
+        next if $text =~ /^[[:space:][:^print:]]*$/;    # skip blank / non-printable text
+        push @result, $text;          # add the text between the tags to the result array
+        last if @result == $count;    # if we have a full count then quit
+    }
+
+    return @result;
+}
+
 1;