Significant update to the SyndicatedContent wobject to allow for better aggregation and scheduled content downloads.

2005-06-15 02:29:12 +00:00 · 2005-06-15 02:29:12 +00:00 · 2fd3cb57f3
commit 2fd3cb57f3
parent c7c968eaf2
6 changed files with 519 additions and 150 deletions
--- a/lib/WebGUI/Asset/Wobject/SyndicatedContent.pm
+++ b/lib/WebGUI/Asset/Wobject/SyndicatedContent.pm
@ -25,11 +25,43 @@ use WebGUI::Asset::Wobject;
 use XML::RSSLite;
 use LWP::UserAgent;
 use WebGUI::ErrorHandler;
+use POSIX qw/floor/;
 my $hasEncode=1;
 eval " use Encode qw(from_to); "; $hasEncode=0 if $@;

 our @ISA = qw(WebGUI::Asset::Wobject);

+=head1 NAME
+
+Package WebGUI::Asset::Wobject::SyndicatedContent
+
+=head1 DESCRIPTION
+
+Displays items and channels from RSS feeds.
+
+=head1 SYNOPSIS
+
+use WebGUI::Asset::Wobject::SyndicatedContent;
+
+
+=head1 METHODS
+
+These methods are available from this class:
+
+=cut
+
+
+#-------------------------------------------------------------------
+
+=head2 definition ( definition )
+
+Defines the properties of this asset.
+
+=head3 definition
+
+A hash reference passed in from a subclass definition.
+
+=cut


 #-------------------------------------------------------------------
@ -46,19 +78,31 @@ sub definition {
 				},
 			rssUrl=>{
 				defaultValue=>undef,
-				fieldType=>"url"
+				fieldType=>"textarea"
 				},
                        maxHeadlines=>{
 				fieldType=>"integer",
 				defaultValue=>10
 				},
+			displayMode=>{
+				      fieldType=>"text",
+				      defaultValue=>"interleaved"
+				     },
+			hasTerms=>{
+				   fieldType=>"text",
+				   defaultValue=>""
+				  }
 			}
 		});
        return $class->SUPER::definition($definition);
 }

+=head2 getIcon ( small )
+
+Returns the URL of the icon associated with this asset.
+
+=cut

-#-------------------------------------------------------------------
 sub getIcon {
 	my $self = shift;
 	my $small = shift;
@ -66,43 +110,81 @@ sub getIcon {
 	return $session{config}{extrasURL}.'/assets/syndicatedContent.gif';
 }

-#-------------------------------------------------------------------
+
+=head2 getName ()
+
+Returns the displayable name of this asset.
+
+=cut
+
 sub getName {
        return WebGUI::International::get(2,"Asset_SyndicatedContent");
 }

-#-------------------------------------------------------------------
+=head2 getUiLevel ()
+
+Returns the UI level of this asset (currently 6).
+
+=cut
+
 sub getUiLevel {
        return 6;
 }

-
 #-------------------------------------------------------------------
+
+=head2 getEditForm ()
+
+Returns the TabForm object that will be used in generating the edit page for this asset.
+
+=cut
+
 sub getEditForm {
 	my $self = shift;
 	my $tabform = $self->SUPER::getEditForm();
   	$tabform->getTab("display")->template(
      		-value=>$self->getValue('templateId'),
-      		-namespace=>"SyndicatedContent"
+      		-namespace=>"SyndicatedContent",
 		-label=>WebGUI::International::get(72,"Asset_SyndicatedContent"),
   		);
-	$tabform->getTab("properties")->url(
+	$tabform->getTab("display")->selectList(
+		-name=>"displayMode",
+		-options=>{
+			'interleaved'=>WebGUI::International::get("interleaved","Asset_SyndicatedContent"),
+			'grouped'=>WebGUI::International::get("grouped","Asset_SyndicatedContent"),
+			 },
+		-sortByValue=>1,
+		-label=>WebGUI::International::get("displayModeLabel","Asset_SyndicatedContent"),
+		-value=>[$self->getValue('displayMode')],
+		-subtext=>WebGUI::International::get("displayModeSubtext","Asset_SyndicatedContent"),
+		);
+	$tabform->getTab("display")->text(
+		-name=>"hasTerms",
+		-label=>WebGUI::International::get("hasTermsLabel","Asset_SyndicatedContent"),
+		-maxlength=>255,
+		-value=>$self->getValue("hasTerms"),
+		);
+	$tabform->getTab("properties")->textarea(
 		-name=>"rssUrl",
 		-label=>WebGUI::International::get(1,"Asset_SyndicatedContent"),
 		-value=>$self->getValue("rssUrl")
 		);
+
 	$tabform->getTab("display")->integer(
 		-name=>"maxHeadlines",
 		-label=>WebGUI::International::get(3,"Asset_SyndicatedContent"),
 		-value=>$self->getValue("maxHeadlines")
 		);
+	#$tabform->addTab("rss",WebGUI::International::get("rssTabName","Asset_SyndicatedContent"));
+	
 	return $tabform;
 }

 #-------------------------------------------------------------------
 # strip all html tags from the given data structure.  This is important to
 # prevent cross site scripting attacks
-my $_stripped_html = {};
+#my $_stripped_html = {};
+
 sub _strip_html {
        #my ($data) = @_;
        
@ -129,6 +211,7 @@ sub _strip_html {
 #-------------------------------------------------------------------
 # horrible kludge to find the channel or item record
 # in the varying kinds of rss structures returned by RSSLite
+
 sub _find_record {
        my ($data, $regex) = @_;
        
@ -166,6 +249,7 @@ sub _find_record {
 # the guid, if it is a link, always means the former.
 # Also copy the first few words of the description into the title if 
 # there is no title
+
 sub _normalize_items {
        #my ($items) = @_;
        
@ -195,7 +279,7 @@ sub _normalize_items {

 #-------------------------------------------------------------------
 sub _get_rss_data {
-        my ($url) = @_;
+        my $url = shift;
        
        my $cache = WebGUI::Cache->new("url:" . $url, "RSS");
        my $rss_serial = $cache->get;
@ -223,19 +307,16 @@ sub _get_rss_data {
 			}
 				
 		}
-
-                
-                # there is no encode_entities_numeric that I can find, so I am 
-                # commenting this out. -hal
-                #    $xml =~ s#(<title>)(.*?)(</title>)#$1.encode_entities_numeric(decode_entities($2)).$3#ges;
-                #    $xml =~ s#(<description>)(.*?)(</description>)#$1.encode_entities_numeric(decode_entities($2)).$3#ges; 
                
                my $rss_lite = {};
                eval {
                        XML::RSSLite::parseXML($rss_lite, \$xml);
                };
                if ($@) {
-                        WebGUI::ErrorHandler::warn("error parsing rss for url $url");
+                        WebGUI::ErrorHandler::warn("error parsing rss for url $url :".$@);
+			#Returning undef on a parse failure is a change from previous behaviour,
+			#but it SHOULDN'T have a major effect.
+			return undef;
                }
                
                # make sure that the {channel} points to the channel 
@ -243,6 +324,7 @@ sub _get_rss_data {
                # of items.  without this voodoo, different versions of 
                # rss return the data in different places in the data 
                # structure.
+
                $rss_lite = {channel => $rss_lite};
                if (!($rss->{channel} = 
                      _find_record($rss_lite, qr/^channel$/))) {
@ -251,13 +333,19 @@ sub _get_rss_data {
                if (!($rss->{items} = _find_record($rss_lite, qr/^items?$/))) {
                        WebGUI::ErrorHandler::warn("unable to find item info for url $url");
                        $rss->{items} = [];
-                }
+		}
                
                _strip_html($rss);
                 $rss->{items} = [ $rss->{items} ] unless (ref $rss->{items} eq 'ARRAY');

                _normalize_items($rss->{items});
-                
+
+		#Assign dates "globally" rather than when seen in a viewed feed.
+		#This is important because we can "filter" now and want to ensure we keep order
+		#correctly as new items appear.
+		_assign_rss_dates($rss->{items});
+
+                #Default to an hour timeout
                $cache->set(Storable::freeze($rss), 3600);
        }
        
@ -268,6 +356,7 @@ sub _get_rss_data {
 # rss items don't have a standard date, so timestamp them the first time
 # we see them and use that timestamp as the date.  Periodically nuke the
 # whole database to keep the thing from growing too large
+
 sub _assign_rss_dates {
        my ($items) = @_;
        
@ -285,113 +374,189 @@ sub _assign_rss_dates {
  }

 #-------------------------------------------------------------------
-sub _get_aggregate_items {
+# $items is the arrayref to put items into.
+# $rss_feeds is an arrayref of all the feeds in this wobject
+# The only difference between an "interleaved" feed and a grouped feed
+# is the order the items are output.
+
+# Fills @$items exactly as _create_interleaved_items() does, then orders the
+# items by site title and marks the first item of each site.
+#
+#   $items         - arrayref that receives the selected items
+#   $rss_feeds     - arrayref of parsed feeds (each with {channel} and {items})
+#   $maxHeadlines  - maximum number of items to collect
+#   $hasTermsRegex - filter pattern; a false value disables filtering
+#
+# Everything is done through the $items reference; nothing is returned.
+sub _create_grouped_items{
+    my($items,$rss_feeds,$maxHeadlines,$hasTermsRegex)=@_;
+
+    _create_interleaved_items($items,$rss_feeds,$maxHeadlines,$hasTermsRegex);
+
+    # A feed without a channel title yields an undef site_title; treat it as ''
+    # so that neither the sort nor the comparison below operates on undef.
+    my $title_of = sub { defined $_[0]->{site_title} ? $_[0]->{site_title} : '' };
+
+    @$items = sort { $title_of->($a) cmp $title_of->($b) } @$items;
+
+    # Loop through the sorted items and set new_rss_site on the first item of
+    # every run of equal site titles.  $previous_title stays undef until the
+    # first item has been seen, so the very first item is always flagged, even
+    # when its title is empty.
+    my $previous_title;
+    foreach my $item (@$items){
+        my $title = $title_of->($item);
+        if(!defined $previous_title || $previous_title ne $title){
+            $item->{new_rss_site}=1;
+        }
+        $previous_title = $title;
+    }
+    return;
+}
+
+
+#-------------------------------------------------------------------
+# Loop through the feeds for this wobject 
+# and push in the items in "interleaved mode"
+# No need to return because we're doing everything by reference.
+
+# Takes one item from each feed in turn, round after round, until either
+# $maxHeadlines items have been collected or every feed is exhausted.
+#
+#   $items         - arrayref that receives the selected items
+#   $rss_feeds     - arrayref of parsed feeds (each with {channel} and {items});
+#                    items are shifted off, so the feeds are consumed
+#   $maxHeadlines  - maximum number of items to collect
+#   $hasTermsRegex - filter pattern; a false value disables filtering
+#
+# Each collected item is tagged with site_title and site_link taken from its
+# feed's channel.  Everything is done by reference; nothing is returned.
+sub _create_interleaved_items{
+    my($items,$rss_feeds,$maxHeadlines,$hasTermsRegex)=@_;
+    my $items_remain = 1;
+    while((@$items < $maxHeadlines) && $items_remain){
+        # Reset once per round, not once per feed.  Resetting inside the
+        # foreach made the outcome depend on the last feed only, so the loop
+        # stopped as soon as the last feed ran dry even though earlier feeds
+        # still had items.
+        $items_remain = 0;
+        foreach my $rss (@$rss_feeds){
+            next unless defined $rss->{items};
+            last if @$items >= $maxHeadlines;
+
+            my $item = shift @{$rss->{items}};
+            next unless $item;
+
+            $item->{site_title}=$rss->{channel}->{title};
+            $item->{site_link}=$rss->{channel}->{link};
+            if(! $hasTermsRegex || _check_hasTerms($item,$hasTermsRegex)){
+                push(@{$items},$item);
+            }
+            # Another round is needed while any feed still holds items.
+            $items_remain = 1 if @{$rss->{items}};
+        }
+    }
+    return;
+}
+
+#-------------------------------------------------------------------
+# Uses the regex constructed in _get_items (with the terms defaulting to OR)
+# to see if the title or description associated with this item match the kinds
+# of items we're looking for.
+#
+
+# Returns 1 when the item's title or description matches $hasTermsRegex
+# (case-insensitively), 0 otherwise.
+#
+#   $item          - hashref with optional {title} and {description}
+#   $hasTermsRegex - pattern string, e.g. the output of _make_regex()
+sub _check_hasTerms{
+    my($item,$hasTermsRegex)=@_;
+
+    # No terms means nothing to filter on.  This must be checked explicitly:
+    # an empty pattern makes Perl re-use the last successfully matched regex,
+    # which would give results unrelated to this item.
+    return 1 if !defined $hasTermsRegex || $hasTermsRegex eq '';
+
+    # Either field may be absent from a feed item; treat a missing one as ''.
+    my $to_check = join('', map { defined $_ ? $_ : '' } @{$item}{qw(title description)});
+
+    # /g is not needed for a single boolean test and has been dropped.
+    return ($to_check =~ /$hasTermsRegex/ism) ? 1 : 0;
+}
+
+
+################################################################################
+# Turns the comma-separated hasTerms string into a regex alternation in which
+# any one term may match (OR).  Returns '' when there are no usable terms.
+#
+#   $terms - comma-separated list of terms, e.g. "perl, python"
+sub _make_regex{
+    my $terms=shift;
+    return '' unless defined $terms;
+
+    my @alternatives;
+    foreach my $term (split(/,/,$terms)){
+        # Ignore whitespace around a term so "perl, python" works as expected.
+        $term =~ s/^\s+//;
+        $term =~ s/\s+$//;
+        # An empty alternative would match every item, so drop empty terms.
+        next if $term eq '';
+        # Terms are literal text: escape metacharacters so that input such as
+        # "c++" cannot produce an invalid or unintended pattern.
+        push(@alternatives, quotemeta($term));
+    }
+    return join("|",@alternatives);
+}
+#############################
+
+
+#-------------------------------------------------------------------
+# So- We're going to manage an "aggregate cache" that represents
+# the rendering of the cumulative feeds in a Syndicated Wobject,
+# but let each feed "fend for itself" based on URL in the cache.
+#
+# This means we can set up the hourly task to get and cache each
+# individual feed WITHOUT having to re-request (undoubtedly the slowest
+# part of every RSS parsing action is the network traffic) each feed 
+# when we re-render each aggregate representation.
+#
+# If, however, a feed expires between hourly tasks, it will be re-requested and
+# parsed per the usual. BUT, if a feed ever goes un-requested for more than an hour,
+# then its retrieval schedule will be taken over by the hourly task, and we'll
+# be pre-seeding the RSS object cache automatically.
+#
+# Having the caching set up this way means we can re-use the same raw feed all over the site without
+# having each wobject request it separately, ASSUMING the URL is the same.
+#
+# All the values that may have an effect on the composition of items
+# are included in the cache key for the aggregate representation.
+
+sub _get_items {
 	my $self = shift;
 	my $urls = shift;
 	my $maxHeadlines = shift;
        
-        my $cache = WebGUI::Cache->new("aggregate:" . 
-                                       $self->get("rssUrl"), "RSS");
+	my $displayMode=$self->getValue('displayMode');
+
+	my $hasTermsRegex=_make_regex($self->getValue('hasTerms'));
+	my $maxHeadlines=$self->getValue('maxHeadlines');
+	
+	my $key=join(":",("aggregate", $displayMode,$hasTermsRegex,$maxHeadlines,$self->get("rssUrl")));
+
+        my $cache = WebGUI::Cache->new($key, "RSS");
        my $items = Storable::thaw($cache->get());
+	my @rss_feeds;
        if (!$items) {
                $items = [];
-                my $items_remain = 1;
                
-                my @rsss;
                for my $url (@{$urls}) {
-                        push(@rsss, _get_rss_data($url));
+		    my $rss_info=_get_rss_data($url);
+		    push(@rss_feeds, $rss_info) if($rss_info);
                }
-                
-                while ((@{$items} < $maxHeadlines) && $items_remain) {
-                        $items_remain = 0;
-                        for my $rss (@rsss) {
-                                if ($rss->{items} && 
-                                    (my $item = shift(@{$rss->{items}}))) {
-                                        push(@{$items}, 
-                                             {site_title => $rss->{channel}->{title},
-                                              site_link => $rss->{channel}->{link},
-                                              link => $item->{link},
-                                              title => $item->{title},
-                                              description => $item->{description},
-                                             });
-                                        if (@{$rss->{items}}) {
-                                                $items_remain = 1;
-                                        }
-                                }
-                        }
-                }
-                
-                _assign_rss_dates($items);
+
+		#Sort feeds in order by channel title.
+		#@rss_feeds=sort{$a->{channel}->{title} cmp $b->{channel}->{title}} @rss_feeds;
+		
+                if ($displayMode eq 'grouped') {
+		    _create_grouped_items($items,\@rss_feeds,$maxHeadlines,$hasTermsRegex);
+		} else {
+		    _create_interleaved_items($items,\@rss_feeds,$maxHeadlines,$hasTermsRegex);
+		}
                
                @{$items} = sort { $b->{date} <=> $a->{date} } @{$items};
                
-                #if (@{$items} > $_aggregate_size) {
-                #  @{$items} = @{$items}[0..($_aggregate_size-1)];
-                #}
-                
                $cache->set(Storable::freeze($items), 3600);
-        }
+	    }
        
-        return $items;
-}  
-
-#-------------------------------------------------------------------
-# interleave stories from each feed, up to a total of $_aggregate_size
-sub _view_aggregate_feed {
-	my $self = shift;
-	my $urls = shift;
-	my $maxHeadlines = shift;
-        my %var;
-        $var{'channel.title'} = $self->get("title");
-        $var{'channel.description'} = $self->get("description");
-        $var{item_loop} = $self->_get_aggregate_items($urls, $maxHeadlines);
-        
-        return $self->processTemplate(\%var,$self->get("templateId"));
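+	#Return the item loop and the list of parsed RSS feeds, because
+	#when we're parsing a single feed we can use that feed's title and
+	#description for channel.title, channel.link, and channel.description.
+	#NOTE(review): @rss_feeds is only filled when the aggregate cache misses,
+	#so on a cache hit the returned feed list is empty - confirm view() copes.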
+        return ($items,\@rss_feeds);
 }

+=head2 view()

-#-------------------------------------------------------------------
-sub _view_single_feed {
-	my $self = shift;
-        my $maxHeadlines = shift;
-        my $rss = _get_rss_data($self->get("rssUrl"));
-        my %var;
-        $var{"channel.title"} = $rss->{channel}->{title};
-        $var{"channel.link"} = $rss->{channel}->{link};
-        $var{"channel.description"} = $rss->{channel}->{description};
-        my @items;
-        $rss->{items} ||= [];
-        for (my $i = 0; ($i < @{$rss->{items}}) && ($i < $maxHeadlines);$i++) {
-                my $item = $rss->{items}->[$i];
-                push (@items,{
-                              link=>$item->{link},
-                              title=>$item->{title},
-                              description=>$item->{description}
-                             });
-        }
-        $var{item_loop} = \@items;
-        return $self->processTemplate(\%var,$self->get("templateId"));
-}
+Returns the rendered output of the wobject.
+
+=cut

-#-------------------------------------------------------------------
 sub view {
 	my $self = shift;
 	$self->logView() if ($session{setting}{passiveProfilingEnabled});
+
        my $maxHeadlines = $self->get("maxHeadlines") || 1000000;
-        my @urls = split(/\s+/,$self->get("rssUrl"));        
-        if (@urls == 1) {
-                return $self->_view_single_feed($maxHeadlines);
-        } else {
-                return $self->_view_aggregate_feed(\@urls, $maxHeadlines);
-        }
+        my @urls = split(/\s+/,$self->get("rssUrl"));
+
+        my %var;
+	
+	my($item_loop,$rss_feeds)=$self->_get_items(\@urls, $maxHeadlines);
+	if(@$rss_feeds > 1){
+	    #If there is more than one (valid) feed in this wobject, put in the wobject description info.
+	    $var{'channel.title'} = $self->get("title");
+	    $var{'channel.description'} = $self->get("description");
+	} else {
+	    #One feed. Put in the info from the feed.
+	    $var{"channel.title"} = $rss_feeds->[0]->{channel}->{title};
+	    $var{"channel.link"} = $rss_feeds->[0]->{channel}->{link};
+	    $var{"channel.description"} = $rss_feeds->[0]->{channel}->{description};
+	}
+        $var{item_loop} = $item_loop;
+
+        return $self->processTemplate(\%var,$self->get("templateId"));
 }


-#-------------------------------------------------------------------
+=head2 www_edit()
+
+Sets parameters and returns a form to edit this wobject.
+
+=cut
+
 sub www_edit {
        my $self = shift;
 	return WebGUI::Privilege::insufficient() unless $self->canEdit;
-        $self->getAdminConsole->setHelp("syndicated content add/edit","SyndicatedContent");
+        $self->getAdminConsole->setHelp("syndicated content add/edit","Asset_SyndicatedContent");
        return $self->getAdminConsole->render($self->getEditForm->print,WebGUI::International::get("4","Asset_SyndicatedContent"));
 }