From ba4a5c7e4b1cc54b3b8e7548a12303bf8985a5eb Mon Sep 17 00:00:00 2001
From: Hal Roberts <hroberts@webgui-cvs>
Date: Fri, 25 Jul 2003 15:00:39 +0000
Subject: [PATCH] revamped SyndicatedContent to support aggregate feeds.

---
 docs/changelog/5.x.x.txt                |   1 +
 docs/upgrades/upgrade_5.4.2-5.5.0.sql   |   6 +
 lib/WebGUI/Wobject/SyndicatedContent.pm | 297 +++++++++++++++++++++---
 3 files changed, 274 insertions(+), 30 deletions(-)
diff --git a/docs/changelog/5.x.x.txt b/docs/changelog/5.x.x.txt
index 254983b8c..3385619d2 100644
--- a/docs/changelog/5.x.x.txt
+++ b/docs/changelog/5.x.x.txt
@@ -6,6 +6,7 @@
  - Added anonymous response option to Survey. (Thanks to Andy Grundman.)
  - Added the alphabetic? option to the SiteMap wobject. (Thanks to Hal Roberts.)
  - Changed the page hideFromNavigation option to hide the given page from site maps as well as from navigation menus. (Thanks to Hal Roberts.)
+ - Revamped SyndicatedContent to support aggregate feeds. (Thanks to Hal Roberts.)
  - Fixed a relative hyperlink bug in htmlArea (Thanks to Andreas Graf.)
  - Ordered the username list in the manageUsersInGroup operation (Thanks to
    Andreas Graf.)
diff --git a/docs/upgrades/upgrade_5.4.2-5.5.0.sql b/docs/upgrades/upgrade_5.4.2-5.5.0.sql
index fb1c8803a..d84319c48 100644
--- a/docs/upgrades/upgrade_5.4.2-5.5.0.sql
+++ b/docs/upgrades/upgrade_5.4.2-5.5.0.sql
@@ -104,3 +104,9 @@ delete from international where languageId=1 and namespace='SiteMap' and interna
 insert into international values (71,'SiteMap',1,'Site maps are used to provide additional navigation in WebGUI. You could set up a traditional site map that would display a hierarchical view of all the pages in the site. On the other hand, you could use site maps to provide extra navigation at certain levels in your site.\r\n<br><br>\r\n\r\n<b>Template</b><br/>\r\nChoose a layout for this site map.\r\n<p/>\r\n\r\n<b>Start With</b><br>\r\nSelect the page that this site map should start from.\r\n<br><br>\r\n\r\n<b>Depth To Traverse</b><br>\r\nHow many levels deep of navigation should the Site Map show? If 0 (zero) is specified, it will show as many levels as there are.\r\n<p>\r\n\r\n<b>Indent</b><br>\r\nHow many characters should indent each level?\r\n<p>\r\n\r\n<b>Alphabetic?</b><br>\r\nIf this setting is true, site map entries are sorted alphabetically.  If this setting is false, site map entries are sorted by the page sequence order (editable via the up and down arrows in the page toolbar).\r\n<p>\r\n\r\n',1039908464,NULL);
 delete from international where languageId=1 and namespace='WebGUI' and internationalId=606;
 insert into international values (606,'WebGUI',1,'Think of pages as containers for content. For instance, if you want to write a letter to the editor of your favorite magazine you\'d get out a notepad (or open a word processor) and start filling it with your thoughts. The same is true with WebGUI. Create a page, then add your content to the page.\r\n<p>\r\n\r\n<b>Title</b><br>\r\nThe title of the page is what your users will use to navigate through the site. Titles should be descriptive, but not very long.\r\n<p>\r\n\r\n\r\n<b>Menu Title</b><br>\r\nA shorter or altered title to appear in navigation. If left blank this will default to <i>Title</i>.\r\n<p>\r\n\r\n<b>Page URL</b><br>\r\nWhen you create a page a URL for the page is generated based on the page title. If you are unhappy with the URL that was chosen, you can change it here.\r\n<p>\r\n\r\n<b>Redirect URL</b><br>\r\nWhen this page is visited, the user will be redirected to the URL specified here. \r\n<p>\r\n<b>NOTE:</b> The redirects will be disabled while in admin mode in order to make it easier to edit the properties of the page.\r\n<p>\r\n\r\n\r\n<b>Hide from navigation?</b><br>\r\nSelect yes to hide this page from the navigation menus and site maps.\r\n<p>\r\n<B>NOTE:</b> This will not hide the page from the page tree (Administrative functions... &gt; Manage page tree.), only from navigation macros and from site maps.\r\n<p>\r\n\r\n<b>Open in new window?</b><br>\r\nSelect yes to open this page in a new window. This is often used in conjunction with the <b>Redirect URL</b> parameter.\r\n<p>\r\n\r\n\r\n\r\n<b>Language</b><br/>\r\nChoose the default language for this page. All WebGUI generated messages will appear in that language and the character set will be changed to the character set for that language.\r\n<p/>\r\n\r\n<P><B>Cache Timeout</B><BR>The amount of time this page should remain cached for registered users. 
\r\n\r\n<P><B>Cache Timeout (Visitors)</B><BR>The amount of time this page should remain cached for visitors. \r\n\r\n<P><B>NOTE:</B> Page caching is only available if your administrator has installed the Cache::FileCache Perl module. Using page caching can improve site performance by as much as 1000%.&nbsp;\r\n\r\n\r\n<b>Template</b><br>\r\nBy default, WebGUI has one big content area to place wobjects. However, by specifying a template other than the default you can sub-divide the content area into several sections.\r\n<p>\r\n\r\n<b>Synopsis</b><br>\r\nA short description of a page. It is used to populate default descriptive meta tags as well as to provide descriptions on Site Maps.\r\n<p>\r\n\r\n<b>Meta Tags</b><br>\r\nMeta tags are used by some search engines to associate key words to a particular page. There is a great site called <a href=\"http://www.metatagbuilder.com/\">Meta Tag Builder</a> that will help you build meta tags if you\'ve never done it before.\r\n<p>\r\n\r\n<i>Advanced Users:</i> If you have other things (like JavaScript) you usually put in the  area of your pages, you may put them here as well.\r\n<p>\r\n\r\n<b>Use default meta tags?</b><br>\r\nIf you don\'t wish to specify meta tags yourself, WebGUI can generate meta tags based on the page title and your company\'s name. Check this box to enable the WebGUI-generated meta tags.\r\n<p>\r\n\r\n\r\n<b>Style</b><br>\r\nBy default, when you create a page, it inherits a few traits from its parent. One of those traits is style. Choose from the list of styles if you would like to change the appearance of this page. See <i>Add Style</i> for more details.\r\n<p>\r\n\r\nIf you select \"Yes\" below the style pull-down menu, all of the pages below this page will take on the style you\'ve chosen for this page.\r\n<p>\r\n\r\n<b>Start Date</b><br>\r\nThe date when users may begin viewing this page. 
Note that before this date only content managers with the rights to edit this page will see it.\r\n<p>\r\n\r\n<b>End Date</b><br>\r\nThe date when users will stop viewing this page. Note that after this date only content managers with the rights to edit this page will see it.\r\n<p>\r\n\r\n\r\n<b>Owner</b><br>\r\nThe owner of a page is usually the person who created the page. This user always has full edit and viewing rights on the page.\r\n<p>\r\n<b>NOTE:</b> The owner can only be changed by an administrator.\r\n<p>\r\n\r\n\r\n<b>Who can view?</b><br>\r\nChoose which group can view this page. If you want both visitors and registered users to be able to view the page then you should choose the \"Everybody\" group.\r\n<p>\r\n\r\n<b>Who can edit?</b><br>\r\nChoose the group that can edit this page. The group assigned editing rights can also always view the page.\r\n<p>\r\n\r\nYou can optionally recursively give these privileges to all pages under this page.\r\n<p>\r\n\r\n<b>What next?</b><br/>\r\nIf you leave this on the default setting you\'ll be redirected to the new page after creating it.\r\n<p/>',1056293101,NULL);
+
+alter table SyndicatedContent add column maxHeadlines int(11) not null default 0;
+insert into international (internationalId,languageId,namespace,message,lastUpdated) values (3,1,'SyndicatedContent','Maximum Number of Headlines',1057208065); 
+delete from international where languageId=1 and namespace='SyndicatedContent' and internationalId=71;
+INSERT INTO international VALUES (71,'SyndicatedContent',1,'Syndicated content is content that is pulled from another site using the RDF/RSS specification. This technology is often used to pull headlines from various news sites like <a href=\"http://www.cnn.com/\">CNN</a> and  <a href=\"http://slashdot.org/\">Slashdot</a>. It can, of course, be used for other things like sports scores, stock market info, etc.\r\n<br><br>\r\n\r\n<b>URL to RSS file</b><br>\r\nProvide the exact URL (starting with http://) to the syndicated content\'s RDF or RSS file. The syndicated content will be downloaded from this URL hourly.\r\n<br><br>\r\nYou can find syndicated content at the following locations:\r\n</p><ul>\r\n<li><a href=\"http://www.newsisfree.com/\">http://www.newsisfree.com</a>\r\n</li><li><a href=\"http://www.syndic8.com/\">http://www.syndic8.com</a>\r\n</li><li><a href=\"http://www.voidstar.com/node.php?id=144\">http://www.voidstar.com/node.php?id=144</a>\r\n</li><li><a href=\"http://my.userland.com/\">http://my.userland.com</a>\r\n</li><li><a href=\"http://www.webreference.com/services/news/\">http://www.webreference.com/services/news/</a>\r\n</li><li><a href=\"http://www.xmltree.com/\">http://www.xmltree.com</a>\r\n</li><li><a href=\"http://w.moreover.com/\">http://w.moreover.com/</a>\r\n</li></ul>\r\n\r\n<p>\r\n\r\nTo create an aggregate RSS feed, include a list of space separated urls instead of a single url.  For an aggregate feed, the system will display an equal number of headlines from each source, sorted by the date the system first received the story.<p>\r\n\r\n<b>Template</b><br>\r\nSelect a template for this content.\r\n<p><b>Maximum Headlines</b><br>\r\nEnter the maximum number of headlines that should be displayed.  For an aggregate feed, the system will display an equal number of headlines from each source, even if doing so requires displaying more than the requested maximum number of headlines.  
Set to zero to allow any number of headlines.\r\n<p>',1047855741,NULL);
+
diff --git a/lib/WebGUI/Wobject/SyndicatedContent.pm b/lib/WebGUI/Wobject/SyndicatedContent.pm
index 26d0afc9b..333982eff 100644
--- a/lib/WebGUI/Wobject/SyndicatedContent.pm
+++ b/lib/WebGUI/Wobject/SyndicatedContent.pm
@@ -12,10 +12,12 @@ package WebGUI::Wobject::SyndicatedContent;
 
 use HTML::Entities;
 use strict;
+use Storable;
 use Tie::CPHash;
 use WebGUI::Cache;
 use WebGUI::DateTime;
 use WebGUI::HTMLForm;
+use WebGUI::HTML;
 use WebGUI::Icon;
 use WebGUI::International;
 use WebGUI::Privilege;
@@ -23,6 +25,7 @@ use WebGUI::Session;
 use WebGUI::SQL;
 use WebGUI::Wobject;
 use XML::RSSLite;
+use LWP::UserAgent;
 
 our @ISA = qw(WebGUI::Wobject);
 
@@ -39,7 +42,8 @@ sub new {
         my $self = WebGUI::Wobject->new(
                 -properties=>$property,
                 -extendedProperties=>{
-			rssUrl=>{}
+			rssUrl=>{},
+                        maxHeadlines=>{},
 			},
 		-useTemplate=>1
                 );
@@ -60,45 +64,278 @@ sub www_edit {
 		-label=>WebGUI::International::get(1,$_[0]->get("namespace")),
 		-value=>$_[0]->getValue("rssUrl")
 		);
+        my $layout = WebGUI::HTMLForm->new;
+	$layout->integer(
+		-name=>"maxHeadlines",
+		-label=>WebGUI::International::get(3,$_[0]->get("namespace")),
+		-value=>$_[0]->getValue("maxHeadlines")
+		);
 	return $_[0]->SUPER::www_edit(
 		-properties=>$properties->printRowsOnly,
+                -layout=>$layout->printRowsOnly,
 		-headingId=>4,
 		-helpId=>1
 		);
 }
 
 
+# Recursively strip every HTML tag from a scalar, array ref or hash ref,
+# rewriting the data in place through the @_ alias and returning it.  This
+# is important to prevent cross site scripting attacks.
+my $_stripped_html = {};
+sub _strip_html {
+        my $kind = ref($_[0]);
+        if ($kind eq 'HASH') {
+                # keys() snapshots the field names before any value is rewritten
+                for my $field (keys(%{$_[0]})) {
+                        $_[0]->{$field} = _strip_html($_[0]->{$field});
+                }
+                return $_[0];
+        }
+        if ($kind eq 'ARRAY') {
+                for my $element (@{$_[0]}) {
+                        $element = _strip_html($element);
+                }
+                return $_[0];
+        }
+        return $_[0] if (!$_[0]);
+        # un-escape encoded angle brackets so hidden tags are filtered too
+        $_[0] =~ s/\&lt;/</g;
+        $_[0] =~ s/\&gt;/>/g;
+        return $_[0] = WebGUI::HTML::filter($_[0], 'all');
+}
+
+# Depth-first search of the nested hashes returned by RSSLite for the first
+# entry whose key matches $regex and whose value looks like a feed record:
+# a hash, or a non-empty array starting with a hash, that carries a link,
+# title or description.  Returns that value, or undef when nothing matches.
+sub _find_record {
+        my ($data, $regex) = @_;
+        return undef if (ref($data) ne 'HASH');
+
+        # true when the hash ref holds at least one of the telltale fields
+        my $has_fields = sub {
+                my ($hash) = @_;
+                return ($hash->{link} || $hash->{title} || $hash->{description});
+        };
+
+        for my $name (keys(%{$data})) {
+                my $val = $data->{$name};
+                if ($name =~ $regex) {
+                        my $type = ref($val);
+                        return $val if (($type eq 'HASH') && $has_fields->($val));
+                        return $val if (($type eq 'ARRAY') && @{$val} &&
+                                        (ref($val->[0]) eq 'HASH') &&
+                                        $has_fields->($val->[0]));
+                }
+                my $record = _find_record($val, $regex);
+                return $record if ($record);
+        }
+
+        return undef;
+}
+
+# Tidy up a list of feed items in place (array ref of item hash refs):
+#  * Copy the guid into the link field when the guid looks like a URL.
+#    Some feeds use link for the story itself, others for the page the
+#    story is about.  The WebGUI templates assume the former, and a guid
+#    that is a URL always points at the story itself.
+#  * Build a title from the first few words of the description when the
+#    item has no title.
+#  * Replace every &apos; entity, which IE does not recognize.
+# Anything other than an array ref is ignored.
+sub _normalize_items {
+        my ($items) = @_;
+        return if (ref($items) ne 'ARRAY');
+
+        # max number of words taken from the description for a missing title
+        my $max_words = 10;
+
+        for my $item (@{$items}) {
+                if ($item->{guid} && ($item->{guid} =~ /^https?:\/\//i)) {
+                        $item->{link} = $item->{guid};
+                }
+                if (!$item->{title} && defined($item->{description})) {
+                        # split(' ') skips empty fields from repeated whitespace
+                        my @words = split(' ', $item->{description});
+                        $item->{title} = (@words <= $max_words)
+                          ? $item->{description}
+                          : join(" ", @words[0..$max_words-1]) . " ...";
+                }
+                for my $field (qw(title description)) {
+                        next if (!defined($item->{$field}));
+                        # /g: fix every occurrence, not only the first one
+                        $item->{$field} =~ s/&apos;/\'/g;
+                }
+        }
+}
+
+# Fetch and parse the feed at $url.  Returns a hash ref of the form
+# {channel => <channel record>, items => <array ref of item records>}, or
+# undef when the url cannot be retrieved.  A parsed feed is stored in the
+# WebGUI "RSS" cache with a ttl of 3600 and reused until it expires.
+sub _get_rss_data {
+        my ($url) = @_;
+
+        my $cache = WebGUI::Cache->new("url:" . $url, "RSS");
+        if (my $rss_serial = $cache->get) {
+                # treat an unreadable cache entry as a miss rather than dying
+                my $cached = eval { Storable::thaw($rss_serial) };
+                return $cached if (ref($cached) eq 'HASH');
+        }
+
+        my $ua = LWP::UserAgent->new(timeout => 5);
+        my $response = $ua->get($url);
+        if (!$response->is_success()) {
+                warn("Error retrieving url '$url': " .
+                     $response->status_line());
+                return undef;
+        }
+        my $xml = $response->content();
+
+        # there is no encode_entities_numeric that I can find, so the
+        # entity re-encoding of <title> and <description> is skipped. -hal
+        my $rss_lite = {};
+        eval {
+                XML::RSSLite::parseXML($rss_lite, \$xml);
+        };
+        if ($@) {
+                # carry on with whatever was parsed, but report the cause
+                warn("error parsing rss for url $url: $@");
+        }
+
+        # make sure that the {channel} points to the channel description
+        # record and that {items} points to the list of items.  without
+        # this voodoo, different versions of rss return the data in
+        # different places in the data structure.
+        my $rss = {};
+        $rss_lite = {channel => $rss_lite};
+        if (!($rss->{channel} = _find_record($rss_lite, qr/^channel$/))) {
+                warn("unable to find channel info for url $url");
+        }
+        if (!($rss->{items} = _find_record($rss_lite, qr/^items?$/))) {
+                warn("unable to find item info for url $url");
+        }
+        # _find_record may hand back a single hash record; callers expect a list
+        $rss->{items} = [$rss->{items}] if (ref($rss->{items}) eq 'HASH');
+
+        _strip_html($rss);
+        _normalize_items($rss->{items});
+        $cache->set(Storable::freeze($rss), 3600);
+        return $rss;
+}
+
+# rss items don't have a standard date, so each item is stamped with the
+# time we first saw it.  The stamp is kept in the "RSS" cache ('1 year'
+# expiry) under the item's guid, title, description or link.
+sub _assign_rss_dates {
+        my ($items) = @_;
+        for my $item (@{$items}) {
+                my $identity = $item->{guid} || $item->{title} ||
+                  $item->{description} || $item->{link};
+                my $cache = WebGUI::Cache->new('dates:' . $identity, "RSS");
+                my $first_seen = $cache->get();
+                if (!$first_seen) {
+                        # unknown item: remember when it first showed up
+                        $first_seen = time();
+                        $cache->set($first_seen, '1 year');
+                }
+                $item->{date} = $first_seen;
+        }
+}
+
+# Build the item list for an aggregate feed.  One headline is taken from
+# each feed in turn until at least $maxHeadlines are collected or every
+# feed is exhausted; a round is never cut short, so the result can exceed
+# $maxHeadlines.  Items are stamped with first-seen dates, sorted newest
+# first and cached in the "RSS" cache with a ttl of 3600.
+#   $urls - array ref of feed urls;  $obj - the wobject (rssUrl is read)
+# Returns an array ref of hash refs with the keys site_title, site_link,
+# link, title, description and date.
+sub _get_aggregate_items {
+        my ($urls, $obj, $maxHeadlines) = @_;
+
+        # the limit is part of the key so that a list built for one limit is
+        # never served for another (edited wobject, or same urls elsewhere)
+        my $cache = WebGUI::Cache->new("aggregate:" . $maxHeadlines . ":" .
+                                       $obj->get("rssUrl"), "RSS");
+        my $frozen = $cache->get();
+        my $cached = $frozen ? eval { Storable::thaw($frozen) } : undef;
+        return $cached if (ref($cached) eq 'ARRAY');
+
+        # a feed that could not be fetched is undef and contributes nothing
+        my @feeds = map { _get_rss_data($_) } @{$urls};
+
+        my $items = [];
+        my $items_remain = 1;
+        while ((@{$items} < $maxHeadlines) && $items_remain) {
+                $items_remain = 0;
+                for my $rss (@feeds) {
+                        next if (!$rss || (ref($rss->{items}) ne 'ARRAY'));
+                        my $item = shift(@{$rss->{items}});
+                        next if (!$item);
+                        push(@{$items},
+                             {site_title => $rss->{channel}->{title},
+                              site_link => $rss->{channel}->{link},
+                              link => $item->{link},
+                              title => $item->{title},
+                              description => $item->{description},
+                             });
+                        $items_remain = 1 if (@{$rss->{items}});
+                }
+        }
+
+        _assign_rss_dates($items);
+        @{$items} = sort { $b->{date} <=> $a->{date} } @{$items};
+
+        $cache->set(Storable::freeze($items), 3600);
+        return $items;
+}
+
+# Render an aggregate feed with the wobject's own title and description as
+# the channel and the combined headlines of all feeds as the item_loop.
+sub _view_aggregate_feed {
+        my ($urls, $obj, $maxHeadlines) = @_;
+        my $title = $obj->get("title");
+        my $description = $obj->get("description");
+        my $headlines = _get_aggregate_items($urls, $obj, $maxHeadlines);
+        my %var = ('channel.title' => $title, 'channel.description' => $description,
+                   item_loop => $headlines);
+        return $obj->processTemplate($obj->get("templateId"),\%var);
+}
+
+
 #-------------------------------------------------------------------
+# Render one feed: $_[0] is the wobject, $_[1] the headline limit.  A feed
+# that could not be fetched renders with an empty channel and no items.
+sub _view_single_feed {
+        my ($self, $maxHeadlines) = @_;
+        my $rss = _get_rss_data($self->get("rssUrl")) || {};
+        my $channel = (ref($rss->{channel}) eq 'HASH') ? $rss->{channel} : {};
+        my $feed_items = (ref($rss->{items}) eq 'HASH') ? [$rss->{items}] : ($rss->{items} || []);
+        my %var = ("channel.title" => $channel->{title},
+                   "channel.link" => $channel->{link});
+        # the description lives in the channel record, not at the top level
+        $var{"channel.description"} = $channel->{description};
+        my @items;
+        for my $item (@{$feed_items}) {
+                last if (@items >= $maxHeadlines);
+                push(@items, {link=>$item->{link}, title=>$item->{title},
+                              description=>$item->{description}});
+        }
+        $var{item_loop} = \@items;
+        return $self->processTemplate($self->get("templateId"),\%var);
+}
+
 sub www_view {
-	my %rss;
-	my $cache = WebGUI::Cache->new($_[0]->get("rssUrl"),"URL");
-	my $rssFile = $cache->get;
-	unless (defined $rssFile) {
-		$rssFile = $cache->setByHTTP($_[0]->get("rssUrl"),3600);
-	}
-	$rssFile =~ s#(<title>)(.*?)(</title>)#$1.HTML::Entities::encode_entities(decode_entities($2)).$3#ges; 
-	$rssFile =~ s#(<description>)(.*?)(</description>)#$1.HTML::Entities::encode_entities(decode_entities($2)).$3#ges; 
-	eval{parseRSS(\%rss, \$rssFile)};
-	if ($@) {
-		WebGUI::ErrorHandler::warn($_[0]->get("rssUrl")." ".$@);
-	}
-	my %var;
-	$var{"channel.title"} = $rss{title} || $rss{channel}{title} || $rss{RDF}{channel}{title};
-        $var{"channel.link"} = $rss{link} || $rss{channel}{link} || $rss{RDF}{channel}{link};
-        $var{"channel.description"} = $rss{description} || $rss{channel}{description} || $rss{RDF}{channel}{description};
-	my @items;
-	my $rssItem = \$rss{item};
-	$rssItem = \$rss{RDF}{item} unless ($rss{item});
-	$rssItem = \[ $$rssItem ] unless (ref $$rssItem eq 'ARRAY'); 
-        foreach my $item (@{$$rssItem}) {
-		push (@items,{
-			link=>$item->{link},
-			title=>$item->{title},
-			description=>$item->{description}
-			});
-	}
-	$var{item_loop} = \@items;
-	return $_[0]->processTemplate($_[0]->get("templateId"),\%var);
+        # Zero/unset maxHeadlines means "no limit"; use a huge cap instead.
+        my $maxHeadlines = $_[0]->get("maxHeadlines") || 1000000;
+        # split(' ') ignores leading whitespace, so a padded single URL is not
+        # mistaken for an aggregate feed with an empty first entry.
+        my @urls = split(' ', $_[0]->get("rssUrl"));
+        return _view_single_feed($_[0], $maxHeadlines) if (@urls == 1);
+        # Zero or several URLs: interleave the feeds into one list.
+        return _view_aggregate_feed(\@urls, $_[0], $maxHeadlines);
 }