Significant update to the SyndicatedContent wobject to allow for better aggregation and scheduled content downloads.

This commit is contained in:
Daniel Collis-Puro 2005-06-15 02:29:12 +00:00
parent c7c968eaf2
commit 2fd3cb57f3
6 changed files with 519 additions and 150 deletions

View file

@ -25,11 +25,43 @@ use WebGUI::Asset::Wobject;
use XML::RSSLite;
use LWP::UserAgent;
use WebGUI::ErrorHandler;
use POSIX qw/floor/;
my $hasEncode=1;
eval " use Encode qw(from_to); "; $hasEncode=0 if $@;
our @ISA = qw(WebGUI::Asset::Wobject);
=head1 NAME
Package WebGUI::Asset::Wobject::SyndicatedContent
=head1 DESCRIPTION
Displays items and channels from RSS feeds.
=head1 SYNOPSIS
use WebGUI::Asset::Wobject::SyndicatedContent;
=head1 METHODS
These methods are available from this class:
=cut
#-------------------------------------------------------------------
=head2 definition ( definition )
Defines the properties of this asset.
=head3 definition
A hash reference passed in from a subclass definition.
=cut
#-------------------------------------------------------------------
@ -46,19 +78,31 @@ sub definition {
},
rssUrl=>{
defaultValue=>undef,
fieldType=>"url"
fieldType=>"textarea"
},
maxHeadlines=>{
fieldType=>"integer",
defaultValue=>10
},
displayMode=>{
fieldType=>"text",
defaultValue=>"interleaved"
},
hasTerms=>{
fieldType=>"text",
defaultValue=>""
}
}
});
return $class->SUPER::definition($definition);
}
=head2 getIcon ()
Returns the icon associated with this asset.
=cut
#-------------------------------------------------------------------
sub getIcon {
my $self = shift;
my $small = shift;
@ -66,43 +110,81 @@ sub getIcon {
return $session{config}{extrasURL}.'/assets/syndicatedContent.gif';
}
#-------------------------------------------------------------------
=head2 getName ()
Returns the displayable name of this asset.
=cut
sub getName {
	# Look up entry 2 of the Asset_SyndicatedContent namespace, which holds
	# the displayable name of this asset type.
	my @lookup = (2, "Asset_SyndicatedContent");
	return WebGUI::International::get(@lookup);
}
#-------------------------------------------------------------------
=head2 getUiLevel ()
Returns the UI level of this asset.
=cut
sub getUiLevel {
	# The UI level of this asset is a fixed value.
	my $ui_level = 6;
	return $ui_level;
}
#-------------------------------------------------------------------
=head2 getEditForm ()
Returns the TabForm object that will be used in generating the edit page for this asset.
=cut
# Builds the edit form for this asset on top of the parent class's TabForm.
# Adds a template selector, the displayMode select list, the hasTerms text
# field and the maxHeadlines integer field to the "display" tab, and the
# rssUrl textarea to the "properties" tab. Returns the TabForm object.
#
# NOTE(review): this block appears to contain both the old and the new side of
# a change: "-namespace" is listed twice (the first time without a trailing
# comma) and a "properties"->url( call is opened but never closed. Confirm
# which lines are current before relying on this code.
sub getEditForm {
my $self = shift;
# Start from the form the parent class generates.
my $tabform = $self->SUPER::getEditForm();
# Template selector, pre-set to the asset's current templateId.
$tabform->getTab("display")->template(
-value=>$self->getValue('templateId'),
-namespace=>"SyndicatedContent"
# NOTE(review): duplicate of the argument above -- presumably its replacement.
-namespace=>"SyndicatedContent",
-label=>WebGUI::International::get(72,"Asset_SyndicatedContent"),
);
# NOTE(review): the call opened on the next line is never closed -- presumably
# a leftover of the old single-URL field.
$tabform->getTab("properties")->url(
# Choice between the two display modes: 'interleaved' and 'grouped'.
$tabform->getTab("display")->selectList(
-name=>"displayMode",
-options=>{
'interleaved'=>WebGUI::International::get("interleaved","Asset_SyndicatedContent"),
'grouped'=>WebGUI::International::get("grouped","Asset_SyndicatedContent"),
},
-sortByValue=>1,
-label=>WebGUI::International::get("displayModeLabel","Asset_SyndicatedContent"),
-value=>[$self->getValue('displayMode')],
-subtext=>WebGUI::International::get("displayModeSubtext","Asset_SyndicatedContent"),
);
# Free-text field for the hasTerms property (at most 255 characters).
$tabform->getTab("display")->text(
-name=>"hasTerms",
-label=>WebGUI::International::get("hasTermsLabel","Asset_SyndicatedContent"),
-maxlength=>255,
-value=>$self->getValue("hasTerms"),
);
# rssUrl is edited in a textarea; the view code splits it on whitespace.
$tabform->getTab("properties")->textarea(
-name=>"rssUrl",
-label=>WebGUI::International::get(1,"Asset_SyndicatedContent"),
-value=>$self->getValue("rssUrl")
);
# Integer field for the maxHeadlines property.
$tabform->getTab("display")->integer(
-name=>"maxHeadlines",
-label=>WebGUI::International::get(3,"Asset_SyndicatedContent"),
-value=>$self->getValue("maxHeadlines")
);
#$tabform->addTab("rss",WebGUI::International::get("rssTabName","Asset_SyndicatedContent"));
return $tabform;
}
#-------------------------------------------------------------------
# strip all html tags from the given data structure. This is important to
# prevent cross site scripting attacks
my $_stripped_html = {};
#my $_stripped_html = {};
sub _strip_html {
#my ($data) = @_;
@ -129,6 +211,7 @@ sub _strip_html {
#-------------------------------------------------------------------
# horrible kludge to find the channel or item record
# in the varying kinds of rss structures returned by RSSLite
sub _find_record {
my ($data, $regex) = @_;
@ -166,6 +249,7 @@ sub _find_record {
# the guid, if it is a link, always means the former.
# Also copy the first few words of the description into the title if
# there is no title
sub _normalize_items {
#my ($items) = @_;
@ -195,7 +279,7 @@ sub _normalize_items {
#-------------------------------------------------------------------
sub _get_rss_data {
my ($url) = @_;
my $url = shift;
my $cache = WebGUI::Cache->new("url:" . $url, "RSS");
my $rss_serial = $cache->get;
@ -223,19 +307,16 @@ sub _get_rss_data {
}
}
# there is no encode_entities_numeric that I can find, so I am
# commenting this out. -hal
# $xml =~ s#(<title>)(.*?)(</title>)#$1.encode_entities_numeric(decode_entities($2)).$3#ges;
# $xml =~ s#(<description>)(.*?)(</description>)#$1.encode_entities_numeric(decode_entities($2)).$3#ges;
my $rss_lite = {};
eval {
XML::RSSLite::parseXML($rss_lite, \$xml);
};
if ($@) {
WebGUI::ErrorHandler::warn("error parsing rss for url $url");
WebGUI::ErrorHandler::warn("error parsing rss for url $url :".$@);
#Returning undef on a parse failure is a change from previous behaviour,
#but it SHOULDN'T have a major effect.
return undef;
}
# make sure that the {channel} points to the channel
@ -243,6 +324,7 @@ sub _get_rss_data {
# of items. without this voodoo, different versions of
# rss return the data in different places in the data
# structure.
$rss_lite = {channel => $rss_lite};
if (!($rss->{channel} =
_find_record($rss_lite, qr/^channel$/))) {
@ -251,13 +333,19 @@ sub _get_rss_data {
if (!($rss->{items} = _find_record($rss_lite, qr/^items?$/))) {
WebGUI::ErrorHandler::warn("unable to find item info for url $url");
$rss->{items} = [];
}
}
_strip_html($rss);
$rss->{items} = [ $rss->{items} ] unless (ref $rss->{items} eq 'ARRAY');
_normalize_items($rss->{items});
#Assign dates "globally" rather than when seen in a viewed feed.
#This is important because we can "filter" now and want to ensure we keep order
#correctly as new items appear.
_assign_rss_dates($rss->{items});
#Default to an hour timeout
$cache->set(Storable::freeze($rss), 3600);
}
@ -268,6 +356,7 @@ sub _get_rss_data {
# rss items don't have a standard date, so timestamp them the first time
# we see them and use that timestamp as the date. Periodically nuke the
# whole database to keep the thing from growing too large
sub _assign_rss_dates {
my ($items) = @_;
@ -285,113 +374,189 @@ sub _assign_rss_dates {
}
#-------------------------------------------------------------------
sub _get_aggregate_items {
# $items is the hashref to put items into.
# $rss_feeds is an arrayref of all the feeds in this wobject
# The only difference between an "interleaved" feed and a grouped feed
# is the order the items are output.
# Fills @$items the same way _create_interleaved_items does, then reorders the
# collected items so that all items of one feed sit together (sorted by the
# feed's channel title) and flags the first item of every group.
#
#   $items          arrayref the selected items are pushed into (modified in place)
#   $rss_feeds      arrayref of parsed feeds ({channel => {...}, items => [...]})
#   $maxHeadlines   upper bound on the number of items collected
#   $hasTermsRegex  optional filter pattern; false means "no filtering"
#
# Nothing is returned; the caller reads the result from @$items.
sub _create_grouped_items{
	my($items,$rss_feeds,$maxHeadlines,$hasTermsRegex)=@_;
	_create_interleaved_items($items,$rss_feeds,$maxHeadlines,$hasTermsRegex);

	# Group by feed title. A feed without a title sorts as the empty string
	# instead of being compared as undef.
	@$items = sort {
		(defined $a->{site_title} ? $a->{site_title} : '')
			cmp
		(defined $b->{site_title} ? $b->{site_title} : '')
	} @$items;

	# Mark the first item of each group with new_rss_site so a template can
	# emit a per-site heading. The very first item always opens a group, even
	# when its feed has no title.
	my $previous_title;
	my $is_first = 1;
	foreach my $item (@$items){
		my $title = defined $item->{site_title} ? $item->{site_title} : '';
		if($is_first || $title ne $previous_title){
			$item->{new_rss_site}=1;
		}
		$previous_title = $title;
		$is_first = 0;
	}
}
#-------------------------------------------------------------------
# Loop through the feeds for this wobject
# and push in the items in "interleaved mode"
# No need to return because we're doing everything by reference.
# Round-robin over the feeds, taking one item from each feed per round, until
# $maxHeadlines items have been collected or every feed is exhausted.
#
#   $items          arrayref the selected items are pushed into (modified in place)
#   $rss_feeds      arrayref of parsed feeds ({channel => {...}, items => [...]});
#                   items are shifted off each feed's list as they are consumed
#   $maxHeadlines   upper bound on the number of items collected
#   $hasTermsRegex  optional filter pattern; when true, only items accepted by
#                   _check_hasTerms are kept
#
# Every consumed item is tagged with site_title / site_link from its channel.
# Nothing is returned; the caller reads the result from @$items.
sub _create_interleaved_items{
	my($items,$rss_feeds,$maxHeadlines,$hasTermsRegex)=@_;
	my $items_remain = 1;
	while((@$items < $maxHeadlines) && $items_remain){
		# Reset once per round, not once per feed: otherwise the flag only
		# reflects the last feed and the loop stops as soon as that feed runs
		# dry, even though earlier feeds still have items.
		$items_remain=0;
		foreach my $rss (@$rss_feeds){
			# Once the limit is reached no later feed can contribute either.
			last unless @$items < $maxHeadlines;
			next unless defined $rss->{items};

			my $item = shift @{$rss->{items}};
			next unless $item;

			$item->{site_title}=$rss->{channel}->{title};
			$item->{site_link}=$rss->{channel}->{link};
			if(! $hasTermsRegex || _check_hasTerms($item,$hasTermsRegex)){
				push(@{$items},$item);
			}
			# Any feed with leftovers warrants another round.
			$items_remain = 1 if @{$rss->{items}};
		}
	}
}
#-------------------------------------------------------------------
# Uses the regex constructed in _get_items (with the terms defaulting to OR)
# to see if the title or description associated with this item match the kinds
# of items we're looking for.
#
# Returns 1 when the item's title or description matches the given pattern
# (case-insensitively), 0 otherwise.
sub _check_hasTerms{
	my ($entry, $pattern) = @_;
	# Title and description are searched as one string.
	my $haystack = $entry->{title} . $entry->{description};
	return ($haystack =~ /$pattern/gism) ? 1 : 0;
}
################################################################################
# Turns a comma-separated list of search terms into a regex alternation
# ("foo, bar" becomes "foo|bar") for use by _check_hasTerms.
#
# Each term is trimmed and escaped with quotemeta so it is matched literally;
# a term such as "c++" would otherwise be an invalid pattern. Empty terms are
# dropped because an empty alternative matches every item.
# Returns the empty string when there are no usable terms.
sub _make_regex{
	my $terms = shift;
	return '' unless defined $terms;

	my @alternatives;
	foreach my $term (split(/,/, $terms)) {
		$term =~ s/^\s+//;
		$term =~ s/\s+$//;
		next if $term eq '';
		push(@alternatives, quotemeta($term));
	}
	return join("|", @alternatives);
}
#############################
#-------------------------------------------------------------------
# So- We're going to manage an "aggregate cache" that represents
# the rendering of the cumulative feeds in a Syndicated Wobject,
# but let each feed "fend for itself" based on URL in the cache.
#
# This means we can set up the hourly task to get and cache each
# individual feed WITHOUT having to re-request (undoubtedly the slowest
# part of every RSS parsing action is the network traffic) each feed
# when we re-render each aggregate representation.
#
# If, however, a feed expires between hourly tasks, it will be re-requested and
# parsed per the usual. BUT, if a feed ever goes un-requested for more than an hour,
# then its retrieval schedule will be taken over by the hourly task, and we'll
# be pre-seeding the RSS object cache automatically.
#
# Having the caching set up this way means we can re-use the same raw feed all over the site without
# having each wobject request it separately, ASSUMING the URL is the same.
#
# All the values that may have an effect on the composition of items
# are included in the cache key for the aggregate representation.
# Collects the items to display for this wobject, using a cached aggregate
# when one exists and rebuilding it from the individual feeds otherwise.
#
# NOTE(review): this block appears to contain both the old and the new side of
# a change: $maxHeadlines and $cache are each declared twice, the old
# round-robin loop over @rsss sits next to the new displayMode dispatch, and
# the braces do not balance. Confirm which lines are current before relying on
# this code.
sub _get_items {
my $self = shift;
my $urls = shift;
my $maxHeadlines = shift;
# Cache keyed on the raw rssUrl property only.
my $cache = WebGUI::Cache->new("aggregate:" .
$self->get("rssUrl"), "RSS");
# Settings that influence which items are selected and in what order.
my $displayMode=$self->getValue('displayMode');
my $hasTermsRegex=_make_regex($self->getValue('hasTerms'));
my $maxHeadlines=$self->getValue('maxHeadlines');
# Cache key built from the display mode, term filter, headline limit and feed URLs.
my $key=join(":",("aggregate", $displayMode,$hasTermsRegex,$maxHeadlines,$self->get("rssUrl")));
my $cache = WebGUI::Cache->new($key, "RSS");
# The cached value is a Storable-frozen item list (see the freeze call below).
my $items = Storable::thaw($cache->get());
my @rss_feeds;
# No usable cached list: rebuild it from the individual feeds.
if (!$items) {
$items = [];
my $items_remain = 1;
my @rsss;
for my $url (@{$urls}) {
push(@rsss, _get_rss_data($url));
# Only feeds for which _get_rss_data returned a true value are kept.
my $rss_info=_get_rss_data($url);
push(@rss_feeds, $rss_info) if($rss_info);
}
# Take one item from each feed in @rsss per pass until $maxHeadlines items
# are collected or no feed has items left.
while ((@{$items} < $maxHeadlines) && $items_remain) {
$items_remain = 0;
for my $rss (@rsss) {
if ($rss->{items} &&
(my $item = shift(@{$rss->{items}}))) {
push(@{$items},
{site_title => $rss->{channel}->{title},
site_link => $rss->{channel}->{link},
link => $item->{link},
title => $item->{title},
description => $item->{description},
});
if (@{$rss->{items}}) {
$items_remain = 1;
}
}
}
}
_assign_rss_dates($items);
#Sort feeds in order by channel title.
#@rss_feeds=sort{$a->{channel}->{title} cmp $b->{channel}->{title}} @rss_feeds;
# Dispatch on display mode; anything other than 'grouped' is interleaved.
if ($displayMode eq 'grouped') {
_create_grouped_items($items,\@rss_feeds,$maxHeadlines,$hasTermsRegex);
} else {
_create_interleaved_items($items,\@rss_feeds,$maxHeadlines,$hasTermsRegex);
}
# Sort by the items' date field, highest value first.
@{$items} = sort { $b->{date} <=> $a->{date} } @{$items};
#if (@{$items} > $_aggregate_size) {
# @{$items} = @{$items}[0..($_aggregate_size-1)];
#}
# Store the frozen item list with a 3600 second timeout.
$cache->set(Storable::freeze($items), 3600);
}
}
return $items;
}
#-------------------------------------------------------------------
# interleave stories from each feed, up to a total of $_aggregate_size
# Renders several feeds as one channel: the wobject's own title and
# description are used as channel info and the item loop comes from
# _get_aggregate_items.
#
# NOTE(review): the lines after the first return are unreachable and use
# $items and @rss_feeds, which are not declared in this sub -- they look like
# lines from another routine merged in. Confirm before relying on this code.
sub _view_aggregate_feed {
my $self = shift;
my $urls = shift;
my $maxHeadlines = shift;
my %var;
# Channel info comes from the wobject itself, not from any single feed.
$var{'channel.title'} = $self->get("title");
$var{'channel.description'} = $self->get("description");
$var{item_loop} = $self->_get_aggregate_items($urls, $maxHeadlines);
return $self->processTemplate(\%var,$self->get("templateId"));
#So return the item loop and the first RSS feed, because
#when we're parsing a single feed we can use that feed's title and
#description for channel.title, channel.link, and channel.description
return ($items,\@rss_feeds);
}
=head2 view()
#-------------------------------------------------------------------
# Renders a single feed: channel title, link and description are taken from
# the feed itself, and at most $maxHeadlines of its items (link, title,
# description) are handed to the template as item_loop.
sub _view_single_feed {
	my ($self, $maxHeadlines) = @_;
	my $feed = _get_rss_data($self->get("rssUrl"));

	my %var = (
		"channel.title"       => $feed->{channel}->{title},
		"channel.link"        => $feed->{channel}->{link},
		"channel.description" => $feed->{channel}->{description},
	);

	# A feed without items is treated as an empty list.
	$feed->{items} ||= [];

	my @headlines;
	foreach my $index (0 .. $#{$feed->{items}}) {
		last unless $index < $maxHeadlines;
		my $entry = $feed->{items}->[$index];
		# Copy only the fields the template needs.
		push(@headlines, {
			link        => $entry->{link},
			title       => $entry->{title},
			description => $entry->{description},
		});
	}
	$var{item_loop} = \@headlines;

	return $self->processTemplate(\%var,$self->get("templateId"));
}
Returns the rendered output of the wobject.
=cut
#-------------------------------------------------------------------
# Renders the wobject: splits the rssUrl property into individual feed URLs,
# fetches the items and passes them to the asset's template.
#
# NOTE(review): this block appears to contain both the old and the new side of
# a change: @urls is declared twice, and the first if/else returns on both
# branches, which makes everything after it unreachable. Confirm which lines
# are current before relying on this code.
sub view {
my $self = shift;
# Log the view only when passive profiling is enabled in the settings.
$self->logView() if ($session{setting}{passiveProfilingEnabled});
# An unset or zero maxHeadlines falls back to 1000000.
my $maxHeadlines = $self->get("maxHeadlines") || 1000000;
# rssUrl holds one or more URLs separated by whitespace.
my @urls = split(/\s+/,$self->get("rssUrl"));
# Exactly one URL goes to the single-feed renderer, anything else to the aggregate one.
if (@urls == 1) {
return $self->_view_single_feed($maxHeadlines);
} else {
return $self->_view_aggregate_feed(\@urls, $maxHeadlines);
}
my @urls = split(/\s+/,$self->get("rssUrl"));
my %var;
# _get_items is called in list context here and is expected to yield the
# item list and an arrayref of the feeds.
my($item_loop,$rss_feeds)=$self->_get_items(\@urls, $maxHeadlines);
if(@$rss_feeds > 1){
#If there is more than one (valid) feed in this wobject, put in the wobject description info.
$var{'channel.title'} = $self->get("title");
$var{'channel.description'} = $self->get("description");
} else {
#One feed. Put in the info from the feed.
$var{"channel.title"} = $rss_feeds->[0]->{channel}->{title};
$var{"channel.link"} = $rss_feeds->[0]->{channel}->{link};
$var{"channel.description"} = $rss_feeds->[0]->{channel}->{description};
}
$var{item_loop} = $item_loop;
return $self->processTemplate(\%var,$self->get("templateId"));
}
#-------------------------------------------------------------------
=head2 www_edit()
Sets parameters and returns a form to edit this wobject.
=cut
# Web entry point for editing this wobject. Returns the insufficient-privilege
# page unless the current user may edit the asset; otherwise renders the edit
# form inside the admin console.
sub www_edit {
my $self = shift;
return WebGUI::Privilege::insufficient() unless $self->canEdit;
# NOTE(review): setHelp is called twice with different namespaces; presumably
# the first call is a leftover and the "Asset_SyndicatedContent" one is
# current -- confirm.
$self->getAdminConsole->setHelp("syndicated content add/edit","SyndicatedContent");
$self->getAdminConsole->setHelp("syndicated content add/edit","Asset_SyndicatedContent");
# The console title is internationalized string "4" of Asset_SyndicatedContent.
return $self->getAdminConsole->render($self->getEditForm->print,WebGUI::International::get("4","Asset_SyndicatedContent"));
}