- The Syndicated Content asset was rewritten, and now uses 35% less memory and is 400% faster.

This commit is contained in:
JT Smith 2008-11-06 02:32:10 +00:00
parent d34eadc40e
commit ef6aedc862
12 changed files with 393 additions and 2180 deletions

View file

@ -14,16 +14,13 @@ use strict;
use HTML::Entities;
use Tie::IxHash;
use WebGUI::Cache;
use WebGUI::Exception;
use WebGUI::HTML;
use WebGUI::International;
use WebGUI::Asset::Wobject;
use base 'WebGUI::Asset::Wobject';
use WebGUI::Macro;
use XML::RSSLite;
use XML::RSS::Creator;
use LWP::UserAgent;
use Encode;
use XML::FeedPP;
our @ISA = qw(WebGUI::Asset::Wobject);
=head1 NAME
@ -31,7 +28,7 @@ Package WebGUI::Asset::Wobject::SyndicatedContent
=head1 DESCRIPTION
Displays items and channels from RSS feeds.
Displays items and channels from RSS/Atom/RDF feeds.
=head1 SYNOPSIS
@ -43,55 +40,6 @@ These methods are available from this class:
=cut
#-------------------------------------------------------------------
# Populate an RSS generator object from the template variables.
#
#   $self      - this asset; supplies the fallback title and the page URL.
#   $rssObject - object providing channel() and add_item().
#   $var       - hash ref holding 'channel.title', 'channel.description'
#                and 'item_loop' (array ref of item hash refs).
sub _constructRSS {
    my ($self, $rssObject, $var) = @_;

    # Channel header: prefer what the source feed supplied, fall back to
    # the asset's own title and an empty description.
    $rssObject->channel(
        title       => $var->{'channel.title'} || $self->get('title'),
        link        => $self->session->url->page('',1),
        description => $var->{'channel.description'} || ''
    );

    # XML::RSSLite parses loosely and sometimes yields nested data
    # structures where a plain string is expected, so only non-reference
    # attributes are handed on to the generator.
    foreach my $entry (@{$var->{item_loop}}) {
        my @plain_fields = grep { !ref($entry->{$_}) } keys %$entry;
        $rssObject->add_item(map { $_ => $entry->{$_} } @plain_fields);
    }
}
#-------------------------------------------------------------------
# Add the per-version RSS URLs to the template variable hash.
#
# Sets 'rss.url.<version>' for each supported RSS version and 'rss.url'
# (the 2.0 feed) on $var, using $self->getUrl to build each address.
sub _createRSSURLs {
    my ($self, $var) = @_;

    # [ RSS version, suffix of the www_viewRSS* method serving it ]
    my @flavors = (
        [ '1.0',  '10'  ],
        [ '0.9',  '090' ],
        [ '0.91', '091' ],
        [ '2.0',  '20'  ],
    );

    for my $flavor (@flavors) {
        my ($version, $suffix) = @{$flavor};
        $var->{'rss.url.'.$version} = $self->getUrl('func=viewRSS'.$suffix);
    }

    $var->{'rss.url'} = $self->getUrl('func=viewRSS20');
}
#-------------------------------------------------------------------
# Return the configured headline limit, or 1000000 when the
# 'maxHeadlines' property is unset or zero.
sub _getMaxHeadlines {
    my ($self) = @_;
    my $limit = $self->get('maxHeadlines');
    return $limit ? $limit : 1000000;
}
#-------------------------------------------------------------------
# Return the whitespace-separated entries of the asset's RSS URL setting
# that start with "http"; every other entry is discarded.
sub _getValidatedUrls {
    my ($self) = @_;
    return grep { m/^http/ } split(/\s+/,$self->getRssUrl);
}
#-------------------------------------------------------------------
@ -129,6 +77,7 @@ sub appendChoppedDescriptionTemplateVars {
$item->{"descriptionFirstSentence"} =~ s/^(.*?\.).*/$1/s;
}
#-------------------------------------------------------------------
=head2 definition ( definition )
@ -179,26 +128,13 @@ sub definition {
label=>$i18n->get('process macros in rss url'),
hoverHelp=>$i18n->get('process macros in rss url description'),
},
maxHeadlines=>{
tab=>"properties",
maxHeadlines=>{
tab=>"display",
fieldType=>'integer',
defaultValue=>10,
label=>$i18n->get(3),
hoverHelp=>$i18n->get('3 description')
},
displayMode=>{
tab=>"display",
fieldType=>'selectBox',
defaultValue=>'interleaved',
options=>{
'interleaved'=>$i18n->get('interleaved'),
'grouped'=>$i18n->get('grouped'),
},
sortByValue=>1,
label=>$i18n->get('displayModeLabel'),
hoverHelp=>$i18n->get('displayModeLabel description'),
subtext=>$i18n->get('displayModeSubtext')
},
hasTerms=>{
tab=>"properties",
fieldType=>'text',
@ -221,363 +157,115 @@ sub definition {
}
#-------------------------------------------------------------------
# strip all html tags from the given data structure. This is important to
# prevent cross site scripting attacks
# Recursively run every plain scalar in the given value through
# WebGUI::HTML::filter(..., 'all'), modifying the data in place.
#
# Argument: a plain scalar, a hash reference or an array reference.
# Returns:  the filtered scalar, or the same reference that was passed in.
#
# The function depends on Perl aliasing throughout: $_[0] aliases the
# caller's variable, and the loop variables below alias the hash values and
# array elements, so assignments made in the recursive call reach the
# original data structure.
sub _strip_html {
# Plain scalar: overwrite the caller's value through the @_ alias.
unless (ref $_[0]) {
return $_[0] = WebGUI::HTML::filter($_[0], 'all');
}
my $ref = shift;
if (ref $ref eq 'HASH') {
# Entities in 'description' are decoded before the values are filtered,
# presumably so that entity-encoded markup is also seen by the filter.
if (exists $ref->{description}) {
$ref->{description} = HTML::Entities::decode_entities($ref->{description});
}
# values() yields aliases, so the recursive call edits the hash itself.
foreach my $value (values %$ref) {
_strip_html($value);
}
}
elsif (ref $ref eq 'ARRAY') {
# foreach aliases each element, so the array is edited in place as well.
foreach my $value (@$ref) {
_strip_html($value);
}
}
# Any other reference type is returned untouched.
return $ref;
}
=head2 generateFeed ()
#-------------------------------------------------------------------
# horrible kludge to find the channel or item record
# in the varying kinds of rss structures returned by RSSLite
Combines all feeds into a single XML::FeedPP object.
sub _find_record {
my ($data, $regex) = @_;
=cut
if (ref($data) eq 'HASH') {
# reset the hash before calling each()
keys(%{$data});
while (my ($name, $val) = each(%{$data})) {
if ($name =~ $_[1]) {
if ((((ref($val) eq 'HASH') &&
($val->{link} || $val->{title} ||
$val->{description})) ||
((ref($val) eq 'ARRAY') && @{$val} &&
(ref($val->[0]) eq 'HASH') &&
($val->[0]->{link} ||
$val->[0]->{title} ||
$val->[0]->{description})))) {
return $val;
}
}
if (my $record = _find_record($val, $regex)) {
return $record;
}
}
}
return undef;
}
#-------------------------------------------------------------------
# First, get rid of things we don't want.
# Copy the guid field to the link field if the guid looks like a link.
# This is a kludge that gets around the fact that some folks use the link
# field as the link to the story while others use it as the link
# to the story about which the story is written. The WebGUI templates seem
# to assume the former, so we should use the guid instead of the link, b/c
# the guid, if it is a link, always means the former.
# Also copy the first few words of the description into the title if
# there is no title
# Normalize every item of a parsed feed in place.
#
# Argument: array ref of item hash refs (modified in place).
#
# For each item this:
#   * keeps only the title, link, description and pubDate keys;
#   * replaces the link with the guid when the guid looks like an
#     http:// URL;
#   * synthesizes a title from the first words of the description when the
#     item has none;
#   * rewrites &apos; (not recognized by IE) to a literal apostrophe;
#   * wraps category in an array ref;
#   * adds the chopped description template variables.
sub _normalize_items {
    my ($items) = @_;

    # Maximum number of description words used to fill in an empty title.
    my $max_words = 10;

    # The only keys of the source item that survive normalization.
    my @wanted_keys = qw( title link description pubDate );

    for my $item (@{$items}) {
        # Remember the guid before the whitelist below discards it.
        # Previously the guid was tested after the key had already been
        # removed, so the guid-to-link substitution could never happen.
        my $guid = $item->{guid};

        # Get rid of any keys in the item that we do not want.
        %{$item} = map { $_ => $item->{$_} } @wanted_keys;

        # Some feeds use <link> for the page the story is about rather than
        # the story itself; a URL-shaped guid always identifies the story.
        if ($guid && ($guid =~ /^http:\/\//i)) {
            $item->{link} = $guid;
        }

        if (!$item->{title}) {
            my @description_words = split(/\s/, $item->{description});
            if (@description_words <= $max_words) {
                $item->{title} = $item->{description};
            }
            else {
                $item->{title} = join(' ', @description_words[0..$max_words-1]) .
                    ' ...';
            }
        }

        # IE doesn't recognize &apos;
        $item->{title} =~ s/&apos;/\'/g;
        $item->{description} =~ s/&apos;/\'/g;

        # NOTE(review): category is not in the whitelist above, so this
        # always produces [undef]. That matches the previous behaviour;
        # confirm whether the source category was meant to be kept.
        $item->{category} = [$item->{category}]
            if ref $item->{category} ne 'ARRAY';

        appendChoppedDescriptionTemplateVars($item);
    }
}
#-------------------------------------------------------------------
# Fetch, parse and normalize a single feed, using the 'RSS' cache namespace.
#
# Arguments: $session - the WebGUI session (used for the cache and logging)
#            $url     - the feed URL
# Returns:   a hash ref with the keys channel, items (array ref) and
#            last_modified, or undef when the download or the parse failed.
#
# A successful result is cached for an hour. A failed HTTP request is
# cached for an hour as an error marker, so the URL is not re-requested
# during that time. A parse failure is not cached.
sub _get_rss_data {
my $session = shift;
my $url = shift;
# format of cache was changed, differentiate
my $cache = WebGUI::Cache->new($session,'url2:' . $url, 'RSS');
my $rss = $cache->get;
if ($rss) {
# A cached error marker (stored below on HTTP failure) counts as "no data".
if ($rss->{error}) {
return undef;
}
return $rss;
}
else {
# Nothing cached: download the feed with a 5 second timeout, using any
# proxy configured through the environment.
my $ua = LWP::UserAgent->new(timeout => 5);
$ua->env_proxy;
my $response = $ua->get($url);
if (!$response->is_success()) {
$session->errorHandler->warn("Error retrieving url '$url': " .
$response->status_line());
$cache->set({'error' => 1, 'error_status' => $response->status_line}, 3600);
return undef;
}
# If the XML declaration names an encoding, pass it to decoded_content as
# the charset; otherwise decoded_content is called with no options.
my $xmlEncoding;
if ($response->content =~ /<\?xml.*?encoding=['"](\S+)['"]/i) {
$xmlEncoding = $1;
}
my $xml = $response->decoded_content($xmlEncoding ? (charset => $xmlEncoding) : ());
# Approximate with current time if we don't have a Last-Modified
# header coming from the RSS source.
my $http_lm = $response->last_modified;
my $last_modified = defined($http_lm)? $http_lm : time;
# XML::RSSLite does not handle <![CDATA[ ]]> so:
# each CDATA section is replaced by its entity-encoded content.
$xml =~ s/<!\[CDATA\[(.*?)\]\]>/HTML::Entities::encode_entities($1)/esg;
my $rss_lite = {};
# The parser may die on bad input; trap that and report it below.
eval {
XML::RSSLite::parseXML($rss_lite, \$xml);
};
if ($@) {
$session->errorHandler->warn("error parsing rss for url $url :".$@);
#Returning undef on a parse failure is a change from previous behaviour,
#but it SHOULDN'T have a major effect.
return undef;
}
# make sure that the {channel} points to the channel
# description record and that {items} points to the list
# of items. without this voodoo, different versions of
# rss return the data in different places in the data
# structure.
# Wrapping the parse result under a 'channel' key lets _find_record match
# the top-level record itself as well as anything nested inside it.
$rss_lite = {channel => $rss_lite};
$rss = {};
if (!($rss->{channel} =
_find_record($rss_lite, qr/^channel$/))) {
$session->errorHandler->warn("unable to find channel info for url $url");
}
if (!($rss->{items} = _find_record($rss_lite, qr/^items?$/))) {
$session->errorHandler->warn("unable to find item info for url $url");
$rss->{items} = [];
}
# Filter every string in the result before it is normalized and cached.
_strip_html($rss);
# A feed with a single item may come back as one hash; force a list.
$rss->{items} = [ $rss->{items} ] unless (ref $rss->{items} eq 'ARRAY');
_normalize_items($rss->{items});
#Assign dates "globally" rather than when seen in a viewed feed.
#This is important because we can "filter" now and want to ensure we keep order
#correctly as new items appear.
_assign_rss_dates($session, $rss->{items});
# Store last-modified date as well.
$rss->{last_modified} = $last_modified;
#Default to an hour timeout
$cache->set($rss, 3600);
}
return $rss;
}
#-------------------------------------------------------------------
# rss items don't have a standard date, so timestamp them the first time
# we see them and use that timestamp as the date. Periodically nuke the
# whole database to keep the thing from growing too large
# Give every item a 'date' and sort the list newest first, in place.
#
# Arguments: $session - the WebGUI session
#            $items   - array ref of item hash refs
#
# The date of an item is remembered in the 'RSS' cache for a year, keyed by
# the item's guid, title, description or link (first one that is set). An
# item seen for the first time gets its parsed pubDate, or, when it has no
# usable pubDate, a timestamp one year in the past so that undated items
# sort below dated ones.
sub _assign_rss_dates {
    my ($session, $items) = @_;

    foreach my $entry (@{$items}) {
        my $identity = $entry->{guid} || $entry->{title} ||
            $entry->{description} || $entry->{link};
        my $store = WebGUI::Cache->new($session, 'dates:' . $identity, 'RSS');

        my $stamp = $store->get();
        unless ($stamp) {
            my $published;
            $published = $session->datetime->mailToEpoch($entry->{pubDate})
                if $entry->{pubDate};
            # handicap the undated
            $stamp = $published || $session->datetime->time() - (60 * 60 * 24 * 365);
            $store->set($stamp, '1 year');
        }
        $entry->{date} = $stamp;
    }

    @{$items} = sort { $b->{date} <=> $a->{date} } @{$items};
}
#-------------------------------------------------------------------
# $items is the hashref to put items into.
# $rss_feeds is an arrayref of all the feeds in this wobject
# The only difference between an "interleaved" feed and a grouped feed
# is the order the items are output.
# Build the item list in "grouped" mode.
#
# Takes the same arguments as _create_interleaved_items and starts from its
# result, then re-sorts the items by site_title and sets new_rss_site on
# each item whose site_title differs from that of the item before it.
sub _create_grouped_items {
    my ($items, $rss_feeds, $maxHeadlines, $hasTermsRegex) = @_;

    _create_interleaved_items($items, $rss_feeds, $maxHeadlines, $hasTermsRegex);
    @$items = sort { $a->{'site_title'} cmp $b->{'site_title'} } @$items;

    # Flag the first item of every run of items sharing a site title.
    my $previous_title;
    for my $entry (@$items) {
        $entry->{new_rss_site} = 1
            unless $previous_title eq $entry->{site_title};
        $previous_title = $entry->{site_title};
    }
}
#-------------------------------------------------------------------
# Loop through the feeds for this wobject
# and push in the items in "interleaved mode"
# No need to return because we're doing everything by reference.
# Merge the items of all feeds into one list, newest first.
#
# Arguments: $items         - array ref that receives the items
#            $rss_feeds     - array ref of feed hash refs (channel, items)
#            $maxHeadlines  - maximum number of items to keep
#            $hasTermsRegex - optional regex string; when true, only items
#                             accepted by _check_hasTerms are kept
#
# Everything is done by reference, so nothing useful is returned. Note that
# the items are shifted off each feed's own list, which is left empty.
sub _create_interleaved_items {
    my ($items, $rss_feeds, $maxHeadlines, $hasTermsRegex) = @_;

    for my $feed (@$rss_feeds) {
        ENTRY:
        while (my $entry = shift @{$feed->{items}}) {
            next ENTRY
                if $hasTermsRegex && !_check_hasTerms($entry, $hasTermsRegex);

            # Remember which channel the item came from.
            $entry->{site_title} = $feed->{channel}->{title};
            $entry->{site_link} = $feed->{channel}->{link};
            push @$items, $entry;
        }
    }

    @$items = sort { $b->{date} <=> $a->{date} } @$items;

    # limit to $maxHeadlines
    splice @$items, $maxHeadlines if @$items > $maxHeadlines;
}
#-------------------------------------------------------------------
# Uses the regex constructed in _get_items (with the terms defaulting to OR)
# to see if the title or description associated with this item match the kinds
# of items we're looking for.
#
# Return 1 when the item's title followed by its description matches the
# given regex string (case-insensitively), 0 otherwise.
sub _check_hasTerms {
    my ($item, $hasTermsRegex) = @_;
    my $haystack = $item->{title}.$item->{description};
    return ($haystack =~ /$hasTermsRegex/gism) ? 1 : 0;
}
#-------------------------------------------------------------------
# Turn a comma-separated list of search terms into a regex alternation.
#
# Argument: $terms - string such as "linux, windows development, blogs"
# Returns:  the terms joined with '|', or '' when there are no terms.
#
# Each term is trimmed and passed through quotemeta, so terms are matched
# literally ("c++" no longer acts as a quantifier, "(" cannot break the
# pattern) and the space after a comma is not treated as part of the term.
# Empty terms are dropped, because an empty alternative would match every
# item.
sub _make_regex {
    my ($terms) = @_;
    return '' unless defined $terms;

    my @alternatives;
    for my $term (split /,/, $terms) {
        $term =~ s/^\s+//;
        $term =~ s/\s+$//;
        next if $term eq '';
        push @alternatives, quotemeta($term);
    }

    return join('|', @alternatives);
}
#-------------------------------------------------------------------
# So- We're going to manage an "aggregate cache" that represents
# the rendering of the cumulative feeds in a Syndicated Wobject,
# but let each feed "fend for itself" based on URL in the cache.
#
# This means we can set up the hourly task to get and cache each
# individual feed WITHOUT having to re-request (undoubtedly the slowest
# part of every RSS parsing action is the network traffic) each feed
# when we re-render each aggregate representation.
#
# If, however, a feed expires between hourly tasks, it will be re-requested and
# parsed per the usual. BUT, if a feed ever goes un-requested for more than an hour,
# then its retrieval schedule will be taken over by the hourly task, and we'll
# be pre-seeding the RSS object cache automatically.
#
# Having the caching set up this way means we can re-use the same raw feed all over the site without
# having each wobject request it separately, ASSUMING the URL is the same.
#
# All the values that may have an effect on the composition of items
# are included in the cache key for the aggregate representation.
sub _get_items {
sub generateFeed {
my $self = shift;
my $urls = shift;
my $maxHeadlines = shift || $self->getValue('maxHeadlines');
my $displayMode=$self->getValue('displayMode');
my $hasTermsRegex=_make_regex($self->getValue('hasTerms'));
# Format of cache has changed several times
my $key=join(':', 'aggregate3', $displayMode,$hasTermsRegex,$maxHeadlines,$self->getRssUrl);
my $cache = WebGUI::Cache->new($self->session,$key, 'RSS');
my $cached = $cache->get;
my ($items, @rss_feeds);
if ($cached) {
$items = $cached->[0];
@rss_feeds = @{$cached->[1]};
} else {
$items = [];
for my $url (@{$urls}) {
my $rss_info=_get_rss_data($self->session,$url);
push(@rss_feeds, $rss_info) if(defined $rss_info);
}
# deal with the fact that we may never get valid data
if (scalar(@rss_feeds) < 1) {
return ({}, []);
my $feed = XML::FeedPP::Atom->new();
my $log = $self->session->log;
# build one feed out of many
foreach my $url (split("\n", $self->get('rssUrl'))) {
$log->info("Processing FEED: ".$url);
$url =~ s/^feed:/http:/;
if ($self->get('processMacroInRssUrl')) {
WebGUI::Macro::process($self->session, \$url);
}
#Sort feeds in order by channel title.
#@rss_feeds=sort{$a->{channel}->{title} cmp $b->{channel}->{title}} @rss_feeds;
if ($displayMode eq 'grouped') {
_create_grouped_items($items,\@rss_feeds,$maxHeadlines,$hasTermsRegex);
} else {
_create_interleaved_items($items,\@rss_feeds,$maxHeadlines,$hasTermsRegex);
my $cache = WebGUI::Cache->new($self->session, $url, "RSS");
my $value = $cache->setByHTTP($url, $self->get("cacheTimeout"));
eval { $feed->merge($value) };
if (my $e = WebGUI::Error->caught()) {
$log->error("Syndicated Content asset (".$self->getId.") has a bad feed URL (".$url."). Failed with ".$e->message);
}
#@{$items} = sort { $b->{date} <=> $a->{date} } @{$items};
$cache->set([$items, \@rss_feeds], 3600);
}
#So return the item loop and the first RSS feed, because
#when we're parsing a single feed we can use that feed's title and
#description for channel.title, channel.link, and channel.description
return ($items,\@rss_feeds);
# build a new feed that matches the term the user is interested in
if ($self->get('hasTerms') ne '') {
my @terms = split /,\s*/, $self->get('hasTerms'); # get the list of terms
my $termRegex = join("|", map quotemeta($_), @terms); # turn the terms into a regex string
my @items = $feed->match_item(title=>qr/$termRegex/msi, description=>qr/$termRegex/msi);
$feed->clear_item;
foreach my $item (@items) {
$feed->add_item($item);
}
}
# sort them by date
$feed->sort_item();
# limit the feed to the maximum number of headlines
$feed->limit_item($self->get('maxHeadlines'));
return $feed;
}
#-------------------------------------------------------------------
=head2 getTemplateVariables
Returns a hash reference of template variables.
=head3 feed
A reference to an XML::FeedPP object.
=cut
# Build the template variables for a feed.
#
# Arguments: $self - this asset (not used by the body)
#            $feed - object providing title, description, pubDate,
#                    copyright, link, image and get_item
# Returns:   hash ref with the channel_* variables and, when the feed has
#            items, an item_loop array ref with one hash ref per item.
#
# Every value goes through WebGUI::HTML::filter(..., 'javascript').
#
# Accessors are called in scalar context on purpose. In list context an
# accessor that returns an empty list would vanish from the argument list,
# and 'javascript' would be filtered as the content instead.
sub getTemplateVariables {
    my ($self, $feed) = @_;
    my @items = $feed->get_item;
    my %var;

    # Channel-level values.
    for my $field (qw(title description copyright link)) {
        $var{"channel_$field"} = WebGUI::HTML::filter(scalar($feed->$field), 'javascript');
    }
    $var{channel_date} = WebGUI::HTML::filter(scalar($feed->pubDate), 'javascript');

    # image() returns a list; the positions map to these variable suffixes.
    my @image = $feed->image;
    my @image_fields = qw(url title link description width height);
    for my $position (0 .. $#image_fields) {
        $var{'channel_image_' . $image_fields[$position]}
            = WebGUI::HTML::filter($image[$position], 'javascript');
    }

    foreach my $object (@items) {
        my %item;
        for my $field (qw(title category author guid link description)) {
            $item{$field} = WebGUI::HTML::filter(scalar($object->$field), 'javascript');
        }
        $item{date} = WebGUI::HTML::filter(scalar($object->pubDate), 'javascript');

        # Each shorter excerpt is cut from the next longer one, so the
        # largest cut is made first.
        $item{descriptionFirst100words} = $item{description};
        $item{descriptionFirst100words} =~ s/(((\S+)\s+){100}).*/$1/s;
        $item{descriptionFirst75words} = $item{descriptionFirst100words};
        $item{descriptionFirst75words} =~ s/(((\S+)\s+){75}).*/$1/s;
        $item{descriptionFirst50words} = $item{descriptionFirst75words};
        $item{descriptionFirst50words} =~ s/(((\S+)\s+){50}).*/$1/s;
        $item{descriptionFirst25words} = $item{descriptionFirst50words};
        $item{descriptionFirst25words} =~ s/(((\S+)\s+){25}).*/$1/s;
        $item{descriptionFirst10words} = $item{descriptionFirst25words};
        $item{descriptionFirst10words} =~ s/(((\S+)\s+){10}).*/$1/s;

        # A "paragraph" here is text up to and including a newline.
        $item{descriptionFirst2paragraphs} = $item{description};
        $item{descriptionFirst2paragraphs} =~ s/^((.*?\n){2}).*/$1/s;
        $item{descriptionFirstParagraph} = $item{descriptionFirst2paragraphs};
        $item{descriptionFirstParagraph} =~ s/^(.*?\n).*/$1/s;

        # A "sentence" here is text up to and including a full stop.
        $item{descriptionFirst4sentences} = $item{description};
        $item{descriptionFirst4sentences} =~ s/^((.*?\.){4}).*/$1/s;
        $item{descriptionFirst3sentences} = $item{descriptionFirst4sentences};
        $item{descriptionFirst3sentences} =~ s/^((.*?\.){3}).*/$1/s;
        $item{descriptionFirst2sentences} = $item{descriptionFirst3sentences};
        $item{descriptionFirst2sentences} =~ s/^((.*?\.){2}).*/$1/s;
        $item{descriptionFirstSentence} = $item{descriptionFirst2sentences};
        $item{descriptionFirstSentence} =~ s/^(.*?\.).*/$1/s;

        push @{$var{item_loop}}, \%item;
    }

    return \%var;
}
#-------------------------------------------------------------------
@ -593,11 +281,12 @@ sub prepareView {
my $template = WebGUI::Asset::Template->new($self->session, $self->get("templateId"));
$template->prepare($self->getMetaDataAsTemplateVariables);
$self->{_viewTemplate} = $template;
my $i18n = WebGUI::International->new($self->session,'Asset_SyndicatedContent');
my $rssFeedSuffix=$i18n->get('RSS Feed Title Suffix');
my $title = $self->get("title")." ".$rssFeedSuffix;
my $title = $self->get("title");
$title =~ s/\"/&quot;/g;
$self->session->style->setLink($self->getUrl("func=viewRSS20"), { rel=>'alternate', type=>'application/rss+xml', title=>$title });
my $style = $self->session->style;
$style->setLink($self->getUrl("func=viewRss"), { rel=>'alternate', type=>'application/rss+xml', title=>$title.' (RSS)' });
$style->setLink($self->getUrl("func=viewRdf"), { rel=>'alternate', type=>'application/rdf+xml', title=>$title.' (RDF)' });
$style->setLink($self->getUrl("func=viewAtom"), { rel=>'alternate', type=>'application/atom+xml', title=>$title.' (Atom)' });
}
@ -625,6 +314,22 @@ Returns the rendered output of the wobject.
sub view {
my $self = shift;
# try the cached version
my $cache = WebGUI::Cache->new($self->session,"view_".$self->getId);
my $out = $cache->get;
return $out if ($out ne "");
# generate from scratch
my $feed = $self->generateFeed;
$out = $self->processTemplate($self->getTemplateVariables($feed),undef,$self->{_viewTemplate});
if (!$self->session->var->isAdminOn && $self->get("cacheTimeout") > 10) {
$cache->set($out,$self->get("cacheTimeout"));
}
return $out;
my $rssFlavor = shift;
if ($rssFlavor eq "" && !$self->session->var->isAdminOn && $self->get("cacheTimeout") > 10) {
my $out = WebGUI::Cache->new($self->session,"view_".$self->getId)->get;
@ -688,48 +393,6 @@ sub view {
#-------------------------------------------------------------------
=head2 getRssUrl
Get the RSS URL and process macros if we're supposed to.
=cut
# Return the asset's rssUrl property, with macros expanded when the
# processMacroInRssUrl property is true.
sub getRssUrl {
    my ($self) = @_;
    my $feedUrls = $self->get("rssUrl");
    if ($self->get("processMacroInRssUrl")) {
        WebGUI::Macro::process($self->session,\$feedUrls);
    }
    return $feedUrls;
}
#-------------------------------------------------------------------
=head2 getContentLastModified ( )
Derive the last-modified date from the revisionDate of the object and from the dates of the RSS feeds.
=cut
# Return the newest of the asset's revisionDate and the last_modified
# values of its feeds.
#
# Buggo, is this too expensive? The feeds are fetched through _get_items
# on every call, just to read their last-modified dates.
sub getContentLastModified {
    my ($self) = @_;

    my $headlineLimit = $self->_getMaxHeadlines;
    my @feedUrls = $self->_getValidatedUrls;
    my ($unusedItems, $feeds) = $self->_get_items(\@feedUrls, $headlineLimit);

    my $newest = $self->get("revisionDate");
    foreach my $feed (@$feeds) {
        # Feeds without a recorded date cannot move the result.
        if (defined $feed->{last_modified}) {
            if ($feed->{last_modified} > $newest) {
                $newest = $feed->{last_modified};
            }
        }
    }
    return $newest;
}
#-------------------------------------------------------------------
=head2 www_view ( )
See WebGUI::Asset::Wobject::www_view() for details.
@ -742,61 +405,113 @@ sub www_view {
$self->SUPER::www_view(@_);
}
#-------------------------------------------------------------------
=head2 www_viewRSS090 ( )
=head2 www_viewAtom ( )
Emit an RSS 0.9 feed.
Emit an Atom 0.3 feed.
=cut
sub www_viewRSS090 {
my $self=shift;
return $self->view('0.9');
sub www_viewAtom {
my $self = shift;
my $feed = $self->generateFeed;
my $atom = XML::FeedPP::Atom->new;
$atom->merge($feed);
$self->session->http->setMimeType('application/atom+xml');
return $atom->to_string;
}
#-------------------------------------------------------------------
=head2 www_viewRSS091 ( )
=head2 www_viewRdf ( )
Emit an RSS 0.91 feed.
Emit an RSS 1.0 / RDF feed.
=cut
sub www_viewRSS091 {
my $self=shift;
return $self->view('0.91');
sub www_viewRdf {
my $self = shift;
my $feed = $self->generateFeed;
my $rdf = XML::FeedPP::RDF->new;
$rdf->merge($feed);
$self->session->http->setMimeType('application/rdf+xml');
return $rdf->to_string;
}
#-------------------------------------------------------------------
=head2 www_viewRSS10 ( )
Emit an RSS 1.0 feed.
=cut
# Render this asset through view() in the '1.0' RSS flavor.
sub www_viewRSS10 {
    my ($self) = @_;
    return $self->view('1.0');
}
#-------------------------------------------------------------------
=head2 www_viewRSS20 ( )
=head2 www_viewRss ( )
Emit an RSS 2.0 feed.
=cut
sub www_viewRSS20 {
my $self=shift;
return $self->view('2.0');
sub www_viewRss {
my $self = shift;
my $feed = $self->generateFeed;
my $rss = XML::FeedPP::RSS->new;
$rss->merge($feed);
$self->session->http->setMimeType('application/rss+xml');
return $rss->to_string;
}
#-------------------------------------------------------------------
=head2 www_viewRSS090 ( )
Deprecated. Use www_viewRss() instead.
=cut
# Deprecated entry point kept for old RSS 0.9 links; forwards to
# www_viewRss. (This sub was previously a copy of www_viewRSS10 that
# called www_viewRdf, so www_viewRSS090 did not exist at all.)
sub www_viewRSS090 {
    my $self = shift;
    return $self->www_viewRss;
}
#-------------------------------------------------------------------
=head2 www_viewRSS091 ( )
Deprecated. Use www_viewRss() instead.
=cut
# Deprecated entry point kept for old RSS 0.91 links; forwards to
# www_viewRss. (This sub was previously a copy of www_viewRSS10 that
# called www_viewRdf, so www_viewRSS091 did not exist at all.)
sub www_viewRSS091 {
    my $self = shift;
    return $self->www_viewRss;
}
#-------------------------------------------------------------------
=head2 www_viewRSS10 ( )
Deprecated. Use www_viewRdf() instead.
=cut
# Deprecated entry point kept for old RSS 1.0 links; forwards to
# www_viewRdf.
sub www_viewRSS10 {
    my ($self) = @_;
    return $self->www_viewRdf;
}
#-------------------------------------------------------------------
=head2 www_viewRSS20 ( )
Deprecated. Use www_viewRss() instead.
=cut
# Deprecated entry point kept for old RSS 2.0 links; forwards to
# www_viewRss. (This sub was previously a copy of www_viewRSS10 that
# called www_viewRdf, so www_viewRSS20 did not exist at all.)
sub www_viewRSS20 {
    my $self = shift;
    return $self->www_viewRss;
}
1;

View file

@ -17,24 +17,27 @@ our $HELP = {
},
],
variables => [
{ 'name' => 'channel.title' },
{ 'name' => 'channel.description' },
{ 'name' => 'channel.link' },
{ 'name' => 'rss.url',
'variables' => [
{ 'name' => 'rss.url.0.9' },
{ 'name' => 'rss.url.0.91' },
{ 'name' => 'rss.url.1.0' },
{ 'name' => 'rss.url.2.0' }
]
},
{ 'name' => 'channel_title' },
{ 'name' => 'channel_description' },
{ 'name' => 'channel_link' },
{ 'name' => 'channel_date' },
{ 'name' => 'channel_copyright' },
{ 'name' => 'channel_image_url' },
{ 'name' => 'channel_image_title' },
{ 'name' => 'channel_image_link' },
{ 'name' => 'channel_image_description' },
{ 'name' => 'channel_image_width' },
{ 'name' => 'channel_image_height' },
{ 'name' => 'rss_url' },
{ 'name' => 'rdf_url' },
{ 'name' => 'atom_url' },
{ 'name' => 'item_loop',
'variables' => [
{ 'name' => 'site_title' },
{ 'name' => 'site_link' },
{ 'name' => 'new_rss_site' },
{ 'name' => 'title' },
{ 'name' => 'link' },
{ 'name' => 'category' },
{ 'name' => 'author' },
{ 'name' => 'guid' },
{ 'name' => 'description' },
{ 'name' => 'descriptionFirst100words' },
{ 'name' => 'descriptionFirst75words' },
@ -73,7 +76,6 @@ our $HELP = {
{ 'name' => 'rssUrl' },
{ 'name' => 'processMacrosInRssUrl' },
{ 'name' => 'maxHeadlines' },
{ 'name' => 'displayMode' },
{ 'name' => 'hasTerms' },
],
related => [],

View file

@ -76,65 +76,42 @@ sub execute {
$self->session->errorHandler->error("Could not instanciate Workflow Instance in GetSyndicatedContent Activity");
return $self->ERROR;
}
my $log = $self->session->log;
# start time to check for timeouts
my $time = time();
my $ttl = $self->getTTL;
my @syndicatedUrls = @{$self->getSyndicatedUrls($instance)};
while (my $url = shift(@syndicatedUrls)) {
my $assets = JSON->new->decode($instance->getScratch("syndicatedassets") || '[]');
if (scalar @$assets < 1) {
$assets = $self->session->db->buildArrayRef("select assetId from asset where className like 'WebGUI::Asset::Wobject::SyndicatedContent'");
}
while (my $id = shift(@{$assets})) {
# Get RSS data, which will be stored in the cache
$self->session->errorHandler->info("GetSyndicatedContent workflow: Caching $url");
my $returnValue = WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data($self->session, $url);
if (!defined $returnValue) {
$self->session->errorHandler->warn("GetSyndicatedContent Workflow Activity: _get_rss_data returned undef while trying to process syndicated content url $url, which usually indicates an improper URL, or a malformed document");
next;
}
$log->info("GetSyndicatedContent: Caching for $id");
my $asset = WebGUI::Asset::Wobject::SyndicatedContent->new($self->session, $id);
if (defined $asset) {
my $feed = $asset->generateFeed;
unless ($feed->isa('XML::FeedPP')) {
$log->error("GetSyndicatedContent: Syndicated Content Asset $id returned an invalid feed");
}
}
else {
$log->error("GetSyndicatedContent: Couldn't instanciate $id")
}
# Check for timeout
last
if (time() - $time > $ttl);
last if (time() - $time > $ttl);
}
# if there are urls left, we need to process again
if (scalar(@syndicatedUrls) > 0) {
$instance->setScratch("syndicatedUrls", JSON::encode_json(\@syndicatedUrls));
if (scalar(@$assets) > 0) {
$instance->setScratch("syndicatedassets", JSON->new->encode($assets));
return $self->WAITING;
}
$instance->deleteScratch("syndicatedUrls");
$instance->deleteScratch("syndicatedassets");
return $self->COMPLETE;
}
#---------------------------------------------------------------------
=head2 getSyndicatedUrls ( instance )

Returns an array reference of the feed URLs still to be processed. The list
is read from the "syndicatedUrls" scratch value of the instance when one is
stored there. Otherwise it is collected from every Syndicated Content asset
below the root asset and stored in that scratch value before it is returned.

=head3 instance

The object whose scratch space holds the pending URL list.

=cut

sub getSyndicatedUrls {
    my ($self, $instance) = @_;

    # A stored list means an earlier run did not finish; continue with it.
    my $pending = $instance->getScratch("syndicatedUrls");
    return JSON::decode_json($pending) if $pending;

    my $feedAssets = WebGUI::Asset->getRoot($self->session)->getLineage(['descendants'], {
        includeOnlyClasses => ['WebGUI::Asset::Wobject::SyndicatedContent'],
        returnObjects => 1,
    });

    # One asset can list several whitespace-separated URLs.
    my $urls = [];
    push @$urls, split(/\s+/, $_->getRssUrl) for @$feedAssets;

    $instance->setScratch("syndicatedUrls", JSON::encode_json($urls));
    return $urls;
}
1;

View file

@ -46,44 +46,89 @@ our $I18N = {
message => q|Edit Syndicated Content|
},
'channel.title' => {
message => q|The title of this piece of syndicated content. This will be the same as the title of the Syndicated Content object when you're creating an aggregate feed.|,
lastUpdated => 1149567508,
'channel_title' => {
message => q|The title of this piece of syndicated content. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'channel.description' => {
message => q|A description of the content available through this channel. This will be the same as the description of the Syndicated Content object when you're creating an aggregate feed.|,
lastUpdated => 1149567508,
'channel_description' => {
message => q|A description of the content available through this channel. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'channel.link' => {
message => q|A URL back to the originating site of this channel. This variable *will not* exist when you're creating an aggregate feed, because there's no single channel to link to.|,
lastUpdated => 1149567508,
'channel_link' => {
message => q|A URL back to the originating site of this channel. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'rss.url' => {
message => q|This is the URL to use to get the contents of this Syndicated Content wobject as an RSS 2.0 feed. Additionally, you can specify RSS versions via the following template variables:|,
lastUpdated => 1149567508,
'channel_date' => {
message => q|The date this channel was updated. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'rss.url.0.9' => {
message => q|The contents of this wobject as an RSS 0.9 feed.|,
lastUpdated => 1149567508,
'channel_copyright' => {
message => q|Copyright holder information. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'rss.url.0.91' => {
message => q|The contents of this wobject as an RSS 0.91 feed.|,
lastUpdated => 1149567508,
'channel_image_url' => {
message => q|The URL of the image attached to this feed. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'rss.url.1.0' => {
message => q|The contents of this wobject as an RSS 1.0 feed.|,
lastUpdated => 1149567508,
'channel_image_title' => {
message => q|The title of the image attached to this feed. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'rss.url.2.0' => {
message => q|The contents of this wobject as an RSS 2.0 feed.|,
lastUpdated => 1149567508,
'channel_image_description' => {
message => q|The description of the image attached to this feed. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'channel_image_link' => {
message => q|The URL of the link that should wrap this feed's image. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'channel_image_width' => {
message => q|The width in pixels of this feed's image. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'channel_image_height' => {
message => q|The height in pixels of this feed's image. This variable will be populated by the first feed in a multi-feed list.|,
lastUpdated => 0,
},
'rss_url' => {
message => q|This is the URL to use to get the contents of this Syndicated Content asset as an RSS 2.0 feed. Additionally, you can specify RSS versions via the following template variables:|,
lastUpdated => 0,
},
'rdf_url' => {
message => q|The contents of this asset as an RDF/RSS 1.0 feed.|,
lastUpdated => 0,
},
'atom_url' => {
message => q|The contents of this asset as an Atom 0.3 feed.|,
lastUpdated => 0,
},
'category' => {
message => q|A category this item belongs to.|,
lastUpdated => 0,
},
'author' => {
message => q|The publisher of this item.|,
lastUpdated => 0,
},
'guid' => {
message => q|A unique id for this item.|,
lastUpdated => 0,
},
'item_loop' => {
@ -91,21 +136,6 @@ our $I18N = {
lastUpdated => 1149567508,
},
'site_title' => {
message => q|The title of the RSS feed this item comes from|,
lastUpdated => 1149567508,
},
'site_link' => {
message => q|Link to the source RSS feed.|,
lastUpdated => 1149567508,
},
'new_rss_site' => {
message => q|A "boolean" variable (suitable for using in a &lt;tmpl_if&gt; tag) that indicates we've started outputting items from a source RSS feed different than the previous item. This is most useful when you're viewing feeds in "grouped" mode- it gives you a hook to output <b>site_title</b> and <b>site_link</b> at the right time.|,
lastUpdated => 1149567508,
},
'title' => {
message => q|The title of a piece of content. If you're filtering on terms, this field will be inspected.|,
lastUpdated => 1149567508,
@ -126,51 +156,21 @@ our $I18N = {
message => q|Syndicated Content Template|
},
'displayModeLabel' => {
lastUpdated => 1047855526,
message => q|Display Mode|
},
'displayModeSubtext' => {
lastUpdated => 1047855526,
message => q|<p>"Interleaved" means items from all feeds are lumped together, "Grouped by Feed" means items are grouped by the feed they came from. Either setting is fine if you're only bringing in a single feed.</p>|
},
'grouped' => {
lastUpdated => 1047855526,
message => q|Grouped by Feed|
},
'hasTermsLabel' => {
lastUpdated => 1047855526,
message => q|With any of these terms|
},
'interleaved' => {
lastUpdated => 1047855526,
message => q|Interleaved|
},
'rssTabName' => {
lastUpdated => 1118417024,
message => q|RSS|
},
'RSS Feed Title Suffix' => {
lastUpdated => 1118417024,
message => q|RSS 2.0 Feed|
},
'72 description' => {
message => q|Select a template for this content.|,
lastUpdated => 1119977659,
},
'displayModeLabel description' => {
message => q|<p>If you're aggregating feeds, you can change the mode in which the items are displayed. "Grouped by Feed" means the items will be grouped together by the feeds they come from. "Interleaved" means the items will be mixed together in a "round-robin" fashion from all the feeds. If you're grouping your feeds, please look at <b>new_rss_site</b> "item_loop" template variables, it gives you a hook allowing you to output the feed title</p>|,
lastUpdated => 1146799950,
},
'hasTermsLabel description' => {
message => q|<p>Enter terms (separated by commas) that you'd like to filter the feeds on. For instance, if you enter:</p>
<div class="helpIndent"><b>linux, windows development, blogs</b></div>
@ -192,10 +192,10 @@ our $I18N = {
<li><a href="http://w.moreover.com/">http://w.moreover.com/</a></li>
</ul>
</div>
<p>Currently, WebGUI can handle RSS versions .90, .91, 1.0, and 2.0. Atom feeds aren't supported for now. Probably other RSS-ish files would work too.
<p>Currently, WebGUI can handle RSS versions .90, .91, 1.0, and 2.0; Atom .3 and 1.0. Probably other RSS-ish files would work too.
</p>
<p>To create an aggregate RSS feed (one that pulls information from multiple RSS feeds), include a list of URLs, one on each line, instead of a single URL. Items will be sorted by the date WebGUI first received the story.</p>|,
lastUpdated => 1168228049,
lastUpdated => 1225928949,
},
'3 description' => {

File diff suppressed because it is too large Load diff