Migrated the Get Syndicated Content hourly script to a workflow activity

This commit is contained in:
JT Smith 2006-03-02 21:48:48 +00:00
parent 2139b9e5d4
commit 1a5bd9e647
5 changed files with 112 additions and 47 deletions

View file

@ -130,7 +130,8 @@ sub addWorkflow {
"WebGUI::Workflow::Activity::CleanFileCache", "WebGUI::Workflow::Activity::CleanLoginHistory", "WebGUI::Workflow::Activity::ArchiveOldThreads",
"WebGUI::Workflow::Activity::TrashExpiredEvents", "WebGUI::Workflow::Activity::CreateCronJob", "WebGUI::Workflow::Activity::DeleteExpiredSessions",
"WebGUI::Workflow::Activity::DeleteExpiredGroupings", "WebGUI::Workflow::Activity::PurgeOldAssetRevisions",
"WebGUI::Workflow::Activity::ExpireSubscriptionCodes", "WebGUI::Workflow::Activity::PurgeOldTrash"],
"WebGUI::Workflow::Activity::ExpireSubscriptionCodes", "WebGUI::Workflow::Activity::PurgeOldTrash",
"WebGUI::Workflow::Activity::GetSyndicatedContent"],
"WebGUI::User"=>["WebGUI::Workflow::Activity::CreateCronJob"],
"WebGUI::VersionTag"=>["WebGUI::Workflow::Activity::CommitVersionTag", "WebGUI::Workflow::Activity::RollbackVersionTag",
"WebGUI::Workflow::Activity::TrashVersionTag", "WebGUI::Workflow::Activity::CreateCronJob"]
@ -203,6 +204,8 @@ sub addWorkflow {
}, "pbworkflow000000000004");
$activity = $workflow->addActivity("WebGUI::Workflow::Activity::DeleteExpiredSessions", "pbwfactivity0000000009");
$activity->set("title", "delete expired sessions");
$activity = $workflow->addActivity("WebGUI::Workflow::Activity::GetSyndicatedContent", "pbwfactivity0000000012");
$activity->set("title", "Get syndicated content");
WebGUI::Workflow::Cron->create($session, {
title=>'Hourly Maintenance',
enabled=>1,

View file

@ -231,10 +231,10 @@ sub _normalize_items {
#-------------------------------------------------------------------
sub _get_rss_data {
my $self = shift;
my $session = shift;
my $url = shift;
my $cache = WebGUI::Cache->new($self->session,'url:' . $url, 'RSS');
my $cache = WebGUI::Cache->new($session,'url:' . $url, 'RSS');
my $rss_serial = $cache->get;
my $rss = {};
if ($rss_serial) {
@ -243,7 +243,7 @@ sub _get_rss_data {
my $ua = LWP::UserAgent->new(timeout => 5);
my $response = $ua->get($url);
if (!$response->is_success()) {
$self->session->errorHandler->warn("Error retrieving url '$url': " .
$session->errorHandler->warn("Error retrieving url '$url': " .
$response->status_line());
return undef;
}
@ -256,7 +256,7 @@ sub _get_rss_data {
my $encoding = 'utf8';
if (lc($xmlEncoding) ne lc($encoding)) {
eval { from_to($xml, $xmlEncoding, $encoding) };
$self->session->errorHandler->warn($@) if ($@);
$session->errorHandler->warn($@) if ($@);
}
}
@ -266,7 +266,7 @@ sub _get_rss_data {
XML::RSSLite::parseXML($rss_lite, \$xml);
};
if ($@) {
$self->session->errorHandler->warn("error parsing rss for url $url :".$@);
$session->errorHandler->warn("error parsing rss for url $url :".$@);
#Returning undef on a parse failure is a change from previous behaviour,
#but it SHOULDN'T have a major effect.
return undef;
@ -281,10 +281,10 @@ sub _get_rss_data {
$rss_lite = {channel => $rss_lite};
if (!($rss->{channel} =
_find_record($rss_lite, qr/^channel$/))) {
$self->session->errorHandler->warn("unable to find channel info for url $url");
$session->errorHandler->warn("unable to find channel info for url $url");
}
if (!($rss->{items} = _find_record($rss_lite, qr/^items?$/))) {
$self->session->errorHandler->warn("unable to find item info for url $url");
$session->errorHandler->warn("unable to find item info for url $url");
$rss->{items} = [];
}
@ -296,7 +296,7 @@ sub _get_rss_data {
#Assign dates "globally" rather than when seen in a viewed feed.
#This is important because we can "filter" now and want to ensure we keep order
#correctly as new items appear.
$self->_assign_rss_dates($rss->{items});
_assign_rss_dates($session, $rss->{items});
#Default to an hour timeout
$cache->set(Storable::freeze($rss), 3600);
@ -311,17 +311,17 @@ sub _get_rss_data {
# whole database to keep the thing from growing too large
sub _assign_rss_dates {
my $self = shift;
my $session = shift;
my ($items) = @_;
for my $item (@{$items}) {
my $key = 'dates:' . ($item->{guid} || $item->{title} ||
$item->{description} || $item->{link});
my $cache = WebGUI::Cache->new($self->session,$key, 'RSS');
my $cache = WebGUI::Cache->new($session,$key, 'RSS');
if (my $date = $cache->get()) {
$item->{date} = $date;
} else {
$item->{date} =$self->session->datetime->time();
$item->{date} =$session->datetime->time();
$cache->set($item->{date}, '1 year');
}
}
@ -444,7 +444,7 @@ sub _get_items {
$items = [];
for my $url (@{$urls}) {
my $rss_info=$self->_get_rss_data($url);
my $rss_info=_get_rss_data($self->session,$url);
push(@rss_feeds, $rss_info) if($rss_info);
}

View file

@ -0,0 +1,90 @@
package WebGUI::Workflow::Activity::GetSyndicatedContent;

=head1 LEGAL

 -------------------------------------------------------------------
  WebGUI is Copyright 2001-2006 Plain Black Corporation.
 -------------------------------------------------------------------
  Please read the legal notices (docs/legal.txt) and the license
  (docs/license.txt) that came with this distribution before using
  this software.
 -------------------------------------------------------------------
  http://www.plainblack.com                     info@plainblack.com
 -------------------------------------------------------------------

=cut

use strict;
use base 'WebGUI::Workflow::Activity';
use WebGUI::Asset::Wobject::SyndicatedContent;

=head1 NAME

Package WebGUI::Workflow::Activity::GetSyndicatedContent

=head1 DESCRIPTION

Prefetches syndicated content URLs so that the pages can be served up more quickly.

=head1 SYNOPSIS

See WebGUI::Workflow::Activity for details on how to use any activity.

=head1 METHODS

These methods are available from this class:

=cut

#-------------------------------------------------------------------

=head2 definition ( session, definition )

See WebGUI::Workflow::Activity::definition() for details.

=cut

sub definition {
	my $class      = shift;
	my $session    = shift;
	my $definition = shift;
	# NOTE(review): WebGUI::International is not use'd in this file; it is
	# presumably loaded by the WebGUI runtime before activities are defined —
	# confirm before running this activity standalone.
	my $i18n = WebGUI::International->new($session, "Asset_SyndicatedContent");
	push(@{$definition}, {
		name       => $i18n->get("get syndicated content"),
		properties => { }
		});
	return $class->SUPER::definition($session, $definition);
}

#-------------------------------------------------------------------

=head2 execute ( )

Fetches the RSS data for every feed URL referenced by a published
SyndicatedContent asset so the results are warm in the RSS cache.
See WebGUI::Workflow::Activity::execute() for details.

=cut

sub execute {
	my $self = shift;
	# In the new Wobject, "rssUrl" actually can refer to more than one
	# whitespace-separated URL per row.
	my @syndicatedWobjectURLs = $self->session->db->buildArray("select distinct SyndicatedContent.rssUrl from SyndicatedContent left join asset on SyndicatedContent.assetId=asset.assetId where asset.state='published'");
	foreach my $urlField (@syndicatedWobjectURLs) {
		# Awk-style split ' ' discards leading whitespace, so a field that
		# begins with a space no longer produces a bogus empty-string URL
		# the way split(/\s+/, ...) would.
		foreach my $url (split ' ', $urlField) {
			next unless length $url;	# defensive: never fetch an empty URL
			WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data($self->session, $url);
		}
	}
	return 1;
}

1;

View file

@ -1,6 +1,12 @@
package WebGUI::i18n::English::Asset_SyndicatedContent;
our $I18N = {
'get syndicated content' => {
'lastUpdated' => 0,
'message' => 'Get Syndicated Content',
context => ' the title of the get syndicated content workflow activity'
},
'1' => {
'lastUpdated' => 1031514049,
'message' => 'URL to RSS File'

View file

@ -1,34 +0,0 @@
# Legacy pre-workflow hourly job.  Per this commit's message it is superseded
# by WebGUI::Workflow::Activity::GetSyndicatedContent and removed here.
package Hourly::GetSyndicatedContent;
use strict;
use warnings;
use WebGUI::SQL;
use WebGUI::Asset::Wobject::SyndicatedContent;
=head2 Hourly::GetSyndicatedContent
Loops through all the URLs in the SyndicatedWobjects and puts them into WebGUI::Cache if they haven't been spidered or if they have expired from the cache. This should reduce HTTP traffic a little, and allow for more granular scheduling of feed downloads in the future.
=cut
#-------------------------------------------------------------------
# process ( )
# Warms the RSS cache for every feed URL referenced by a published
# SyndicatedContent asset.  No meaningful return value.
sub process{
#In the new Wobject, "rssURL" actually can refer to more than one URL.
my @syndicatedWobjectURLs = WebGUI::SQL->buildArray("select distinct SyndicatedContent.rssUrl from SyndicatedContent left join asset on SyndicatedContent.assetId=asset.assetId where asset.state='published'");
foreach my $url(@syndicatedWobjectURLs) {
#Loop through the SyndicatedWobjects and split all the URLs they are syndicating off into
#a separate array.
my @urlsToSyndicate = split(/\s+/,$url);
foreach ((@urlsToSyndicate)) {
# NOTE(review): _get_rss_data is passed the URL as its only argument, while
# the post-migration signature shown elsewhere in this commit takes
# ($session, $url).  Confirm against the pre-migration SyndicatedContent.pm
# before reusing this legacy code — the URL may land in the wrong position.
WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data($_);
}
}
}
1;