migrated the get syndicated content hourly script to workflow
parent 2139b9e5d4
commit 1a5bd9e647
5 changed files with 112 additions and 47 deletions
@@ -130,7 +130,8 @@ sub addWorkflow {
 		"WebGUI::Workflow::Activity::CleanFileCache", "WebGUI::Workflow::Activity::CleanLoginHistory", "WebGUI::Workflow::Activity::ArchiveOldThreads",
 		"WebGUI::Workflow::Activity::TrashExpiredEvents", "WebGUI::Workflow::Activity::CreateCronJob", "WebGUI::Workflow::Activity::DeleteExpiredSessions",
 		"WebGUI::Workflow::Activity::DeleteExpiredGroupings", "WebGUI::Workflow::Activity::PurgeOldAssetRevisions",
-		"WebGUI::Workflow::Activity::ExpireSubscriptionCodes", "WebGUI::Workflow::Activity::PurgeOldTrash"],
+		"WebGUI::Workflow::Activity::ExpireSubscriptionCodes", "WebGUI::Workflow::Activity::PurgeOldTrash",
+		"WebGUI::Workflow::Activity::GetSyndicatedContent"],
 	"WebGUI::User"=>["WebGUI::Workflow::Activity::CreateCronJob"],
 	"WebGUI::VersionTag"=>["WebGUI::Workflow::Activity::CommitVersionTag", "WebGUI::Workflow::Activity::RollbackVersionTag",
 		"WebGUI::Workflow::Activity::TrashVersionTag", "WebGUI::Workflow::Activity::CreateCronJob"]
@@ -203,6 +204,8 @@ sub addWorkflow {
 	}, "pbworkflow000000000004");
 	$activity = $workflow->addActivity("WebGUI::Workflow::Activity::DeleteExpiredSessions", "pbwfactivity0000000009");
 	$activity->set("title", "delete expired sessions");
+	$activity = $workflow->addActivity("WebGUI::Workflow::Activity::GetSyndicatedContent", "pbwfactivity0000000012");
+	$activity->set("title", "Get syndicated content");
 	WebGUI::Workflow::Cron->create($session, {
 		title=>'Hourly Maintenance',
 		enabled=>1,
@@ -231,10 +231,10 @@ sub _normalize_items {
 
 #-------------------------------------------------------------------
 sub _get_rss_data {
-	my $self = shift;
+	my $session = shift;
 	my $url = shift;
 
-	my $cache = WebGUI::Cache->new($self->session,'url:' . $url, 'RSS');
+	my $cache = WebGUI::Cache->new($session,'url:' . $url, 'RSS');
 	my $rss_serial = $cache->get;
 	my $rss = {};
 	if ($rss_serial) {
@@ -243,7 +243,7 @@ sub _get_rss_data {
 	my $ua = LWP::UserAgent->new(timeout => 5);
 	my $response = $ua->get($url);
 	if (!$response->is_success()) {
-		$self->session->errorHandler->warn("Error retrieving url '$url': " .
+		$session->errorHandler->warn("Error retrieving url '$url': " .
			$response->status_line());
 		return undef;
 	}
@@ -256,7 +256,7 @@ sub _get_rss_data {
 	my $encoding = 'utf8';
 	if (lc($xmlEncoding) ne lc($encoding)) {
 		eval { from_to($xml, $xmlEncoding, $encoding) };
-		$self->session->errorHandler->warn($@) if ($@);
+		$session->errorHandler->warn($@) if ($@);
 	}
 
 }
@@ -266,7 +266,7 @@ sub _get_rss_data {
 		XML::RSSLite::parseXML($rss_lite, \$xml);
 	};
 	if ($@) {
-		$self->session->errorHandler->warn("error parsing rss for url $url :".$@);
+		$session->errorHandler->warn("error parsing rss for url $url :".$@);
 		#Returning undef on a parse failure is a change from previous behaviour,
 		#but it SHOULDN'T have a major effect.
 		return undef;
@@ -281,10 +281,10 @@ sub _get_rss_data {
 	$rss_lite = {channel => $rss_lite};
 	if (!($rss->{channel} =
	      _find_record($rss_lite, qr/^channel$/))) {
-		$self->session->errorHandler->warn("unable to find channel info for url $url");
+		$session->errorHandler->warn("unable to find channel info for url $url");
 	}
 	if (!($rss->{items} = _find_record($rss_lite, qr/^items?$/))) {
-		$self->session->errorHandler->warn("unable to find item info for url $url");
+		$session->errorHandler->warn("unable to find item info for url $url");
 		$rss->{items} = [];
 	}
 
@@ -296,7 +296,7 @@ sub _get_rss_data {
 	#Assign dates "globally" rather than when seen in a viewed feed.
 	#This is important because we can "filter" now and want to ensure we keep order
 	#correctly as new items appear.
-	$self->_assign_rss_dates($rss->{items});
+	_assign_rss_dates($session, $rss->{items});
 
 	#Default to an hour timeout
 	$cache->set(Storable::freeze($rss), 3600);
@@ -311,17 +311,17 @@ sub _get_rss_data {
 # whole database to keep the thing from growing too large
 
 sub _assign_rss_dates {
-	my $self = shift;
+	my $session = shift;
 	my ($items) = @_;
 
 	for my $item (@{$items}) {
 		my $key = 'dates:' . ($item->{guid} || $item->{title} ||
			      $item->{description} || $item->{link});
-		my $cache = WebGUI::Cache->new($self->session,$key, 'RSS');
+		my $cache = WebGUI::Cache->new($session,$key, 'RSS');
 		if (my $date = $cache->get()) {
 			$item->{date} = $date;
 		} else {
-			$item->{date} =$self->session->datetime->time();
+			$item->{date} =$session->datetime->time();
 			$cache->set($item->{date}, '1 year');
 		}
 	}
@@ -444,7 +444,7 @@ sub _get_items {
 	$items = [];
 
 	for my $url (@{$urls}) {
-		my $rss_info=$self->_get_rss_data($url);
+		my $rss_info=_get_rss_data($self->session,$url);
 		push(@rss_feeds, $rss_info) if($rss_info);
 	}
 
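The hunks above turn _get_rss_data and _assign_rss_dates from instance methods into package functions that take an explicit $session, so the new workflow activity can call them without instantiating a SyndicatedContent wobject. A minimal sketch of the new calling convention (the feed URL is a placeholder, not from the commit):

	# Sketch only: fetch-and-cache a feed from outside the wobject,
	# exactly as the new workflow activity does. Assumes a live
	# WebGUI $session; the URL is hypothetical.
	my $rss = WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data(
		$session,
		'http://example.com/feed.rss',
	);
	# $rss is a hashref with {channel} and {items} keys, or undef
	# if the fetch or parse failed.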
lib/WebGUI/Workflow/Activity/GetSyndicatedContent.pm (new file, 90 lines)
@@ -0,0 +1,90 @@
+package WebGUI::Workflow::Activity::GetSyndicatedContent;
+
+
+=head1 LEGAL
+
+-------------------------------------------------------------------
+WebGUI is Copyright 2001-2006 Plain Black Corporation.
+-------------------------------------------------------------------
+Please read the legal notices (docs/legal.txt) and the license
+(docs/license.txt) that came with this distribution before using
+this software.
+-------------------------------------------------------------------
+http://www.plainblack.com info@plainblack.com
+-------------------------------------------------------------------
+
+=cut
+
+use strict;
+use base 'WebGUI::Workflow::Activity';
+use WebGUI::Asset::Wobject::SyndicatedContent;
+
+=head1 NAME
+
+Package WebGUI::Workflow::Activity::GetSyndicatedContent;
+
+=head1 DESCRIPTION
+
+Prefetches syndicated content URLs so that the pages can be served up more quickly.
+
+=head1 SYNOPSIS
+
+See WebGUI::Workflow::Activity for details on how to use any activity.
+
+=head1 METHODS
+
+These methods are available from this class:
+
+=cut
+
+
+#-------------------------------------------------------------------
+
+=head2 definition ( session, definition )
+
+See WebGUI::Workflow::Activity::definition() for details.
+
+=cut
+
+sub definition {
+	my $class = shift;
+	my $session = shift;
+	my $definition = shift;
+	my $i18n = WebGUI::International->new($session, "Asset_SyndicatedContent");
+	push(@{$definition}, {
+		name=>$i18n->get("get syndicated content"),
+		properties=> { }
+	});
+	return $class->SUPER::definition($session,$definition);
+}
+
+
+#-------------------------------------------------------------------
+
+=head2 execute ( )
+
+See WebGUI::Workflow::Activity::execute() for details.
+
+=cut
+
+sub execute {
+	my $self = shift;
+	#In the new Wobject, "rssURL" actually can refer to more than one URL.
+	my @syndicatedWobjectURLs = $self->session->db->buildArray("select distinct SyndicatedContent.rssUrl from SyndicatedContent left join asset on SyndicatedContent.assetId=asset.assetId where asset.state='published'");
+	foreach my $url(@syndicatedWobjectURLs) {
+		#Loop through the SyndicatedWobjects and split all the URLs they are syndicating off into
+		#a separate array.
+		my @urlsToSyndicate = split(/\s+/,$url);
+		foreach ((@urlsToSyndicate)) {
+			WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data($self->session,$_);
+		}
+	}
+	return 1;
+}
+
+
+
+
+1;
+
+
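For reference, a minimal sketch of how this activity gets attached to the existing hourly maintenance workflow by the upgrade hunk near the top of this commit. The ids and the addActivity/set calls come from the hunk itself; loading the workflow with WebGUI::Workflow->new is an assumption about the API:

	# Sketch only, mirroring the upgrade script: load the hourly
	# maintenance workflow by the id shown in the hunk and attach
	# the new activity to it.
	my $workflow = WebGUI::Workflow->new($session, "pbworkflow000000000004");
	my $activity = $workflow->addActivity(
		"WebGUI::Workflow::Activity::GetSyndicatedContent",
		"pbwfactivity0000000012",
	);
	$activity->set("title", "Get syndicated content");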
@@ -1,6 +1,12 @@
 package WebGUI::i18n::English::Asset_SyndicatedContent;
 
 our $I18N = {
+	'get syndicated content' => {
+		'lastUpdated' => 0,
+		'message' => 'Get Syndicated Content',
+		context => ' the title of the get syndicated content workflow activity'
+	},
+
 	'1' => {
 		'lastUpdated' => 1031514049,
 		'message' => 'URL to RSS File'
@@ -1,34 +0,0 @@
-package Hourly::GetSyndicatedContent;
-
-use strict;
-use warnings;
-use WebGUI::SQL;
-use WebGUI::Asset::Wobject::SyndicatedContent;
-
-=head2 Hourly::GetSyndicatedContent
-
-Loops through all the URLs in the SyndicatedWobjects and puts them into WebGUI::Cache if they haven't been spidered or if they have expired from the cache. This should reduce HTTP traffic a little, and allow for more granular scheduling of feed downloads in the future.
-
-=cut
-
-
-#-------------------------------------------------------------------
-sub process{
-
-	#In the new Wobject, "rssURL" actually can refer to more than one URL.
-	my @syndicatedWobjectURLs = WebGUI::SQL->buildArray("select distinct SyndicatedContent.rssUrl from SyndicatedContent left join asset on SyndicatedContent.assetId=asset.assetId where asset.state='published'");
-	foreach my $url(@syndicatedWobjectURLs) {
-
-		#Loop through the SyndicatedWobjects and split all the URLs they are syndicating off into
-		#a separate array.
-
-		my @urlsToSyndicate = split(/\s+/,$url);
-		foreach ((@urlsToSyndicate)) {
-			WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data($_);
-		}
-	}
-}
-
-
-
-1;