migrated the get syndicated content hourly script to workflow
This commit is contained in:
parent
2139b9e5d4
commit
1a5bd9e647
5 changed files with 112 additions and 47 deletions
|
|
@ -130,7 +130,8 @@ sub addWorkflow {
|
||||||
"WebGUI::Workflow::Activity::CleanFileCache", "WebGUI::Workflow::Activity::CleanLoginHistory", "WebGUI::Workflow::Activity::ArchiveOldThreads",
|
"WebGUI::Workflow::Activity::CleanFileCache", "WebGUI::Workflow::Activity::CleanLoginHistory", "WebGUI::Workflow::Activity::ArchiveOldThreads",
|
||||||
"WebGUI::Workflow::Activity::TrashExpiredEvents", "WebGUI::Workflow::Activity::CreateCronJob", "WebGUI::Workflow::Activity::DeleteExpiredSessions",
|
"WebGUI::Workflow::Activity::TrashExpiredEvents", "WebGUI::Workflow::Activity::CreateCronJob", "WebGUI::Workflow::Activity::DeleteExpiredSessions",
|
||||||
"WebGUI::Workflow::Activity::DeleteExpiredGroupings", "WebGUI::Workflow::Activity::PurgeOldAssetRevisions",
|
"WebGUI::Workflow::Activity::DeleteExpiredGroupings", "WebGUI::Workflow::Activity::PurgeOldAssetRevisions",
|
||||||
"WebGUI::Workflow::Activity::ExpireSubscriptionCodes", "WebGUI::Workflow::Activity::PurgeOldTrash"],
|
"WebGUI::Workflow::Activity::ExpireSubscriptionCodes", "WebGUI::Workflow::Activity::PurgeOldTrash",
|
||||||
|
"WebGUI::Workflow::Activity::GetSyndicatedContent"],
|
||||||
"WebGUI::User"=>["WebGUI::Workflow::Activity::CreateCronJob"],
|
"WebGUI::User"=>["WebGUI::Workflow::Activity::CreateCronJob"],
|
||||||
"WebGUI::VersionTag"=>["WebGUI::Workflow::Activity::CommitVersionTag", "WebGUI::Workflow::Activity::RollbackVersionTag",
|
"WebGUI::VersionTag"=>["WebGUI::Workflow::Activity::CommitVersionTag", "WebGUI::Workflow::Activity::RollbackVersionTag",
|
||||||
"WebGUI::Workflow::Activity::TrashVersionTag", "WebGUI::Workflow::Activity::CreateCronJob"]
|
"WebGUI::Workflow::Activity::TrashVersionTag", "WebGUI::Workflow::Activity::CreateCronJob"]
|
||||||
|
|
@ -203,6 +204,8 @@ sub addWorkflow {
|
||||||
}, "pbworkflow000000000004");
|
}, "pbworkflow000000000004");
|
||||||
$activity = $workflow->addActivity("WebGUI::Workflow::Activity::DeleteExpiredSessions", "pbwfactivity0000000009");
|
$activity = $workflow->addActivity("WebGUI::Workflow::Activity::DeleteExpiredSessions", "pbwfactivity0000000009");
|
||||||
$activity->set("title", "delete expired sessions");
|
$activity->set("title", "delete expired sessions");
|
||||||
|
$activity = $workflow->addActivity("WebGUI::Workflow::Activity::GetSyndicatedContent", "pbwfactivity0000000012");
|
||||||
|
$activity->set("title", "Get syndicated content");
|
||||||
WebGUI::Workflow::Cron->create($session, {
|
WebGUI::Workflow::Cron->create($session, {
|
||||||
title=>'Hourly Maintenance',
|
title=>'Hourly Maintenance',
|
||||||
enabled=>1,
|
enabled=>1,
|
||||||
|
|
|
||||||
|
|
@ -231,10 +231,10 @@ sub _normalize_items {
|
||||||
|
|
||||||
#-------------------------------------------------------------------
|
#-------------------------------------------------------------------
|
||||||
sub _get_rss_data {
|
sub _get_rss_data {
|
||||||
my $self = shift;
|
my $session = shift;
|
||||||
my $url = shift;
|
my $url = shift;
|
||||||
|
|
||||||
my $cache = WebGUI::Cache->new($self->session,'url:' . $url, 'RSS');
|
my $cache = WebGUI::Cache->new($session,'url:' . $url, 'RSS');
|
||||||
my $rss_serial = $cache->get;
|
my $rss_serial = $cache->get;
|
||||||
my $rss = {};
|
my $rss = {};
|
||||||
if ($rss_serial) {
|
if ($rss_serial) {
|
||||||
|
|
@ -243,7 +243,7 @@ sub _get_rss_data {
|
||||||
my $ua = LWP::UserAgent->new(timeout => 5);
|
my $ua = LWP::UserAgent->new(timeout => 5);
|
||||||
my $response = $ua->get($url);
|
my $response = $ua->get($url);
|
||||||
if (!$response->is_success()) {
|
if (!$response->is_success()) {
|
||||||
$self->session->errorHandler->warn("Error retrieving url '$url': " .
|
$session->errorHandler->warn("Error retrieving url '$url': " .
|
||||||
$response->status_line());
|
$response->status_line());
|
||||||
return undef;
|
return undef;
|
||||||
}
|
}
|
||||||
|
|
@ -256,7 +256,7 @@ sub _get_rss_data {
|
||||||
my $encoding = 'utf8';
|
my $encoding = 'utf8';
|
||||||
if (lc($xmlEncoding) ne lc($encoding)) {
|
if (lc($xmlEncoding) ne lc($encoding)) {
|
||||||
eval { from_to($xml, $xmlEncoding, $encoding) };
|
eval { from_to($xml, $xmlEncoding, $encoding) };
|
||||||
$self->session->errorHandler->warn($@) if ($@);
|
$session->errorHandler->warn($@) if ($@);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -266,7 +266,7 @@ sub _get_rss_data {
|
||||||
XML::RSSLite::parseXML($rss_lite, \$xml);
|
XML::RSSLite::parseXML($rss_lite, \$xml);
|
||||||
};
|
};
|
||||||
if ($@) {
|
if ($@) {
|
||||||
$self->session->errorHandler->warn("error parsing rss for url $url :".$@);
|
$session->errorHandler->warn("error parsing rss for url $url :".$@);
|
||||||
#Returning undef on a parse failure is a change from previous behaviour,
|
#Returning undef on a parse failure is a change from previous behaviour,
|
||||||
#but it SHOULDN'T have a major effect.
|
#but it SHOULDN'T have a major effect.
|
||||||
return undef;
|
return undef;
|
||||||
|
|
@ -281,10 +281,10 @@ sub _get_rss_data {
|
||||||
$rss_lite = {channel => $rss_lite};
|
$rss_lite = {channel => $rss_lite};
|
||||||
if (!($rss->{channel} =
|
if (!($rss->{channel} =
|
||||||
_find_record($rss_lite, qr/^channel$/))) {
|
_find_record($rss_lite, qr/^channel$/))) {
|
||||||
$self->session->errorHandler->warn("unable to find channel info for url $url");
|
$session->errorHandler->warn("unable to find channel info for url $url");
|
||||||
}
|
}
|
||||||
if (!($rss->{items} = _find_record($rss_lite, qr/^items?$/))) {
|
if (!($rss->{items} = _find_record($rss_lite, qr/^items?$/))) {
|
||||||
$self->session->errorHandler->warn("unable to find item info for url $url");
|
$session->errorHandler->warn("unable to find item info for url $url");
|
||||||
$rss->{items} = [];
|
$rss->{items} = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -296,7 +296,7 @@ sub _get_rss_data {
|
||||||
#Assign dates "globally" rather than when seen in a viewed feed.
|
#Assign dates "globally" rather than when seen in a viewed feed.
|
||||||
#This is important because we can "filter" now and want to ensure we keep order
|
#This is important because we can "filter" now and want to ensure we keep order
|
||||||
#correctly as new items appear.
|
#correctly as new items appear.
|
||||||
$self->_assign_rss_dates($rss->{items});
|
_assign_rss_dates($session, $rss->{items});
|
||||||
|
|
||||||
#Default to an hour timeout
|
#Default to an hour timeout
|
||||||
$cache->set(Storable::freeze($rss), 3600);
|
$cache->set(Storable::freeze($rss), 3600);
|
||||||
|
|
@ -311,17 +311,17 @@ sub _get_rss_data {
|
||||||
# whole database to keep the thing from growing too large
|
# whole database to keep the thing from growing too large
|
||||||
|
|
||||||
sub _assign_rss_dates {
|
sub _assign_rss_dates {
|
||||||
my $self = shift;
|
my $session = shift;
|
||||||
my ($items) = @_;
|
my ($items) = @_;
|
||||||
|
|
||||||
for my $item (@{$items}) {
|
for my $item (@{$items}) {
|
||||||
my $key = 'dates:' . ($item->{guid} || $item->{title} ||
|
my $key = 'dates:' . ($item->{guid} || $item->{title} ||
|
||||||
$item->{description} || $item->{link});
|
$item->{description} || $item->{link});
|
||||||
my $cache = WebGUI::Cache->new($self->session,$key, 'RSS');
|
my $cache = WebGUI::Cache->new($session,$key, 'RSS');
|
||||||
if (my $date = $cache->get()) {
|
if (my $date = $cache->get()) {
|
||||||
$item->{date} = $date;
|
$item->{date} = $date;
|
||||||
} else {
|
} else {
|
||||||
$item->{date} =$self->session->datetime->time();
|
$item->{date} =$session->datetime->time();
|
||||||
$cache->set($item->{date}, '1 year');
|
$cache->set($item->{date}, '1 year');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -444,7 +444,7 @@ sub _get_items {
|
||||||
$items = [];
|
$items = [];
|
||||||
|
|
||||||
for my $url (@{$urls}) {
|
for my $url (@{$urls}) {
|
||||||
my $rss_info=$self->_get_rss_data($url);
|
my $rss_info=_get_rss_data($self->session,$url);
|
||||||
push(@rss_feeds, $rss_info) if($rss_info);
|
push(@rss_feeds, $rss_info) if($rss_info);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
90
lib/WebGUI/Workflow/Activity/GetSyndicatedContent.pm
Normal file
90
lib/WebGUI/Workflow/Activity/GetSyndicatedContent.pm
Normal file
|
|
@ -0,0 +1,90 @@
|
||||||
|
package WebGUI::Workflow::Activity::GetSyndicatedContent;
|
||||||
|
|
||||||
|
|
||||||
|
=head1 LEGAL
|
||||||
|
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
WebGUI is Copyright 2001-2006 Plain Black Corporation.
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
Please read the legal notices (docs/legal.txt) and the license
|
||||||
|
(docs/license.txt) that came with this distribution before using
|
||||||
|
this software.
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
http://www.plainblack.com info@plainblack.com
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
|
||||||
|
=cut
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use base 'WebGUI::Workflow::Activity';
|
||||||
|
use WebGUI::Asset::Wobject::SyndicatedContent;
|
||||||
|
|
||||||
|
=head1 NAME
|
||||||
|
|
||||||
|
Package WebGUI::Workflow::Activity::GetSyndicatedContent;
|
||||||
|
|
||||||
|
=head1 DESCRIPTION
|
||||||
|
|
||||||
|
Prefetches syndicated content URLs so that the pages can be served up more quickly.
|
||||||
|
|
||||||
|
=head1 SYNOPSIS
|
||||||
|
|
||||||
|
See WebGUI::Workflow::Activity for details on how to use any activity.
|
||||||
|
|
||||||
|
=head1 METHODS
|
||||||
|
|
||||||
|
These methods are available from this class:
|
||||||
|
|
||||||
|
=cut
|
||||||
|
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------
|
||||||
|
|
||||||
|
=head2 definition ( session, definition )
|
||||||
|
|
||||||
|
See WebGUI::Workflow::Activity::definition() for details.
|
||||||
|
|
||||||
|
=cut
|
||||||
|
|
||||||
|
# Adds this activity's entry -- its internationalized name and an empty
# property list -- to the definition stack, then hands the stack to the
# parent class and returns whatever the parent returns.
#
#   $class      - class the method was invoked on
#   $session    - session passed to WebGUI::International and the parent
#   $definition - array reference the new entry is pushed onto
sub definition {
    my ($class, $session, $definition) = @_;

    # Labels come from the SyndicatedContent asset's i18n namespace.
    my $i18n = WebGUI::International->new($session, "Asset_SyndicatedContent");

    my %entry = (
        name       => $i18n->get("get syndicated content"),
        properties => {},
    );
    push @{$definition}, \%entry;

    return $class->SUPER::definition($session, $definition);
}
|
||||||
|
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------
|
||||||
|
|
||||||
|
=head2 execute ( )
|
||||||
|
|
||||||
|
See WebGUI::Workflow::Activity::execute() for details.
|
||||||
|
|
||||||
|
=cut
|
||||||
|
|
||||||
|
# Prefetches the feeds referenced by published SyndicatedContent assets by
# passing each feed URL to
# WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data.
#
# A single rssUrl value may contain several whitespace-separated URLs, so
# each value is split apart. Every distinct URL is passed on at most once
# per run, and undefined or blank values are skipped.
#
# Returns 1 after all URLs have been processed.
sub execute {
    my $self    = shift;
    my $session = $self->session;

    #In the new Wobject, "rssURL" actually can refer to more than one URL.
    my @syndicatedWobjectURLs = $session->db->buildArray("select distinct SyndicatedContent.rssUrl from SyndicatedContent left join asset on SyndicatedContent.assetId=asset.assetId where asset.state='published'");

    my %seen;
    foreach my $urlList (@syndicatedWobjectURLs) {
        # A row without a feed list has nothing to prefetch.
        next unless defined $urlList;

        # split ' ' (unlike split /\s+/) discards leading whitespace, so a
        # padded value never produces an empty "URL".
        foreach my $url (split ' ', $urlList) {
            # The same URL may be listed by more than one asset; fetch it once.
            next if $seen{$url}++;
            WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data($session, $url);
        }
    }

    return 1;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
1;
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,6 +1,12 @@
|
||||||
package WebGUI::i18n::English::Asset_SyndicatedContent;
|
package WebGUI::i18n::English::Asset_SyndicatedContent;
|
||||||
|
|
||||||
our $I18N = {
|
our $I18N = {
|
||||||
|
'get syndicated content' => {
|
||||||
|
'lastUpdated' => 0,
|
||||||
|
'message' => 'Get Syndicated Content',
|
||||||
|
context => ' the title of the get syndicated content workflow activity'
|
||||||
|
},
|
||||||
|
|
||||||
'1' => {
|
'1' => {
|
||||||
'lastUpdated' => 1031514049,
|
'lastUpdated' => 1031514049,
|
||||||
'message' => 'URL to RSS File'
|
'message' => 'URL to RSS File'
|
||||||
|
|
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
package Hourly::GetSyndicatedContent;
|
|
||||||
|
|
||||||
use strict;
|
|
||||||
use warnings;
|
|
||||||
use WebGUI::SQL;
|
|
||||||
use WebGUI::Asset::Wobject::SyndicatedContent;
|
|
||||||
|
|
||||||
=head2 Hourly::GetSyndicatedContent
|
|
||||||
|
|
||||||
Loops through all the URLs in the SyndicatedWobjects and puts them into WebGUI::Cache if they haven't been spidered or if they have expired from the cache. This should reduce HTTP traffic a little, and allow for more granular scheduling of feed downloads in the future.
|
|
||||||
|
|
||||||
=cut
|
|
||||||
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------
|
|
||||||
# Selects the distinct rssUrl values of published SyndicatedContent assets,
# splits each value on whitespace, and passes every resulting URL to
# WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data.
# Takes no arguments and has no explicit return statement.
# NOTE(review): buildArray and _get_rss_data are called here without a
# session argument -- confirm this matches the signatures this script ran
# against.
sub process{
|
|
||||||
|
|
||||||
#In the new Wobject, "rssURL" actually can refer to more than one URL.
|
|
||||||
my @syndicatedWobjectURLs = WebGUI::SQL->buildArray("select distinct SyndicatedContent.rssUrl from SyndicatedContent left join asset on SyndicatedContent.assetId=asset.assetId where asset.state='published'");
|
|
||||||
foreach my $url(@syndicatedWobjectURLs) {
|
|
||||||
|
|
||||||
#Loop through the SyndicatedWobjects and split all the URLs they are syndicating off into
|
|
||||||
#a separate array.
|
|
||||||
|
|
||||||
my @urlsToSyndicate = split(/\s+/,$url);
|
|
||||||
foreach ((@urlsToSyndicate)) {
|
|
||||||
WebGUI::Asset::Wobject::SyndicatedContent::_get_rss_data($_);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
1;
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue