From c53feaf14f6433f4f7ccfbd75a79db7ee04f4d9f Mon Sep 17 00:00:00 2001 From: JT Smith Date: Thu, 10 Apr 2003 03:04:09 +0000 Subject: [PATCH] Updated Syndicated Content to use new improved RSS feed reader. --- lib/WebGUI/Wobject/SyndicatedContent.pm | 6 +- lib/XML/RSSLite.pm | 647 ++++++++++++------------ 2 files changed, 327 insertions(+), 326 deletions(-) diff --git a/lib/WebGUI/Wobject/SyndicatedContent.pm b/lib/WebGUI/Wobject/SyndicatedContent.pm index c66b43383..3e4a474c2 100644 --- a/lib/WebGUI/Wobject/SyndicatedContent.pm +++ b/lib/WebGUI/Wobject/SyndicatedContent.pm @@ -86,8 +86,8 @@ sub www_view { unless (defined $rssFile) { $rssFile = $cache->setByHTTP($_[0]->get("rssUrl"),3600); } - $rssFile =~ s#()(.*?)()#$1.encode_entities($2).$3#ges; - eval{parseXML(\%rss, \$rssFile)}; + $rssFile =~ s#()(.*?)()#$1.encode_entities(decode_entities($2)).$3#ges; + eval{parseRSS(\%rss, \$rssFile)}; if ($@) { WebGUI::ErrorHandler::warn($_[0]->get("rssUrl")." ".$@); } @@ -96,7 +96,7 @@ sub www_view { $var{"channel.link"} = $rss{link}; $var{"channel.description"} = $rss{description}; my @items; - foreach my $item (@{$rss{items}}) { + foreach my $item (@{$rss{item}}) { push (@items,{ link=>$item->{link}, title=>$item->{title}, diff --git a/lib/XML/RSSLite.pm b/lib/XML/RSSLite.pm index 42562ba72..09c9e9b16 100644 --- a/lib/XML/RSSLite.pm +++ b/lib/XML/RSSLite.pm @@ -1,334 +1,299 @@ #!/usr/bin/perl package XML::RSSLite; use strict; -use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); -use Exporter; -@ISA = ('Exporter'); -@EXPORT = qw/parseXML usableXML/; -@EXPORT_OK = qw/parseXML usableXML - isRSS isRDF isSN isWL - xml_content_string xml_content_array/; -$VERSION = '0.08'; +use vars qw($VERSION); + +$VERSION = 0.11; + +sub import{ + no strict 'refs'; + shift; + my $pkg = scalar caller(); + *{"${pkg}::parseRSS"} = \&parseRSS; + *{"${pkg}::parseXML"} = \&parseXML if grep($_ eq 'parseXML', @_); +} -sub parseXML { - my ($rr, $cr) = @_; +sub parseRSS { + my ($rr, $cref) = @_; - die "Parms to 'parse' must be refs to a hash and XML content!" - unless (ref($rr) and ref($cr)); + die "$rr is not a hash reference" unless ref($rr) eq 'HASH'; + die "$cref is not a scalar reference" unless ref($cref) eq 'SCALAR'; - return unless $$cr; ## Gotta have some content to parse + # Gotta have some content to parse + return unless $$cref; - my $type = usableXML($cr) - or die "Content must be RSS/RDF/ScriptingNews/Weblog XML " . - "(or something pretty close)"; - - preprocess($cr); - - if ($type == 1 or $type == 2) { - parseRSS($rr, $cr); - } elsif ($type == 3) { - parseSN($rr, $cr); - } elsif ($type == 4) { - parseWL($rr, $cr); - } else { - die "Screwed up XML type-checking somehow!"; + preprocess($cref); + { + _parseRSS($rr, $cref), last if index(${$cref}, '|<$1$2>|g; - $$cr =~ s|<([^<> ]+)\s+(.+?)\s+/>|<$1 $2>|g; - $$cr =~ s/[\012\015]{1,2}/\n/g; - $$cr =~ s/&(?!([a-zA-Z0-9]+|#\d+);)/&/g; - $$cr =~ y/\~\[\n 0-9a-zA-Z_\!\@\#\$\%\^\&\*\(\)\-\+\=\:\;\"\'\<\>\,\.\/?\]/ /c; - - ## Tidy up for debugging by starting open tags on new line -# $content =~ s|(?!\n)<(?!/)|\n<|gs; + my $cref = shift; + $$cref =~ y/\r\n/\n/s; + $$cref =~ y{\n\t ~0-9\-+!@#$%^&*()_=a-zA-Z[]\\;':",./<>?}{ }cs; + #XXX $$cref =~ s/&(?!0[a-zA-Z0-9]+|#\d+);/amp/gs; + #XXX Do we wish to (re)allow escaped HTML?! + $$cref =~ s{(?:<|<)/?(?:b|i|h\d|p|center|quote|strong)(?:>|>)}{}gsi; } +sub _parseRSS { + parseXML($_[0], $_[1], 'channel', 0); + $_[0]->{'items'} = $_[0]->{'item'}; +} + +sub _parseRDF { + my ($rr, $cref) = @_; + + $rr->{'items'} = []; + my $item; + + parseXML($_[0], $_[1], 'rdf:RDF', 0); + + # Alias RDF to RSS + if( exists($rr->{'item'}) ){ + $rr->{'items'} = $rr->{'item'}; + } + else{ + my $li = $_[0]->{'rdf:li'} || $_[0]->{'rdf:Seq'}->{'rdf:li'}; + foreach $item ( @{$li} ){ + my %ia; + if (exists $item->{'dc:description'}) { + $ia{'description'} = $item->{'dc:description'}; + } + if (exists $item->{'dc:title'}) { + $ia{'title'} = $item->{'dc:title'}; + } + if (exists $item->{'dc:identifier'}) { + $ia{'link'} = delete($item->{'dc:identifier'}); + } + + push(@{$rr->{'items'}}, \%ia); + } + } +} + +sub _parseSN { + my ($rr, $cref) = @_; + + $rr->{'items'} = (); + my $item; + + parseXML($rr, $cref, 'channel', 0); + + # Alias SN to RSS terms + foreach $item ( @{$_[0]->{'rdf:li'}} ){ + my %ia; + if (exists $item->{'text'}) { + $ia{'description'} = $item->{'text'}; + } + if (exists $item->{'linetext'}) { + $ia{'title'} = $item->{'linetext'}; + } + if (exists $item->{'url'}) { + $ia{'link'} = $item->{'url'}; + } + + push(@{$rr->{'items'}}, \%ia); + } +} + + +sub _parseWL { + my ($rr, $cref) = @_; + + $rr->{'items'} = (); + my $item; + + #XXX is this the right tag to parse for? + parseXML($rr, $cref, 'channel', 0); + + # Alias WL to RSS + foreach $item ( @{$_[0]->{'rdf:li'}} ){ + my %ia; + if (exists $item->{'url'}) { + $ia{'link'} = delete($item->{'url'}); + } + + push(@{$rr->{'items'}}, \%ia); + } +} + + sub postprocess { my $rr = shift; - $rr->{'link'} =~ s/&/&/gi; + #XXX Not much to do, what about un-munging URL's in source, etc.?! + return unless defined($rr->{'items'}); + $rr->{'items'} = [$rr->{'items'}] unless ref($rr->{'items'}) eq 'ARRAY'; - if (defined($rr->{'items'})) { - my $i; - - foreach $i (@{$rr->{'items'}}) { - $i->{'link'} = trim($i->{'link'}); - - # Put stuff into the right name if necessary - if (defined($i->{'url'}) and not $i->{'link'}) { - $i->{'link'} = $i->{'url'}; - } - - # Fix pre-process munging - $i->{'link'} =~ s/&/&/gi; + foreach my $i (@{$rr->{'items'}}) { + $i->{description} = $i->{description}->{'<>'} if ref($i->{description}); + # Put stuff into the right name if necessary + if( not $i->{'link'} ){ + if( defined($i->{'url'}) ){ + $i->{'link'} = delete($i->{'url'}); } # See if you can use misplaced url in title for empty links - if (not $i->{'link'}) { - if ($i->{'title'} =~ /^(?:ht)|ftp:/) { - $i->{'link'} = $i->{'title'}; - } elsif ($i->{'title'} =~ /"((?:ht)|ftp.*?)"/) { - $i->{'link'} = $1; - $i->{'title'} =~ s/<.*?>//; - } else { - next; - } + elsif( exists($i->{'title'}) ){ + # The next case would trap this, but try to short-circuit the gathering + if ($i->{'title'} =~ /^(?:ht)|ftp:/) { + $i->{'link'} = $i->{'title'}; + } + elsif ($i->{'title'} =~ /"((?:ht)|ftp.*?)"/) { + $i->{'link'} = $1; + $i->{'title'} =~ s/<.*?>//; + } + else { + next; + } } - - # Make sure you've got an http/ftp link - if ($i->{'link'} !~ m{^(http|ftp)://}i) { - ## Rip link out of anchor tag - $i->{'link'} =~ m{a\s+href=("|")?(.*?)("|>|"|>)?}i; - if ($2) { - $i->{'link'} = $2; - - } elsif ($i->{'link'} =~ m{[\.#/]}i and - $rr->{'link'} =~ m{^http://}) { - ## Smells like a relative url - if (substr($i->{'link'}, 0, 1) ne '/') { - $i->{'link'} = '/' . $i->{'link'}; - } - $i->{'link'} = $rr->{'link'} . $i->{'link'}; - - } else { - next; - } - } - - #If we don't have a title, use the link - if (not $i->{'title'}) { - $i->{'title'} = $i->{'link'}; - } - $i->{'link'} =~ s/ //g; - } - } -} - -sub parseRSS { - my ($rr, $cr) = @_; - - my $channel = xml_content_string('channel', $cr); - $channel =~ s|||gis; - clean(\$channel); - - my $ca; - my @channel_attrs = ($channel =~ m|(<.*?>.*?)|gi); - foreach $ca (@channel_attrs) { - $ca =~ m|^<(.*?)>(.*?)$|; - $rr->{$1} = trim($2); - } - - $rr->{'items'} = (); - my $item; - foreach $item (xml_content_array('item', $cr)) { - clean(\$item); - my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); - my $ia; - my %ia; - foreach $ia (@item_attrs) { - $ia =~ m|^<(.*?)>(.*?)$|; - $ia{$1} = trim($2); - } - push(@{$rr->{'items'}}, \%ia); -#XXX test blanks here - } -} - -sub parseSN { - my ($rr, $cr) = @_; - - my $channel = xml_content_string('header', $cr); - $channel =~ s|||gis; - clean(\$channel); - - my $ca; - my @channel_attrs = ($channel =~ m|(<.*?>.*?)|gi); - foreach $ca (@channel_attrs) { - $ca =~ m|^<(.*?)>(.*?)$|; - $rr->{$1} = trim($2); - } - -## -## Alias SN to RSS terms -## - if (exists $rr->{'channelDescription'}) { - $rr->{'description'} = $rr->{'channelDescription'}; - } - if (exists $rr->{'channelTitle'}) { - $rr->{'title'} = $rr->{'channelTitle'}; - } - if (exists $rr->{'channelLink'}) { - $rr->{'link'} = $rr->{'channelLink'}; - } - - $rr->{'items'} = (); - my $item; - foreach $item (xml_content_array('item', $cr)) { - clean(\$item); - my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); - my $ia; - my %ia; - foreach $ia (@item_attrs) { - $ia =~ m|^<(.*?)>(.*?)$|; - $ia{$1} = trim($2); - } - - # Links are nested, kill prev {'link'} and rebuild attrs inside it - undef $ia{'link'}; - my @linkitems = xml_content_array('link', \$item) - or next; - - my $linkitem = $linkitems[0]; ## Usually first one is most relevant - @item_attrs = ($linkitem =~ m|(<.*?>.*?)|gi); - foreach $ia (@item_attrs) { - $ia =~ m|^<(.*?)>(.*?)$|; - $ia{$1} = trim($2); } - # Alias SN to RSS - if (exists $ia{'text'}) { - $ia{'description'} = $ia{'text'}; - } - if (exists $ia{'linetext'}) { - $ia{'title'} = $ia{'linetext'}; - } - if (exists $ia{'url'}) { - $ia{'link'} = $ia{'url'}; - } - push(@{$rr->{'items'}}, \%ia); - } -} - - -sub parseWL { - my ($rr, $cr) = @_; - -# my $channel = xml_content_string('header', $cr); -# $channel =~ s|||gis; -# clean(\$channel); - -# my $ca; -# my @channel_attrs = ($channel =~ m|(<.*?>.*?)|gi); -# foreach $ca (@channel_attrs) { -# $ca =~ m|^<(.*?)>(.*?)$|; -# $rr->{$1} = trim($2); -# } - -## -## Alias SN to RSS terms -## -# if (exists $rr->{'channelDescription'}) { -# $rr->{'description'} = $rr->{'channelDescription'}; -# } -# if (exists $rr->{'channelTitle'}) { -# $rr->{'title'} = $rr->{'channelTitle'}; -# } -# if (exists $rr->{'channelLink'}) { -# $rr->{'link'} = $rr->{'channelLink'}; -# } - - $rr->{'items'} = (); - my $item; - foreach $item (xml_content_array('link', $cr)) { - clean(\$item); - my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); - my $ia; - my %ia; - foreach $ia (@item_attrs) { - $ia =~ m|^<(.*?)>(.*?)$|; - $ia{$1} = trim($2); + # Make sure you've got an http/ftp link + if( exists( $i->{'link'}) && $i->{'link'} !~ m{^(http|ftp)://}i) { + ## Rip link out of anchor tag + $i->{'link'} =~ m{a\s+href=("|")?(.*?)("|>|"|>)?}i; + if( $2 ){ + $i->{'link'} = $2; + } + elsif( $i->{'link'} =~ m{[\.#/]}i and $rr->{'link'} =~ m{^http://} ){ + ## Smells like a relative url + if (substr($i->{'link'}, 0, 1) ne '/') { + $i->{'link'} = '/' . $i->{'link'}; + } + $i->{'link'} = $rr->{'link'} . $i->{'link'}; + } + else { + next; + } + } + + #If we don't have a title, use the link + unless( defined($i->{'title'}) ){ + $i->{'title'} = $i->{'link'}; + } + + if( exists($i->{'link'}) ){ +#XXX # Fix pre-process munging +# $i->{'link'} =~ s/&/&/gi; + $i->{'link'} =~ s/ /%20/g; } - # Alias WL to RSS - if (exists $ia{'url'}) { - $ia{'link'} = $ia{'url'}; - } - - push(@{$rr->{'items'}}, \%ia); } } -sub usableXML { - my $cref = shift; - my $content = $$cref; ## Don't change caller's content just for usability check - - preprocess(\$content); +sub parseXML{ + my($hash, $xml, $tag, $comments) = @_; + my($begin, $end, @comments); + local $_; - return 1 if isRSS(\$content); - return 2 if isRDF(\$content); - return 3 if isSN(\$content); - return 4 if isWL(\$content); - - return 0; -} - -sub isRSS { - my $cref = shift; - return scalar($$cref =~ /.*<\/rss>/is); -} - -sub isRDF { - my $cref = shift; - return scalar($$cref =~ /.*<\/rdf:RDF>/is); -} - -sub isSN { - my $cref = shift; - return scalar($$cref =~ /.*<\/scriptingnews>/is); -} - -sub isWL { - my $cref = shift; - return scalar($$cref =~ /.*<\/weblog>/is); -} - -sub xml_content_string { - my $tag = shift; - my $cref = shift; - - $$cref =~ /<${tag}.*?>(.*)<\/${tag}>/is; - return $1; -} - -sub xml_content_array { - my $tag = shift; - my $cref = shift; - my $keeptags = shift; - $keeptags = 0 unless $keeptags; - my @result; - - if ($keeptags) { - @result = ($$cref =~ /(<${tag}.*?>.*?<\/${tag}>)/gis); - } else { - @result = ($$cref =~ /<${tag}.*?>(.*?)<\/${tag}>/gis); + #Kill comments + while( ($begin = index(${$xml}, '') for @comments; + + #Expose comments if requested + do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq 'ARRAY'; + } } -sub clean { - my $cref = shift; +sub _parseXML{ + my($hash, $xml, $tag, $index) = @_; + my($begin, $end); - $$cref =~ s{(\n|

|

|||||||||
|
||)}{ }gsi; -} + #Find topTag and set pos to start matching from there + ${$xml} =~ /<$tag(?:>|\s)/g; + ($begin, $end) = (0, pos(${$xml})||0); -sub trim { - my $s = shift; + #Match either or , optional attributes, stash tag name + while( ${$xml} =~ m%<([^\s>]+)(?:\s+[^>]*?)?(?:/|>.*?%sg ){ - $s =~ s/^\s*(.*?)\s*$/$1/; - return $s; + #Save the tag name, we'll need it + $tag = $1 || $2; + + #Save the new beginning and end + ($begin, $end) = ($end, pos(${$xml})); + + #Get the bit we just matched. + my $str = substr(${$xml}, $begin, $end-$begin); + + #Extract the actual attributes and contents of the tag + $str =~ m%<\Q$tag\E\s*([^>]*?)?>(.*?)%s || +#XXX pointed out by hv +# $str =~ s%^.*?<$tag\s*([^>]*?)?>(.*?)%<$tag>$2%s || + $str =~ m%<\Q$tag\E\s*([^>]*?)?\s*/>%; + my($attr, $content) = ($1, $2); + + #Did we get attributes? clean them up and chuck them in a hash. + if( $attr ){ + ($_, $attr) = ($attr, {}); + $attr->{$1} = $3 while m/([^\s=]+)\s*=\s*(['"])(.*?)\2/g; + } + + my $inhash; + #Recurse if contents has more tags, replace contents with reference we get + if( $content && index($content, '<') > -1 ){ + _parseXML($inhash={}, \$str, $tag); + #Was there any data in the contents? We should extract that... + if( $str =~ />[^><]+)?\s*(?=<)%%g; #XXX ~same speed, wacko warning + #$str =~ s%(>?)\s*<%$1<%g; +#XXX #$str =~ s%(?:^|(?<=>))\s*(?:(?=<)|\z)%%g + + my $qr = qr{@{[join('|', keys %{$inhash})]}}; + $content =~ s%<($qr)\s*(?:[^>]*?)?(?:/|>.*?%%sg; + + $inhash->{'<>'} = $content if $content =~ /\S/; + } + } + + if( ref($inhash) ){ + #We have attributes? Then we should merge them. + if( ref($attr) ){ + for( keys %{$attr} ){ + $inhash->{$_} = exists($inhash->{$_}) ? + (ref($inhash->{$_}) eq 'ARRAY' ? + [@{$inhash->{$_}}, $attr->{$_}] : + [ $inhash->{$_}, $attr->{$_}] ) : $attr->{$_}; + } + } + } + elsif( ref($attr) ){ + $inhash = $attr; + } + else{ + #Otherwise save our content + $inhash = $content; + } + + $hash->{$tag} = exists($hash->{$tag}) ? + (ref($hash->{$tag}) eq 'ARRAY' ? + [@{$hash->{$tag}}, $inhash] : + [ $hash->{$tag}, $inhash] ) : $inhash; + } } 1; @@ -338,7 +303,7 @@ __END__ =head1 NAME -XML::RSSLite - Perl extension for "relaxed" RSS parsing +XML::RSSLite - lightweight, "relaxed" RSS (and XML-ish) parser =head1 SYNOPSIS @@ -346,14 +311,14 @@ XML::RSSLite - Perl extension for "relaxed" RSS parsing . . . - parseXML(\%result, \$content); + parseRSS(\%result, \$content); print "=== Channel ===\n", "Title: $result{'title'}\n", "Desc: $result{'description'}\n", "Link: $result{'link'}\n\n"; - foreach $item (@{$result{'items'}}) { + foreach $item (@{$result{'item'}}) { print " --- Item ---\n", " Title: $item->{'title'}\n", " Desc: $item->{'description'}\n", @@ -369,7 +334,7 @@ old-fashioned Perl regular expressions. It stores the data in a simple hash structure, and "aliases" certain tags so that when done, you can count on having the minimal data necessary for re-constructing a valid RSS file. This means you get the basic title, description, and link for a -channel and its items. Anything else present in the hash is a bonus :) +channel and its items. This module extracts more usable links by parsing "scriptingNews" and "weblog" formats in addition to RDF & RSS. It also "sanitizes" the @@ -397,60 +362,96 @@ output for best results. The munging includes: =over -=item parseXML($outHashRef, $inScalarRef) +=item parseRSS($outHashRef, $inScalarRef) I<$inScalarRef> is a reference to a scalar containing the document to be parsed, the contents will effectively be destroyed. I<$outHashRef> is a reference to the hash within which to store the parsed content. -=item usableXML($inScalarRef) - -Test whether or not B understands the content of the referenced -document. - =back =head2 EXPORTABLE =over -=item isRDF($inScalarRef) +=item parseXML(\%parsedTree, \$parseThis, 'topTag', $comments); -Tests if a referenced document is RDF. +=over -=item isRSS($inScalarRef) +=item parsedTree - required -Tests if a referenced document is RSS. +Reference to hash to store the parsed document within. -=item isSN($inScalarRef) +=item parseThis - required -Tests if a referenced document is scriptingNews. +Reference to scalar containing the document to parse. -=item isWL($inScalarRef) +=item topTag - optional -Tests if a referenced document is weblog. +Tag to consider the root node, leaving this undefined is not recommended. + +=item comments - optional + +=over + +=item false will remove contents from parseThis + +=item true will not remove comments from parseThis + +=item array reference is true, comments are stored here =back -=head1 BUGS +=back -Sometimes the title of an item will be missed, -the condition will presist until additional items have been added -to the document. As a stop gap, when this happens the item title -is set equal to the item link. +=head2 CAVEATS -It may take awhile for the tuits to fix this to accumulate. -feel free to submit a patch. +This is not a conforming parser. It does not handle the following + +=over + +=item + + + +=item + + + +=item + + + +=item + + PI + +=back + +It's non-validating, without a DTD the following cannot be properly addressed + +=over + +=item entities + +=item namespaces + +This might be arriving in the next release. + +=back + +=back =head1 SEE ALSO -perl(1), C +perl(1), C, C, +C, =head1 AUTHOR Jerrad Pierce . -Scott Thomason +Scott Thomason =head1 LICENSE