From f1229e66fba46a34c362f32c55ed9c185dba548d Mon Sep 17 00:00:00 2001 From: JT Smith Date: Thu, 13 Mar 2003 03:30:23 +0000 Subject: [PATCH] had to revert back to older version --- lib/XML/RSSLite.pm | 651 ++++++++++++++++++++++----------------------- 1 file changed, 325 insertions(+), 326 deletions(-) diff --git a/lib/XML/RSSLite.pm b/lib/XML/RSSLite.pm index 09c9e9b16..42562ba72 100644 --- a/lib/XML/RSSLite.pm +++ b/lib/XML/RSSLite.pm @@ -1,299 +1,334 @@ #!/usr/bin/perl package XML::RSSLite; use strict; -use vars qw($VERSION); - -$VERSION = 0.11; - -sub import{ - no strict 'refs'; - shift; - my $pkg = scalar caller(); - *{"${pkg}::parseRSS"} = \&parseRSS; - *{"${pkg}::parseXML"} = \&parseXML if grep($_ eq 'parseXML', @_); -} +use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); +use Exporter; +@ISA = ('Exporter'); +@EXPORT = qw/parseXML usableXML/; +@EXPORT_OK = qw/parseXML usableXML + isRSS isRDF isSN isWL + xml_content_string xml_content_array/; +$VERSION = '0.08'; -sub parseRSS { - my ($rr, $cref) = @_; +sub parseXML { + my ($rr, $cr) = @_; - die "$rr is not a hash reference" unless ref($rr) eq 'HASH'; - die "$cref is not a scalar reference" unless ref($cref) eq 'SCALAR'; + die "Parms to 'parse' must be refs to a hash and XML content!" + unless (ref($rr) and ref($cr)); - # Gotta have some content to parse - return unless $$cref; + return unless $$cr; ## Gotta have some content to parse - preprocess($cref); - { - _parseRSS($rr, $cref), last if index(${$cref}, '?}{ }cs; - #XXX $$cref =~ s/&(?!0[a-zA-Z0-9]+|#\d+);/amp/gs; - #XXX Do we wish to (re)allow escaped HTML?! - $$cref =~ s{(?:<|<)/?(?:b|i|h\d|p|center|quote|strong)(?:>|>)}{}gsi; + my $cr = shift; + + ## + ## Help create "well-formed" XML so parser doesn't puke by + ## 1. Making unix-style line endings + ## 2. Using & for & (this screws up urls, but we fix it later) + ## 3. Removing objectionable characters + ## + $$cr =~ s|<(/*)rss\d+:(.*?)>|<$1$2>|g; + $$cr =~ s|<([^<> ]+)\s+(.+?)\s+/>|<$1 $2>|g; + $$cr =~ s/[\012\015]{1,2}/\n/g; + $$cr =~ s/&(?!([a-zA-Z0-9]+|#\d+);)/&/g; + $$cr =~ y/\~\[\n 0-9a-zA-Z_\!\@\#\$\%\^\&\*\(\)\-\+\=\:\;\"\'\<\>\,\.\/?\]/ /c; + + ## Tidy up for debugging by starting open tags on new line +# $content =~ s|(?!\n)<(?!/)|\n<|gs; } -sub _parseRSS { - parseXML($_[0], $_[1], 'channel', 0); - $_[0]->{'items'} = $_[0]->{'item'}; -} - -sub _parseRDF { - my ($rr, $cref) = @_; - - $rr->{'items'} = []; - my $item; - - parseXML($_[0], $_[1], 'rdf:RDF', 0); - - # Alias RDF to RSS - if( exists($rr->{'item'}) ){ - $rr->{'items'} = $rr->{'item'}; - } - else{ - my $li = $_[0]->{'rdf:li'} || $_[0]->{'rdf:Seq'}->{'rdf:li'}; - foreach $item ( @{$li} ){ - my %ia; - if (exists $item->{'dc:description'}) { - $ia{'description'} = $item->{'dc:description'}; - } - if (exists $item->{'dc:title'}) { - $ia{'title'} = $item->{'dc:title'}; - } - if (exists $item->{'dc:identifier'}) { - $ia{'link'} = delete($item->{'dc:identifier'}); - } - - push(@{$rr->{'items'}}, \%ia); - } - } -} - -sub _parseSN { - my ($rr, $cref) = @_; - - $rr->{'items'} = (); - my $item; - - parseXML($rr, $cref, 'channel', 0); - - # Alias SN to RSS terms - foreach $item ( @{$_[0]->{'rdf:li'}} ){ - my %ia; - if (exists $item->{'text'}) { - $ia{'description'} = $item->{'text'}; - } - if (exists $item->{'linetext'}) { - $ia{'title'} = $item->{'linetext'}; - } - if (exists $item->{'url'}) { - $ia{'link'} = $item->{'url'}; - } - - push(@{$rr->{'items'}}, \%ia); - } -} - - -sub _parseWL { - my ($rr, $cref) = @_; - - $rr->{'items'} = (); - my $item; - - #XXX is this the right tag to parse for? - parseXML($rr, $cref, 'channel', 0); - - # Alias WL to RSS - foreach $item ( @{$_[0]->{'rdf:li'}} ){ - my %ia; - if (exists $item->{'url'}) { - $ia{'link'} = delete($item->{'url'}); - } - - push(@{$rr->{'items'}}, \%ia); - } -} - - sub postprocess { my $rr = shift; - #XXX Not much to do, what about un-munging URL's in source, etc.?! - return unless defined($rr->{'items'}); - $rr->{'items'} = [$rr->{'items'}] unless ref($rr->{'items'}) eq 'ARRAY'; + $rr->{'link'} =~ s/&/&/gi; - foreach my $i (@{$rr->{'items'}}) { - $i->{description} = $i->{description}->{'<>'} if ref($i->{description}); + if (defined($rr->{'items'})) { + my $i; + + foreach $i (@{$rr->{'items'}}) { + $i->{'link'} = trim($i->{'link'}); + + # Put stuff into the right name if necessary + if (defined($i->{'url'}) and not $i->{'link'}) { + $i->{'link'} = $i->{'url'}; + } + + # Fix pre-process munging + $i->{'link'} =~ s/&/&/gi; - # Put stuff into the right name if necessary - if( not $i->{'link'} ){ - if( defined($i->{'url'}) ){ - $i->{'link'} = delete($i->{'url'}); } # See if you can use misplaced url in title for empty links - elsif( exists($i->{'title'}) ){ - # The next case would trap this, but try to short-circuit the gathering - if ($i->{'title'} =~ /^(?:ht)|ftp:/) { - $i->{'link'} = $i->{'title'}; - } - elsif ($i->{'title'} =~ /"((?:ht)|ftp.*?)"/) { - $i->{'link'} = $1; - $i->{'title'} =~ s/<.*?>//; - } - else { - next; - } + if (not $i->{'link'}) { + if ($i->{'title'} =~ /^(?:ht)|ftp:/) { + $i->{'link'} = $i->{'title'}; + } elsif ($i->{'title'} =~ /"((?:ht)|ftp.*?)"/) { + $i->{'link'} = $1; + $i->{'title'} =~ s/<.*?>//; + } else { + next; + } } - } - - # Make sure you've got an http/ftp link - if( exists( $i->{'link'}) && $i->{'link'} !~ m{^(http|ftp)://}i) { - ## Rip link out of anchor tag - $i->{'link'} =~ m{a\s+href=("|")?(.*?)("|>|"|>)?}i; - if( $2 ){ - $i->{'link'} = $2; + + # Make sure you've got an http/ftp link + if ($i->{'link'} !~ m{^(http|ftp)://}i) { + ## Rip link out of anchor tag + $i->{'link'} =~ m{a\s+href=("|")?(.*?)("|>|"|>)?}i; + if ($2) { + $i->{'link'} = $2; + + } elsif ($i->{'link'} =~ m{[\.#/]}i and + $rr->{'link'} =~ m{^http://}) { + ## Smells like a relative url + if (substr($i->{'link'}, 0, 1) ne '/') { + $i->{'link'} = '/' . $i->{'link'}; + } + $i->{'link'} = $rr->{'link'} . $i->{'link'}; + + } else { + next; + } } - elsif( $i->{'link'} =~ m{[\.#/]}i and $rr->{'link'} =~ m{^http://} ){ - ## Smells like a relative url - if (substr($i->{'link'}, 0, 1) ne '/') { - $i->{'link'} = '/' . $i->{'link'}; - } - $i->{'link'} = $rr->{'link'} . $i->{'link'}; + + #If we don't have a title, use the link + if (not $i->{'title'}) { + $i->{'title'} = $i->{'link'}; } - else { - next; - } - } - - #If we don't have a title, use the link - unless( defined($i->{'title'}) ){ - $i->{'title'} = $i->{'link'}; - } - - if( exists($i->{'link'}) ){ -#XXX # Fix pre-process munging -# $i->{'link'} =~ s/&/&/gi; - $i->{'link'} =~ s/ /%20/g; + $i->{'link'} =~ s/ //g; } } } -sub parseXML{ - my($hash, $xml, $tag, $comments) = @_; - my($begin, $end, @comments); - local $_; - - #Kill comments - while( ($begin = index(${$xml}, '') for @comments; - - #Expose comments if requested - do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq 'ARRAY'; + $rr->{'items'} = (); + my $item; + foreach $item (xml_content_array('item', $cr)) { + clean(\$item); + my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); + my $ia; + my %ia; + foreach $ia (@item_attrs) { + $ia =~ m|^<(.*?)>(.*?)$|; + $ia{$1} = trim($2); + } + push(@{$rr->{'items'}}, \%ia); +#XXX test blanks here } } -sub _parseXML{ - my($hash, $xml, $tag, $index) = @_; - my($begin, $end); - - #Find topTag and set pos to start matching from there - ${$xml} =~ /<$tag(?:>|\s)/g; - ($begin, $end) = (0, pos(${$xml})||0); - - #Match either or , optional attributes, stash tag name - while( ${$xml} =~ m%<([^\s>]+)(?:\s+[^>]*?)?(?:/|>.*?%sg ){ - - #Save the tag name, we'll need it - $tag = $1 || $2; - - #Save the new beginning and end - ($begin, $end) = ($end, pos(${$xml})); - - #Get the bit we just matched. - my $str = substr(${$xml}, $begin, $end-$begin); - - #Extract the actual attributes and contents of the tag - $str =~ m%<\Q$tag\E\s*([^>]*?)?>(.*?)%s || -#XXX pointed out by hv -# $str =~ s%^.*?<$tag\s*([^>]*?)?>(.*?)%<$tag>$2%s || - $str =~ m%<\Q$tag\E\s*([^>]*?)?\s*/>%; - my($attr, $content) = ($1, $2); - - #Did we get attributes? clean them up and chuck them in a hash. - if( $attr ){ - ($_, $attr) = ($attr, {}); - $attr->{$1} = $3 while m/([^\s=]+)\s*=\s*(['"])(.*?)\2/g; - } - - my $inhash; - #Recurse if contents has more tags, replace contents with reference we get - if( $content && index($content, '<') > -1 ){ - _parseXML($inhash={}, \$str, $tag); - #Was there any data in the contents? We should extract that... - if( $str =~ />[^><]+)?\s*(?=<)%%g; #XXX ~same speed, wacko warning - #$str =~ s%(>?)\s*<%$1<%g; -#XXX #$str =~ s%(?:^|(?<=>))\s*(?:(?=<)|\z)%%g - - my $qr = qr{@{[join('|', keys %{$inhash})]}}; - $content =~ s%<($qr)\s*(?:[^>]*?)?(?:/|>.*?%%sg; - - $inhash->{'<>'} = $content if $content =~ /\S/; - } - } - - if( ref($inhash) ){ - #We have attributes? Then we should merge them. - if( ref($attr) ){ - for( keys %{$attr} ){ - $inhash->{$_} = exists($inhash->{$_}) ? - (ref($inhash->{$_}) eq 'ARRAY' ? - [@{$inhash->{$_}}, $attr->{$_}] : - [ $inhash->{$_}, $attr->{$_}] ) : $attr->{$_}; - } - } - } - elsif( ref($attr) ){ - $inhash = $attr; - } - else{ - #Otherwise save our content - $inhash = $content; - } - - $hash->{$tag} = exists($hash->{$tag}) ? - (ref($hash->{$tag}) eq 'ARRAY' ? - [@{$hash->{$tag}}, $inhash] : - [ $hash->{$tag}, $inhash] ) : $inhash; +sub parseSN { + my ($rr, $cr) = @_; + + my $channel = xml_content_string('header', $cr); + $channel =~ s|||gis; + clean(\$channel); + + my $ca; + my @channel_attrs = ($channel =~ m|(<.*?>.*?)|gi); + foreach $ca (@channel_attrs) { + $ca =~ m|^<(.*?)>(.*?)$|; + $rr->{$1} = trim($2); } + +## +## Alias SN to RSS terms +## + if (exists $rr->{'channelDescription'}) { + $rr->{'description'} = $rr->{'channelDescription'}; + } + if (exists $rr->{'channelTitle'}) { + $rr->{'title'} = $rr->{'channelTitle'}; + } + if (exists $rr->{'channelLink'}) { + $rr->{'link'} = $rr->{'channelLink'}; + } + + $rr->{'items'} = (); + my $item; + foreach $item (xml_content_array('item', $cr)) { + clean(\$item); + my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); + my $ia; + my %ia; + foreach $ia (@item_attrs) { + $ia =~ m|^<(.*?)>(.*?)$|; + $ia{$1} = trim($2); + } + + # Links are nested, kill prev {'link'} and rebuild attrs inside it + undef $ia{'link'}; + my @linkitems = xml_content_array('link', \$item) + or next; + + my $linkitem = $linkitems[0]; ## Usually first one is most relevant + @item_attrs = ($linkitem =~ m|(<.*?>.*?)|gi); + foreach $ia (@item_attrs) { + $ia =~ m|^<(.*?)>(.*?)$|; + $ia{$1} = trim($2); + } + + # Alias SN to RSS + if (exists $ia{'text'}) { + $ia{'description'} = $ia{'text'}; + } + if (exists $ia{'linetext'}) { + $ia{'title'} = $ia{'linetext'}; + } + if (exists $ia{'url'}) { + $ia{'link'} = $ia{'url'}; + } + push(@{$rr->{'items'}}, \%ia); + } +} + + +sub parseWL { + my ($rr, $cr) = @_; + +# my $channel = xml_content_string('header', $cr); +# $channel =~ s|||gis; +# clean(\$channel); + +# my $ca; +# my @channel_attrs = ($channel =~ m|(<.*?>.*?)|gi); +# foreach $ca (@channel_attrs) { +# $ca =~ m|^<(.*?)>(.*?)$|; +# $rr->{$1} = trim($2); +# } + +## +## Alias SN to RSS terms +## +# if (exists $rr->{'channelDescription'}) { +# $rr->{'description'} = $rr->{'channelDescription'}; +# } +# if (exists $rr->{'channelTitle'}) { +# $rr->{'title'} = $rr->{'channelTitle'}; +# } +# if (exists $rr->{'channelLink'}) { +# $rr->{'link'} = $rr->{'channelLink'}; +# } + + $rr->{'items'} = (); + my $item; + foreach $item (xml_content_array('link', $cr)) { + clean(\$item); + my @item_attrs = ($item =~ m|(<.*?>.*?)|gi); + my $ia; + my %ia; + foreach $ia (@item_attrs) { + $ia =~ m|^<(.*?)>(.*?)$|; + $ia{$1} = trim($2); + } + # Alias WL to RSS + if (exists $ia{'url'}) { + $ia{'link'} = $ia{'url'}; + } + + push(@{$rr->{'items'}}, \%ia); + } +} + +sub usableXML { + my $cref = shift; + my $content = $$cref; ## Don't change caller's content just for usability check + + preprocess(\$content); + + return 1 if isRSS(\$content); + return 2 if isRDF(\$content); + return 3 if isSN(\$content); + return 4 if isWL(\$content); + + return 0; +} + +sub isRSS { + my $cref = shift; + return scalar($$cref =~ /.*<\/rss>/is); +} + +sub isRDF { + my $cref = shift; + return scalar($$cref =~ /.*<\/rdf:RDF>/is); +} + +sub isSN { + my $cref = shift; + return scalar($$cref =~ /.*<\/scriptingnews>/is); +} + +sub isWL { + my $cref = shift; + return scalar($$cref =~ /.*<\/weblog>/is); +} + +sub xml_content_string { + my $tag = shift; + my $cref = shift; + + $$cref =~ /<${tag}.*?>(.*)<\/${tag}>/is; + return $1; +} + +sub xml_content_array { + my $tag = shift; + my $cref = shift; + my $keeptags = shift; + $keeptags = 0 unless $keeptags; + my @result; + + if ($keeptags) { + @result = ($$cref =~ /(<${tag}.*?>.*?<\/${tag}>)/gis); + } else { + @result = ($$cref =~ /<${tag}.*?>(.*?)<\/${tag}>/gis); + } + + return @result; +} + +sub clean { + my $cref = shift; + + $$cref =~ s{(\n|

|

|||||||||
|
||)}{ }gsi; +} + +sub trim { + my $s = shift; + + $s =~ s/^\s*(.*?)\s*$/$1/; + return $s; } 1; @@ -303,7 +338,7 @@ __END__ =head1 NAME -XML::RSSLite - lightweight, "relaxed" RSS (and XML-ish) parser +XML::RSSLite - Perl extension for "relaxed" RSS parsing =head1 SYNOPSIS @@ -311,14 +346,14 @@ XML::RSSLite - lightweight, "relaxed" RSS (and XML-ish) parser . . . - parseRSS(\%result, \$content); + parseXML(\%result, \$content); print "=== Channel ===\n", "Title: $result{'title'}\n", "Desc: $result{'description'}\n", "Link: $result{'link'}\n\n"; - foreach $item (@{$result{'item'}}) { + foreach $item (@{$result{'items'}}) { print " --- Item ---\n", " Title: $item->{'title'}\n", " Desc: $item->{'description'}\n", @@ -334,7 +369,7 @@ old-fashioned Perl regular expressions. It stores the data in a simple hash structure, and "aliases" certain tags so that when done, you can count on having the minimal data necessary for re-constructing a valid RSS file. This means you get the basic title, description, and link for a -channel and its items. +channel and its items. Anything else present in the hash is a bonus :) This module extracts more usable links by parsing "scriptingNews" and "weblog" formats in addition to RDF & RSS. It also "sanitizes" the @@ -362,96 +397,60 @@ output for best results. The munging includes: =over -=item parseRSS($outHashRef, $inScalarRef) +=item parseXML($outHashRef, $inScalarRef) I<$inScalarRef> is a reference to a scalar containing the document to be parsed, the contents will effectively be destroyed. I<$outHashRef> is a reference to the hash within which to store the parsed content. +=item usableXML($inScalarRef) + +Test whether or not B understands the content of the referenced +document. + =back =head2 EXPORTABLE =over -=item parseXML(\%parsedTree, \$parseThis, 'topTag', $comments); +=item isRDF($inScalarRef) -=over +Tests if a referenced document is RDF. -=item parsedTree - required +=item isRSS($inScalarRef) -Reference to hash to store the parsed document within. +Tests if a referenced document is RSS. -=item parseThis - required +=item isSN($inScalarRef) -Reference to scalar containing the document to parse. +Tests if a referenced document is scriptingNews. -=item topTag - optional +=item isWL($inScalarRef) -Tag to consider the root node, leaving this undefined is not recommended. - -=item comments - optional - -=over - -=item false will remove contents from parseThis - -=item true will not remove comments from parseThis - -=item array reference is true, comments are stored here +Tests if a referenced document is weblog. =back -=back +=head1 BUGS -=head2 CAVEATS +Sometimes the title of an item will be missed, +the condition will presist until additional items have been added +to the document. As a stop gap, when this happens the item title +is set equal to the item link. -This is not a conforming parser. It does not handle the following - -=over - -=item - - - -=item - - - -=item - - - -=item - - PI - -=back - -It's non-validating, without a DTD the following cannot be properly addressed - -=over - -=item entities - -=item namespaces - -This might be arriving in the next release. - -=back - -=back +It may take awhile for the tuits to fix this to accumulate. +feel free to submit a patch. =head1 SEE ALSO -perl(1), C, C, -C, +perl(1), C =head1 AUTHOR Jerrad Pierce . -Scott Thomason +Scott Thomason =head1 LICENSE