X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FXML%2FFeed.pm;h=db800ff8c9db587565b40805a1b0460821b3dbce;hb=c56795ef520c7281600d9d674d5c3280a56fa7d5;hp=7621bbb71ebcc01adcc3b796fe0d806ab31b03bc;hpb=ba87bd32e42464e100f90993c2e63e2a6a90afac;p=catagits%2FXML-Feed.git diff --git a/lib/XML/Feed.pm b/lib/XML/Feed.pm index 7621bbb..db800ff 100644 --- a/lib/XML/Feed.pm +++ b/lib/XML/Feed.pm @@ -1,36 +1,50 @@ -# $Id: Feed.pm,v 1.6 2004/05/30 16:59:02 btrott Exp $ +# $Id: Feed.pm 1958 2006-08-14 05:31:27Z btrott $ package XML::Feed; use strict; -use base qw( XML::Feed::ErrorHandler ); +use base qw( Class::ErrorHandler ); +use Feed::Find; +use URI::Fetch; use LWP::UserAgent; -use HTML::Parser; +use Carp; +use Module::Pluggable search_path => "XML::Feed::Format", + require => 1, + sub_name => 'formatters'; + +our $VERSION = '0.3'; +our @formatters; +BEGIN { + @formatters = __PACKAGE__->formatters; +} -use vars qw( $VERSION ); -$VERSION = '0.01'; +sub new { + my $class = shift; + my $format = shift || 'Atom'; + my $format_class = 'XML::Feed::Format::' . $format; + eval "use $format_class"; + Carp::croak("Unsupported format $format: $@") if $@; + my $feed = bless {}, join('::', __PACKAGE__, "Format", $format); + $feed->init_empty(@_) or return $class->error($feed->errstr); + $feed; +} -use constant FEED_MIME_TYPES => [ - 'application/x.atom+xml', - 'application/atom+xml', - 'text/xml', - 'application/rss+xml', - 'application/rdf+xml', -]; +sub init_empty { 1 } sub parse { my $class = shift; - my($stream) = @_; + my($stream, $specified_format) = @_; return $class->error("Stream parameter is required") unless $stream; my $feed = bless {}, $class; my $xml = ''; if (UNIVERSAL::isa($stream, 'URI')) { - my $ua = LWP::UserAgent->new; - my $req = HTTP::Request->new(GET => $stream); - my $res = $ua->request($req); - if ($res->is_success) { - $xml = $res->content; - } + my $ua = LWP::UserAgent->new; + $ua->env_proxy; # force allowing of proxies + my $res = URI::Fetch->fetch($stream, UserAgent => $ua) + or return $class->error(URI::Fetch->errstr); + return $class->error("This feed has been permanently removed") + if $res->status == URI::Fetch::URI_GONE(); + $xml = $res->content; } elsif (ref($stream) eq 'SCALAR') { $xml = $$stream; } elsif (ref($stream)) { @@ -47,70 +61,111 @@ sub parse { } return $class->error("Can't get feed XML content from $stream") unless $xml; + my $format; + if ($specified_format) { + $format = $specified_format; + } else { + $format = $feed->identify_format(\$xml) or return $class->error($feed->errstr); + } + + my $format_class = join '::', __PACKAGE__, "Format", $format; + eval "use $format_class"; + return $class->error("Unsupported format $format: $@") if $@; + bless $feed, $format_class; + $feed->init_string(\$xml) or return $class->error($feed->errstr); + $feed; +} + +sub identify_format { + my $feed = shift; + my($xml) = @_; + foreach my $class (@formatters) { + my ($name) = ($class =~ m!([^:]+)$!); + # TODO ugly + my $tmp = $$xml; + return $name if eval { $class->identify(\$tmp) }; + return $feed->error($@) if $@; + } + return $feed->error("Cannot detect feed type"); +} + +sub _get_first_tag { + my $class = shift; + my ($xml) = @_; + + ## Auto-detect feed type based on first element. This is prone ## to breakage, but then again we don't want to parse the whole ## feed ourselves. - my($tag) = $xml =~ /<([a-zA-Z]\S+)/s; - $tag =~ s/^.*://; - if ($tag eq 'rss' || $tag eq 'RDF') { - require XML::Feed::RSS; - bless $feed, 'XML::Feed::RSS'; - } elsif ($tag eq 'feed') { - require XML::Feed::Atom; - bless $feed, 'XML::Feed::Atom'; - } else { - return $class->error("Cannot detect feed type"); + my $tag; + while ($$xml =~ /<(\S+)/sg) { + (my $t = $1) =~ tr/a-zA-Z0-9:\-\?!//cd; + my $first = substr $t, 0, 1; + $tag = $t, last unless $first eq '?' || $first eq '!'; } - $feed->init_string($xml) or return; - $feed; + die ("Cannot find first element") unless $tag; + $tag =~ s/^.*://; + return $tag; } sub find_feeds { my $class = shift; my($uri) = @_; - my $ua = LWP::UserAgent->new; - my $req = HTTP::Request->new(GET => $uri); - my $res = $ua->request($req); - return unless $res->is_success; - my @feeds; - my %is_feed = map { $_ => 1 } @{ FEED_MIME_TYPES() }; - my $ct = $res->content_type; - if ($is_feed{$ct}) { - @feeds = ($uri); - } elsif ($ct eq 'text/html' || $ct eq 'application/xhtml+xml') { - my $base_uri = $uri; - my $find_links = sub { - my($tag, $attr) = @_; - if ($tag eq 'link') { - return unless $attr->{rel}; - my %rel = map { $_ => 1 } split /\s+/, lc($attr->{rel}); - (my $type = lc $attr->{type}) =~ s/^\s*//; - $type =~ s/\s*$//; - push @feeds, URI->new_abs($attr->{href}, $base_uri)->as_string - if $is_feed{$type} && - ($rel{alternate} || $rel{'service.feed'}); - } elsif ($tag eq 'base') { - $base_uri = $attr->{href}; - } - }; - my $p = HTML::Parser->new(api_version => 3, - start_h => [ $find_links, "tagname, attr" ]); - $p->parse($res->content); - } + my @feeds = Feed::Find->find($uri) + or return $class->error(Feed::Find->errstr); @feeds; } +sub convert { + my $feed = shift; + my($format) = @_; + my $new = XML::Feed->new($format); + for my $field (qw( title link description language author copyright modified generator )) { + my $val = $feed->$field(); + next unless defined $val; + $new->$field($val); + } + for my $entry ($feed->entries) { + $new->add_entry($entry->convert($format)); + } + $new; +} + +sub splice { + my $feed = shift; + my($other) = @_; + my %ids = map { $_->id => 1 } $feed->entries; + for my $entry ($other->entries) { + $feed->add_entry($entry) unless $ids{$entry->id}++; + } +} + +sub _convert_entry { + my $feed = shift; + my $entry = shift; + my $feed_format = ref($feed); $feed_format =~ s!^XML::Feed::Format::!!; + my $entry_format = ref($entry); $entry_format =~ s!^XML::Feed::Entry::Format::!!; + return $entry if $entry_format eq $feed_format; + return $entry->convert($feed_format); +} + +sub base; sub format; sub title; sub link; +sub self_link; sub description; sub language; +sub author; sub copyright; sub modified; sub generator; +sub add_entry; sub entries; +sub as_xml; +sub id; -sub tagline { $_[0]->description } +sub tagline { shift->description(@_) } sub items { $_[0]->entries } 1; @@ -164,8 +219,18 @@ I objects, which it then returns to the caller. =head1 USAGE +=head2 XML::Feed->new($format) + +Creates a new empty I object using the format I<$format>. + + $feed = XML::Feed->new('Atom'); + $feed = XML::Feed->new('RSS'); + $feed = XML::Feed->new('RSS', version => '0.91'); + =head2 XML::Feed->parse($stream) +=head2 XML::Feed->parse($stream, $format) + Parses a syndication feed identified by I<$stream>. I<$stream> can be any one of the following: @@ -189,6 +254,8 @@ A URI from which the feed XML will be retrieved. =back +I<$format> allows you to override format guessing. + =head2 XML::Feed->find_feeds($uri) Given a URI I<$uri>, use auto-discovery to find all of the feeds linked @@ -196,47 +263,149 @@ from that page (using IlinkE> tags). Returns a list of feed URIs. +=head2 XML::Feed->identify_format($xml) + +Given the xml of a feed return what format it is in (C, or some version of C). + +=head2 $feed->convert($format) + +Converts the I object into the I<$format> format, and returns +the new object. + +=head2 $feed->splice($other_feed) + +Splices in all of the entries from the feed I<$other_feed> into I<$feed>, +skipping posts that are already in I<$feed>. + =head2 $feed->format Returns the format of the feed (C, or some version of C). -=head2 $feed->title +=head2 $feed->title([ $title ]) The title of the feed/channel. -=head2 $feed->link +=head2 $feed->base([ $base ]) + +The url base of the feed/channel. + +=head2 $feed->link([ $uri ]) The permalink of the feed/channel. -=head2 $feed->tagline +=head2 $feed->tagline([ $tagline ]) The description or tagline of the feed/channel. -=head2 $feed->description +=head2 $feed->description([ $description ]) Alias for I<$feed-Etagline>. -=head2 $feed->language +=head2 $feed->author([ $author ]) + +The author of the feed/channel. + +=head2 $feed->language([ $language ]) The language of the feed. -=head2 $feed->copyright +=head2 $feed->copyright([ $copyright ]) The copyright notice of the feed. -=head2 $feed->modified +=head2 $feed->modified([ $modified ]) A I object representing the last-modified date of the feed. -=head2 $feed->generator +If present, I<$modified> should be a I object. + +=head2 $feed->generator([ $generator ]) The generator of the feed. +=head2 $feed->self_link ([ $uri ]) + +The Atom Self-link of the feed: + +L + +A string. + =head2 $feed->entries A list of the entries/items in the feed. Returns an array containing I objects. +=head2 $feed->items + +A synonym for I<$feed->entries>. + +=head2 $feed->add_entry($entry) + +Adds an entry to the feed. I<$entry> should be an I +object in the correct format for the feed. + +=head2 $feed->as_xml + +Returns an XML representation of the feed, in the format determined by +the current format of the I<$feed> object. + +=head1 PACKAGE VARIABLES + +=over 4 + +=item C<$XML::Feed::RSS::PREFERRED_PARSER> + +If you want to use another RSS parser class than XML::RSS (default), you can +change the class by setting C<$PREFERRED_PARSER> variable in XML::Feed::RSS +package. + + $XML::Feed::RSS::PREFERRED_PARSER = "XML::RSS::LibXML"; + +B this will only work for parsing feeds, not creating feeds. + +=back + +=head1 VALID FEEDS + +For reference, this cgi script will create valid, albeit nonsensical feeds +(according to C anyway) for Atom 1.0 and RSS 0.90, +0.91, 1.0 and 2.0. + + #!perl -w + + use strict; + use CGI; + use CGI::Carp qw(fatalsToBrowser); + use DateTime; + use XML::Feed; + + my $cgi = CGI->new; + my @args = ( $cgi->param('format') || "Atom" ); + push @args, ( version => $cgi->param('version') ) if $cgi->param('version'); + + my $feed = XML::Feed->new(@args); + $feed->id("http://".time.rand()."/"); + $feed->title('Test Feed'); + $feed->link($cgi->url); + $feed->self_link($cgi->url( -query => 1, -full => 1, -rewrite => 1) ); + $feed->modified(DateTime->now); + + my $entry = XML::Feed::Entry->new(); + $entry->id("http://".time.rand()."/"); + $entry->link("http://example.com"); + $entry->title("Test entry"); + $entry->summary("Test summary"); + $entry->content("Foo"); + $entry->modified(DateTime->now); + $entry->author('test@example.com (Testy McTesterson)'); + $feed->add_entry($entry); + + my $mime = ("Atom" eq $feed->format) ? "application/atom+xml" : "application/rss+xml"; + print $cgi->header($mime); + print $feed->as_xml; + + =head1 LICENSE I is free software; you may redistribute it and/or modify it @@ -244,7 +413,13 @@ under the same terms as Perl itself. =head1 AUTHOR & COPYRIGHT -Except where otherwise noted, I is Copyright 2004 Benjamin -Trott, cpan@stupidfool.org. All rights reserved. +Except where otherwise noted, I is Copyright 2004-2008 +Six Apart, cpan@sixapart.com. All rights reserved. + +=head1 SUBVERSION + +The latest version of I can be found at + + http://code.sixapart.com/svn/XML-Feed/trunk/ =cut