[catagits/XML-Feed.git] / lib / XML / Feed.pm

# $Id: Feed.pm,v 1.6 2004/05/30 16:59:02 btrott Exp $

package XML::Feed;
use strict;

use base qw( XML::Feed::ErrorHandler );
use LWP::UserAgent;
use HTML::Parser;

use vars qw( $VERSION );
$VERSION = '0.01';

## MIME types that mark a document as a syndication feed. find_feeds
## compares both the HTTP response content type and the type attribute
## of <link> elements against this list.
use constant FEED_MIME_TYPES => [ qw(
    application/x.atom+xml
    application/atom+xml
    text/xml
    application/rss+xml
    application/rdf+xml
) ];

## Parse a syndication feed and return an object blessed into the class
## for its format (XML::Feed::RSS or XML::Feed::Atom).
##
## $stream selects where the XML comes from:
##   * a URI object         - fetched over HTTP with LWP::UserAgent
##   * a scalar reference   - the referenced string is the XML
##   * any other reference  - treated as an open filehandle and read
##   * a plain string       - treated as a file name and opened for reading
##
## On failure the method returns whatever $class->error(...) returns after
## recording a message; if init_string() on the format class reports
## failure, it returns nothing.
sub parse {
    my $class = shift;
    my($stream) = @_;
    return $class->error("Stream parameter is required") unless $stream;
    my $feed = bless {}, $class;
    my $xml = '';
    if (UNIVERSAL::isa($stream, 'URI')) {
        my $ua = LWP::UserAgent->new;
        my $req = HTTP::Request->new(GET => $stream);
        my $res = $ua->request($req);
        ## Surface the HTTP failure itself instead of letting it fall
        ## through to the generic "no content" error below.
        return $class->error("Can't fetch $stream: " . $res->status_line)
            unless $res->is_success;
        $xml = $res->content;
    } elsif (ref($stream) eq 'SCALAR') {
        $xml = $$stream;
    } elsif (ref($stream)) {
        while (read($stream, my($chunk), 8192)) {
            $xml .= $chunk;
        }
    } else {
        ## Three-argument open: the name is only ever opened for reading,
        ## so characters such as '>' or '|' in it cannot select another
        ## mode or run a command.
        open my $fh, '<', $stream
            or return $class->error("Can't open $stream: $!");
        while (read $fh, my($chunk), 8192) {
            $xml .= $chunk;
        }
        close $fh;
    }
    return $class->error("Can't get feed XML content from $stream")
        unless defined $xml && length $xml;
    ## Auto-detect feed type based on first element. This is prone
    ## to breakage, but then again we don't want to parse the whole
    ## feed ourselves.
    ## The name stops at whitespace, '>' or '/', so a root element written
    ## without attributes (e.g. "<feed>") is still recognised. Requiring a
    ## leading letter skips "<?xml ...?>" and "<!...>" constructs.
    my($tag) = $xml =~ /<([a-zA-Z][^\s>\/]*)/;
    return $class->error("Cannot detect feed type") unless defined $tag;
    ## Drop any namespace prefix: "rdf:RDF" becomes "RDF".
    $tag =~ s/^.*://;
    if ($tag eq 'rss' || $tag eq 'RDF') {
        require XML::Feed::RSS;
        bless $feed, 'XML::Feed::RSS';
    } elsif ($tag eq 'feed') {
        require XML::Feed::Atom;
        bless $feed, 'XML::Feed::Atom';
    } else {
        return $class->error("Cannot detect feed type");
    }
    $feed->init_string($xml) or return;
    $feed;
}

## Auto-discover the syndication feeds advertised at $uri.
##
## Fetches $uri with LWP::UserAgent. If the response itself has a feed
## MIME type (see FEED_MIME_TYPES), $uri is the only result. If it is an
## HTML or XHTML page, every <link> element whose rel contains "alternate"
## or "service.feed" and whose type is a feed MIME type contributes its
## href, resolved against the page URI or a <base href> seen earlier.
##
## Returns the list of feed URIs; the list is empty when the request
## fails or nothing is found.
sub find_feeds {
    my $class = shift;
    my($uri) = @_;
    my $ua = LWP::UserAgent->new;
    my $req = HTTP::Request->new(GET => $uri);
    my $res = $ua->request($req);
    return unless $res->is_success;
    my @feeds;
    my %is_feed = map { $_ => 1 } @{ FEED_MIME_TYPES() };
    my $ct = $res->content_type;
    if ($is_feed{$ct}) {
        @feeds = ($uri);
    } elsif ($ct eq 'text/html' || $ct eq 'application/xhtml+xml') {
        ## URI->new_abs is called below; load URI explicitly rather than
        ## relying on another module having pulled it in.
        require URI;
        my $base_uri = $uri;
        my $find_links = sub {
            my($tag, $attr) = @_;
            if ($tag eq 'link') {
                return unless $attr->{rel};
                ## A <link> without an href would resolve to the base URI
                ## itself and be reported as a feed, so skip it.
                my $href = $attr->{href};
                return unless defined $href && length $href;
                my %rel = map { $_ => 1 } split /\s+/, lc($attr->{rel});
                my $type = lc(defined $attr->{type} ? $attr->{type} : '');
                $type =~ s/^\s+//;
                $type =~ s/\s+$//;
                push @feeds, URI->new_abs($href, $base_uri)->as_string
                   if $is_feed{$type} &&
                      ($rel{alternate} || $rel{'service.feed'});
            } elsif ($tag eq 'base') {
                ## Only an actual href changes the base; <base target=...>
                ## must not wipe out the page URI.
                my $href = $attr->{href};
                $base_uri = $href if defined $href && length $href;
            }
        };
        my $p = HTML::Parser->new(api_version => 3,
                                  start_h => [ $find_links, "tagname, attr" ]);
        $p->parse($res->content);
        ## Tell the parser the document is complete so that anything it
        ## is still buffering gets processed.
        $p->eof;
    }
    @feeds;
}

## Accessor interface of a parsed feed. These are forward declarations
## only: no bodies are defined for them in this file.
## NOTE(review): presumably XML::Feed::RSS and XML::Feed::Atom, the classes
## parse() blesses feed objects into, supply the implementations -- confirm.
sub format;
sub title;
sub link;
sub description;
sub language;
sub copyright;
sub modified;
sub generator;
sub entries;

## Alias: a feed's tagline is whatever its description method returns.
sub tagline {
    my $feed = shift;
    return $feed->description;
}
## Alias: a feed's items are whatever its entries method returns.
sub items {
    my $feed = shift;
    return $feed->entries;
}

1;
__END__

=head1 NAME

XML::Feed - Syndication feed parser and auto-discovery

=head1 SYNOPSIS

    use XML::Feed;
    my $feed = XML::Feed->parse(URI->new('http://example.com/atom.xml'))
        or die XML::Feed->errstr;
    print $feed->title, "\n";
    for my $entry ($feed->entries) {
    }

    ## Find all of the syndication feeds on a given page, using
    ## auto-discovery.
    my @feeds = XML::Feed->find_feeds('http://example.com/');

=head1 DESCRIPTION

I<XML::Feed> is a syndication feed parser for both RSS and Atom feeds. It
also implements feed auto-discovery for finding feeds, given a URI.

I<XML::Feed> supports the following syndication feed formats:

=over 4

=item * RSS 0.91

=item * RSS 1.0

=item * RSS 2.0

=item * Atom

=back

The goal of I<XML::Feed> is to provide a unified API for parsing and using
the various syndication formats. The different flavors of RSS and Atom
handle data in different ways: date handling; summaries and content;
escaping and quoting; etc. This module attempts to remove those differences
by providing a wrapper around the formats and the classes implementing
those formats (I<XML::RSS> and I<XML::Atom::Feed>). For example, dates are
handled differently in each of the above formats. To provide a unified API for
date handling, I<XML::Feed> converts all date formats transparently into
I<DateTime> objects, which it then returns to the caller.

=head1 USAGE

=head2 XML::Feed->parse($stream)

Parses a syndication feed identified by I<$stream>. I<$stream> can be any
one of the following:

=over 4

=item * Scalar reference

A reference to a string containing the XML body of the feed.

=item * Filehandle

An open filehandle from which the feed XML will be read.

=item * File name

The name of a file containing the feed XML.

=item * URI object

A URI from which the feed XML will be retrieved.

=back

=head2 XML::Feed->find_feeds($uri)

Given a URI I<$uri>, use auto-discovery to find all of the feeds linked
from that page (using I<E<lt>linkE<gt>> tags).

Returns a list of feed URIs.

=head2 $feed->format

Returns the format of the feed (C<Atom>, or some version of C<RSS>).

=head2 $feed->title

The title of the feed/channel.

=head2 $feed->link

The permalink of the feed/channel.

=head2 $feed->tagline

The description or tagline of the feed/channel.

=head2 $feed->description

Alias for I<$feed-E<gt>tagline>.

=head2 $feed->language

The language of the feed.

=head2 $feed->copyright

The copyright notice of the feed.

=head2 $feed->modified

A I<DateTime> object representing the last-modified date of the feed.

=head2 $feed->generator

The generator of the feed.

=head2 $feed->entries

A list of the entries/items in the feed. Returns a list of
I<XML::Feed::Entry> objects.

=head1 LICENSE

I<XML::Feed> is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.

=head1 AUTHOR & COPYRIGHT

Except where otherwise noted, I<XML::Feed> is Copyright 2004 Benjamin
Trott, cpan@stupidfool.org. All rights reserved.

=cut
Commit	Line	Data
0d5e38d1	1	# $Id: Feed.pm,v 1.6 2004/05/30 16:59:02 btrott Exp $
	2
	3	package XML::Feed;
	4	use strict;
	5
	6	use base qw( XML::Feed::ErrorHandler );
	7	use LWP::UserAgent;
	8	use HTML::Parser;
	9
	10	use vars qw( $VERSION );
	11	$VERSION = '0.01';
	12
	13	use constant FEED_MIME_TYPES => [
	14	'application/x.atom+xml',
	15	'application/atom+xml',
	16	'text/xml',
	17	'application/rss+xml',
	18	'application/rdf+xml',
	19	];
	20
	21	sub parse {
	22	my $class = shift;
	23	my($stream) = @_;
	24	return $class->error("Stream parameter is required") unless $stream;
	25	my $feed = bless {}, $class;
	26	my $xml = '';
	27	if (UNIVERSAL::isa($stream, 'URI')) {
	28	my $ua = LWP::UserAgent->new;
	29	my $req = HTTP::Request->new(GET => $stream);
	30	my $res = $ua->request($req);
	31	if ($res->is_success) {
	32	$xml = $res->content;
	33	}
	34	} elsif (ref($stream) eq 'SCALAR') {
	35	$xml = $$stream;
	36	} elsif (ref($stream)) {
	37	while (read($stream, my($chunk), 8192)) {
	38	$xml .= $chunk;
	39	}
	40	} else {
	41	open my $fh, $stream
	42	or return $class->error("Can't open $stream: $!");
	43	while (read $fh, my($chunk), 8192) {
	44	$xml .= $chunk;
	45	}
	46	close $fh;
	47	}
	48	return $class->error("Can't get feed XML content from $stream")
	49	unless $xml;
	50	## Auto-detect feed type based on first element. This is prone
	51	## to breakage, but then again we don't want to parse the whole
	52	## feed ourselves.
	53	my($tag) = $xml =~ /<([a-zA-Z]\S+)/s;
	54	$tag =~ s/^.*://;
	55	if ($tag eq 'rss' \|\| $tag eq 'RDF') {
	56	require XML::Feed::RSS;
	57	bless $feed, 'XML::Feed::RSS';
	58	} elsif ($tag eq 'feed') {
	59	require XML::Feed::Atom;
	60	bless $feed, 'XML::Feed::Atom';
	61	} else {
	62	return $class->error("Cannot detect feed type");
	63	}
	64	$feed->init_string($xml) or return;
65	$feed;
66	}
67
68	sub find_feeds {
69	my $class = shift;
70	my($uri) = @_;
71	my $ua = LWP::UserAgent->new;
72	my $req = HTTP::Request->new(GET => $uri);
73	my $res = $ua->request($req);
74	return unless $res->is_success;
75	my @feeds;
76	my %is_feed = map { $_ => 1 } @{ FEED_MIME_TYPES() };
77	my $ct = $res->content_type;
78	if ($is_feed{$ct}) {
79	@feeds = ($uri);
80	} elsif ($ct eq 'text/html' \|\| $ct eq 'application/xhtml+xml') {
81	my $base_uri = $uri;
82	my $find_links = sub {
83	my($tag, $attr) = @_;
84	if ($tag eq 'link') {
85	return unless $attr->{rel};
86	my %rel = map { $_ => 1 } split /\s+/, lc($attr->{rel});
87	(my $type = lc $attr->{type}) =~ s/^\s*//;
88	$type =~ s/\s*$//;
89	push @feeds, URI->new_abs($attr->{href}, $base_uri)->as_string
90	if $is_feed{$type} &&
91	($rel{alternate} \|\| $rel{'service.feed'});
92	} elsif ($tag eq 'base') {
93	$base_uri = $attr->{href};
94	}
95	};
96	my $p = HTML::Parser->new(api_version => 3,
97	start_h => [ $find_links, "tagname, attr" ]);
98	$p->parse($res->content);
99	}
100	@feeds;
101	}
102
103	sub format;
104	sub title;
105	sub link;
106	sub description;
107	sub language;
108	sub copyright;
109	sub modified;
110	sub generator;
111	sub entries;
112
113	sub tagline { $_[0]->description }
114	sub items { $_[0]->entries }
115
116	1;
117	__END__
118
119	=head1 NAME
120
121	XML::Feed - Syndication feed parser and auto-discovery
122
123	=head1 SYNOPSIS
124
125	use XML::Feed;
126	my $feed = XML::Feed->parse(URI->new('http://example.com/atom.xml'))
127	or die XML::Feed->errstr;
128	print $feed->title, "\n";
129	for my $entry ($feed->entries) {
130	}
131
132	## Find all of the syndication feeds on a given page, using
133	## auto-discovery.
134	my @feeds = XML::Feed->find_feeds('http://example.com/');
135
136	=head1 DESCRIPTION
137
138	I<XML::Feed> is a syndication feed parser for both RSS and Atom feeds. It
139	also implements feed auto-discovery for finding feeds, given a URI.
140
141	I<XML::Feed> supports the following syndication feed formats:
142
143	=over 4
144
145	=item * RSS 0.91
146
147	=item * RSS 1.0
148
149	=item * RSS 2.0
150
151	=item * Atom
152
153	=back
154
155	The goal of I<XML::Feed> is to provide a unified API for parsing and using
156	the various syndication formats. The different flavors of RSS and Atom
157	handle data in different ways: date handling; summaries and content;
158	escaping and quoting; etc. This module attempts to remove those differences
159	by providing a wrapper around the formats and the classes implementing
160	those formats (I<XML::RSS> and I<XML::Atom::Feed>). For example, dates are
161	handled differently in each of the above formats. To provide a unified API for
162	date handling, I<XML::Feed> converts all date formats transparently into
163	I<DateTime> objects, which it then returns to the caller.
164
165	=head1 USAGE
166
167	=head2 XML::Feed->parse($stream)
168
169	Parses a syndication feed identified by I<$stream>. I<$stream> can be any
170	one of the following:
171
172	=over 4
173
174	=item * Scalar reference
175
176	A reference to string containing the XML body of the feed.
177
178	=item * Filehandle
179
180	An open filehandle from which the feed XML will be read.
181
182	=item * File name
183
184	The name of a file containing the feed XML.
185
186	=item * URI object
187
188	A URI from which the feed XML will be retrieved.
189
190	=back
191
192	=head2 XML::Feed->find_feeds($uri)
193
194	Given a URI I<$uri>, use auto-discovery to find all of the feeds linked
195	from that page (using I<E<lt>linkE<gt>> tags).
196
197	Returns a list of feed URIs.
198
199	=head2 $feed->format
200
201	Returns the format of the feed (C<Atom>, or some version of C<RSS>).
202
203	=head2 $feed->title
204
205	The title of the feed/channel.
206
207	=head2 $feed->link
208
209	The permalink of the feed/channel.
210
211	=head2 $feed->tagline
212
213	The description or tagline of the feed/channel.
214
215	=head2 $feed->description
216
217	Alias for I<$feed-E<gt>tagline>.
218
219	=head2 $feed->language
220
221	The language of the feed.
222
223	=head2 $feed->copyright
224
225	The copyright notice of the feed.
226
227	=head2 $feed->modified
228
229	A I<DateTime> object representing the last-modified date of the feed.
230
231	=head2 $feed->generator
232
233	The generator of the feed.
234
235	=head2 $feed->entries
236
237	A list of the entries/items in the feed. Returns an array containing
238	I<XML::Feed::Entry> objects.
239
240	=head1 LICENSE
241
242	I<XML::Feed> is free software; you may redistribute it and/or modify it
243	under the same terms as Perl itself.
244
245	=head1 AUTHOR & COPYRIGHT
246
247	Except where otherwise noted, I<XML::Feed> is Copyright 2004 Benjamin
248	Trott, cpan@stupidfool.org. All rights reserved.
249
250	=cut