Commit | Line | Data |
0d5e38d1 |
1 | # $Id: Feed.pm,v 1.6 2004/05/30 16:59:02 btrott Exp $ |
2 | |
3 | package XML::Feed; |
4 | use strict; |
5 | |
6 | use base qw( XML::Feed::ErrorHandler ); |
7 | use LWP::UserAgent; |
8 | use HTML::Parser; |
9 | |
10 | use vars qw( $VERSION ); |
11 | $VERSION = '0.01'; |
12 | |
## Media types that find_feeds() accepts as syndication feeds, both for the
## Content-Type of a fetched resource and for the type attribute of a
## <link> element.
use constant FEED_MIME_TYPES => [ qw(
    application/x.atom+xml
    application/atom+xml
    text/xml
    application/rss+xml
    application/rdf+xml
) ];
20 | |
## Parse a syndication feed and return an object blessed into the class
## matching its format (XML::Feed::RSS or XML::Feed::Atom).
##
## $stream selects where the XML comes from:
##   * a URI object        - fetched with LWP::UserAgent
##   * a scalar reference  - dereferenced and used as the XML text
##   * any other reference - treated as a filehandle and read to EOF
##   * a plain string      - treated as the name of a file to read
##
## Failures are reported through $class->error(...), whose return value is
## passed back to the caller: missing $stream, a file that cannot be opened,
## no XML content obtained (this includes an unsuccessful HTTP response), or
## a root element that is not rss, RDF or feed. If init_string() on the
## format object returns false, parse() returns empty.
sub parse {
    my $class = shift;
    my($stream) = @_;
    return $class->error("Stream parameter is required") unless $stream;
    my $feed = bless {}, $class;
    my $xml = '';
    if (UNIVERSAL::isa($stream, 'URI')) {
        my $ua = LWP::UserAgent->new;
        my $req = HTTP::Request->new(GET => $stream);
        my $res = $ua->request($req);
        if ($res->is_success) {
            $xml = $res->content;
        }
    } elsif (ref($stream) eq 'SCALAR') {
        $xml = $$stream;
    } elsif (ref($stream)) {
        while (read($stream, my($chunk), 8192)) {
            $xml .= $chunk;
        }
    } else {
        ## Three-argument open: the file name is only ever a file name, so
        ## leading/trailing characters such as '>' or '|' cannot turn the
        ## call into a write or a command pipe.
        open my $fh, '<', $stream
            or return $class->error("Can't open $stream: $!");
        while (read $fh, my($chunk), 8192) {
            $xml .= $chunk;
        }
        close $fh;
    }
    return $class->error("Can't get feed XML content from $stream")
        unless $xml;
    ## Auto-detect feed type based on first element. This is prone
    ## to breakage, but then again we don't want to parse the whole
    ## feed ourselves.
    ## The name stops at whitespace, '>' or '/', so a root element written
    ## without attributes (e.g. "<rss>") is still recognised.
    my($tag) = $xml =~ /<([a-zA-Z][^\s>\/]*)/;
    return $class->error("Cannot detect feed type") unless defined $tag;
    ## Drop any namespace prefix ("rdf:RDF" -> "RDF").
    $tag =~ s/^.*://;
    if ($tag eq 'rss' || $tag eq 'RDF') {
        require XML::Feed::RSS;
        bless $feed, 'XML::Feed::RSS';
    } elsif ($tag eq 'feed') {
        require XML::Feed::Atom;
        bless $feed, 'XML::Feed::Atom';
    } else {
        return $class->error("Cannot detect feed type");
    }
    $feed->init_string($xml) or return;
    $feed;
}
67 | |
## Feed auto-discovery: fetch $uri and return the feed URIs found there.
##
## If the response's content type is one of FEED_MIME_TYPES, $uri itself is
## the only result. If it is text/html or application/xhtml+xml, the document
## is scanned for <link> elements whose rel contains "alternate" or
## "service.feed" and whose type is one of FEED_MIME_TYPES; each href is
## resolved against the page's base URI (the most recent <base href>, or
## $uri when there is none) and returned as a string.
##
## Returns empty when the HTTP request is unsuccessful, otherwise the
## (possibly empty) list of feeds.
sub find_feeds {
    my $class = shift;
    my($uri) = @_;
    my $ua = LWP::UserAgent->new;
    my $req = HTTP::Request->new(GET => $uri);
    my $res = $ua->request($req);
    return unless $res->is_success;
    my @feeds;
    my %is_feed = map { $_ => 1 } @{ FEED_MIME_TYPES() };
    my $ct = $res->content_type;
    if ($is_feed{$ct}) {
        @feeds = ($uri);
    } elsif ($ct eq 'text/html' || $ct eq 'application/xhtml+xml') {
        my $base_uri = $uri;
        my $find_links = sub {
            my($tag, $attr) = @_;
            if ($tag eq 'link') {
                ## A link with no rel or no href cannot name a feed.
                return unless $attr->{rel} && defined $attr->{href};
                my %rel = map { $_ => 1 } split /\s+/, lc($attr->{rel});
                my $type = lc(defined $attr->{type} ? $attr->{type} : '');
                $type =~ s/^\s+//;
                $type =~ s/\s+$//;
                push @feeds, URI->new_abs($attr->{href}, $base_uri)->as_string
                    if $is_feed{$type} &&
                       ($rel{alternate} || $rel{'service.feed'});
            } elsif ($tag eq 'base') {
                ## Ignore <base> without an href (e.g. target-only), so the
                ## base URI is never replaced by an undefined value.
                $base_uri = $attr->{href}
                    if defined $attr->{href} && length $attr->{href};
            }
        };
        my $p = HTML::Parser->new(api_version => 3,
                                  start_h => [ $find_links, "tagname, attr" ]);
        $p->parse($res->content);
        ## Signal end of document so any buffered input is processed.
        $p->eof;
    }
    @feeds;
}
102 | |
## Forward declarations only: no bodies are defined for these in this file.
## NOTE(review): presumably implemented by the format classes that parse()
## blesses the feed into (XML::Feed::RSS / XML::Feed::Atom) -- confirm.
sub format;
sub title;
sub link;
sub description;
sub language;
sub copyright;
sub modified;
sub generator;
sub entries;

## tagline: delegates to the description method of the invocant.
sub tagline {
    my $feed = shift;
    return $feed->description;
}

## items: delegates to the entries method of the invocant.
sub items {
    my $feed = shift;
    return $feed->entries;
}
115 | |
116 | 1; |
117 | __END__ |
118 | |
119 | =head1 NAME |
120 | |
121 | XML::Feed - Syndication feed parser and auto-discovery |
122 | |
123 | =head1 SYNOPSIS |
124 | |
125 | use XML::Feed; |
126 | my $feed = XML::Feed->parse(URI->new('http://example.com/atom.xml')) |
127 | or die XML::Feed->errstr; |
128 | print $feed->title, "\n"; |
129 | for my $entry ($feed->entries) { |
130 | } |
131 | |
132 | ## Find all of the syndication feeds on a given page, using |
133 | ## auto-discovery. |
134 | my @feeds = XML::Feed->find_feeds('http://example.com/'); |
135 | |
136 | =head1 DESCRIPTION |
137 | |
138 | I<XML::Feed> is a syndication feed parser for both RSS and Atom feeds. It |
139 | also implements feed auto-discovery for finding feeds, given a URI. |
140 | |
141 | I<XML::Feed> supports the following syndication feed formats: |
142 | |
143 | =over 4 |
144 | |
145 | =item * RSS 0.91 |
146 | |
147 | =item * RSS 1.0 |
148 | |
149 | =item * RSS 2.0 |
150 | |
151 | =item * Atom |
152 | |
153 | =back |
154 | |
155 | The goal of I<XML::Feed> is to provide a unified API for parsing and using |
156 | the various syndication formats. The different flavors of RSS and Atom |
157 | handle data in different ways: date handling; summaries and content; |
158 | escaping and quoting; etc. This module attempts to remove those differences |
159 | by providing a wrapper around the formats and the classes implementing |
160 | those formats (I<XML::RSS> and I<XML::Atom::Feed>). For example, dates are |
161 | handled differently in each of the above formats. To provide a unified API for |
162 | date handling, I<XML::Feed> converts all date formats transparently into |
163 | I<DateTime> objects, which it then returns to the caller. |
164 | |
165 | =head1 USAGE |
166 | |
167 | =head2 XML::Feed->parse($stream) |
168 | |
169 | Parses a syndication feed identified by I<$stream>. I<$stream> can be any |
170 | one of the following: |
171 | |
172 | =over 4 |
173 | |
174 | =item * Scalar reference |
175 | |
176 | A reference to a string containing the XML body of the feed. |
177 | |
178 | =item * Filehandle |
179 | |
180 | An open filehandle from which the feed XML will be read. |
181 | |
182 | =item * File name |
183 | |
184 | The name of a file containing the feed XML. |
185 | |
186 | =item * URI object |
187 | |
188 | A URI from which the feed XML will be retrieved. |
189 | |
190 | =back |
191 | |
192 | =head2 XML::Feed->find_feeds($uri) |
193 | |
194 | Given a URI I<$uri>, use auto-discovery to find all of the feeds linked |
195 | from that page (using I<E<lt>linkE<gt>> tags). |
196 | |
197 | Returns a list of feed URIs. |
198 | |
199 | =head2 $feed->format |
200 | |
201 | Returns the format of the feed (C<Atom>, or some version of C<RSS>). |
202 | |
203 | =head2 $feed->title |
204 | |
205 | The title of the feed/channel. |
206 | |
207 | =head2 $feed->link |
208 | |
209 | The permalink of the feed/channel. |
210 | |
211 | =head2 $feed->tagline |
212 | |
213 | The description or tagline of the feed/channel. |
214 | |
215 | =head2 $feed->description |
216 | |
217 | Alias for I<$feed-E<gt>tagline>. |
218 | |
219 | =head2 $feed->language |
220 | |
221 | The language of the feed. |
222 | |
223 | =head2 $feed->copyright |
224 | |
225 | The copyright notice of the feed. |
226 | |
227 | =head2 $feed->modified |
228 | |
229 | A I<DateTime> object representing the last-modified date of the feed. |
230 | |
231 | =head2 $feed->generator |
232 | |
233 | The generator of the feed. |
234 | |
235 | =head2 $feed->entries |
236 | |
237 | A list of the entries/items in the feed. Returns an array containing |
238 | I<XML::Feed::Entry> objects. |
239 | |
240 | =head1 LICENSE |
241 | |
242 | I<XML::Feed> is free software; you may redistribute it and/or modify it |
243 | under the same terms as Perl itself. |
244 | |
245 | =head1 AUTHOR & COPYRIGHT |
246 | |
247 | Except where otherwise noted, I<XML::Feed> is Copyright 2004 Benjamin |
248 | Trott, cpan@stupidfool.org. All rights reserved. |
249 | |
250 | =cut |