Commit | Line | Data |
0d5e38d1 |
1 | package XML::Feed; |
2 | use strict; |
3 | |
62d92771 |
4 | use base qw( Class::ErrorHandler ); |
62d92771 |
5 | use Feed::Find; |
fe71566d |
6 | use URI::Fetch; |
d39809aa |
7 | use LWP::UserAgent; |
973e1f9e |
8 | use Carp; |
9b6bc912 |
9 | use Module::Pluggable search_path => "XML::Feed::Format", |
10 | require => 1, |
11 | sub_name => 'formatters'; |
0d5e38d1 |
12 | |
43f04277 |
13 | our $VERSION = '0.51'; |
b1aa7a62 |
14 | our $MULTIPLE_ENCLOSURES = 0; |
9b6bc912 |
15 | our @formatters; |
16 | BEGIN { |
17 | @formatters = __PACKAGE__->formatters; |
18 | } |
973e1f9e |
19 | |
20 | sub new { |
21 | my $class = shift; |
4e9c4625 |
22 | my $format = shift || 'Atom'; |
729cd7a8 |
23 | my $format_class = 'XML::Feed::Format::' . $format; |
973e1f9e |
24 | eval "use $format_class"; |
25 | Carp::croak("Unsupported format $format: $@") if $@; |
729cd7a8 |
26 | my $feed = bless {}, join('::', __PACKAGE__, "Format", $format); |
4e9c4625 |
27 | $feed->init_empty(@_) or return $class->error($feed->errstr); |
973e1f9e |
28 | $feed; |
29 | } |
30 | |
31 | sub init_empty { 1 } |
0d5e38d1 |
32 | |
33 | sub parse { |
34 | my $class = shift; |
41e8c132 |
35 | my($stream, $specified_format) = @_; |
0d5e38d1 |
36 | return $class->error("Stream parameter is required") unless $stream; |
37 | my $feed = bless {}, $class; |
38 | my $xml = ''; |
39 | if (UNIVERSAL::isa($stream, 'URI')) { |
d39809aa |
40 | my $ua = LWP::UserAgent->new; |
eeff6c3c |
41 | $ua->agent(__PACKAGE__ . "/$VERSION"); |
d39809aa |
42 | $ua->env_proxy; # force allowing of proxies |
43 | my $res = URI::Fetch->fetch($stream, UserAgent => $ua) |
fe71566d |
44 | or return $class->error(URI::Fetch->errstr); |
45 | return $class->error("This feed has been permanently removed") |
b3b6d2fc |
46 | if $res->status == URI::Fetch::URI_GONE(); |
fe71566d |
47 | $xml = $res->content; |
0d5e38d1 |
48 | } elsif (ref($stream) eq 'SCALAR') { |
49 | $xml = $$stream; |
50 | } elsif (ref($stream)) { |
51 | while (read($stream, my($chunk), 8192)) { |
52 | $xml .= $chunk; |
53 | } |
54 | } else { |
55 | open my $fh, $stream |
56 | or return $class->error("Can't open $stream: $!"); |
57 | while (read $fh, my($chunk), 8192) { |
58 | $xml .= $chunk; |
59 | } |
60 | close $fh; |
61 | } |
62 | return $class->error("Can't get feed XML content from $stream") |
63 | unless $xml; |
41e8c132 |
64 | my $format; |
65 | if ($specified_format) { |
66 | $format = $specified_format; |
67 | } else { |
0c81cb25 |
68 | $format = $feed->identify_format(\$xml) or return $class->error($feed->errstr); |
41e8c132 |
69 | } |
70 | |
729cd7a8 |
71 | my $format_class = join '::', __PACKAGE__, "Format", $format; |
fe71566d |
72 | eval "use $format_class"; |
73 | return $class->error("Unsupported format $format: $@") if $@; |
74 | bless $feed, $format_class; |
75 | $feed->init_string(\$xml) or return $class->error($feed->errstr); |
76 | $feed; |
77 | } |
78 | |
79 | sub identify_format { |
9b6bc912 |
80 | my $feed = shift; |
81 | my($xml) = @_; |
82 | foreach my $class (@formatters) { |
83 | my ($name) = ($class =~ m!([^:]+)$!); |
84 | # TODO ugly |
85 | my $tmp = $$xml; |
86 | return $name if eval { $class->identify(\$tmp) }; |
87 | return $feed->error($@) if $@; |
88 | } |
89 | return $feed->error("Cannot detect feed type"); |
90 | } |
91 | |
92 | sub _get_first_tag { |
93 | my $class = shift; |
94 | my ($xml) = @_; |
95 | |
96 | |
0d5e38d1 |
97 | ## Auto-detect feed type based on first element. This is prone |
98 | ## to breakage, but then again we don't want to parse the whole |
99 | ## feed ourselves. |
a749d9b9 |
100 | my $tag; |
fe71566d |
101 | while ($$xml =~ /<(\S+)/sg) { |
62d92771 |
102 | (my $t = $1) =~ tr/a-zA-Z0-9:\-\?!//cd; |
103 | my $first = substr $t, 0, 1; |
104 | $tag = $t, last unless $first eq '?' || $first eq '!'; |
a749d9b9 |
105 | } |
9b6bc912 |
106 | die ("Cannot find first element") unless $tag; |
0d5e38d1 |
107 | $tag =~ s/^.*://; |
9b6bc912 |
108 | return $tag; |
0d5e38d1 |
109 | } |
110 | |
111 | sub find_feeds { |
112 | my $class = shift; |
113 | my($uri) = @_; |
62d92771 |
114 | my @feeds = Feed::Find->find($uri) |
115 | or return $class->error(Feed::Find->errstr); |
0d5e38d1 |
116 | @feeds; |
117 | } |
118 | |
973e1f9e |
119 | sub convert { |
120 | my $feed = shift; |
121 | my($format) = @_; |
729cd7a8 |
122 | my $new = XML::Feed->new($format); |
ecac864a |
123 | for my $field (qw( title link description language author copyright modified generator )) { |
23103173 |
124 | my $val = $feed->$field(); |
125 | next unless defined $val; |
126 | $new->$field($val); |
973e1f9e |
127 | } |
128 | for my $entry ($feed->entries) { |
129 | $new->add_entry($entry->convert($format)); |
130 | } |
131 | $new; |
132 | } |
133 | |
23103173 |
134 | sub splice { |
135 | my $feed = shift; |
136 | my($other) = @_; |
137 | my %ids = map { $_->id => 1 } $feed->entries; |
138 | for my $entry ($other->entries) { |
139 | $feed->add_entry($entry) unless $ids{$entry->id}++; |
140 | } |
141 | } |
142 | |
33d4cb3f |
143 | sub _convert_entry { |
144 | my $feed = shift; |
145 | my $entry = shift; |
729cd7a8 |
146 | my $feed_format = ref($feed); $feed_format =~ s!^XML::Feed::Format::!!; |
147 | my $entry_format = ref($entry); $entry_format =~ s!^XML::Feed::Entry::Format::!!; |
33d4cb3f |
148 | return $entry if $entry_format eq $feed_format; |
149 | return $entry->convert($feed_format); |
150 | } |
151 | |
5383a560 |
152 | sub base; |
0d5e38d1 |
153 | sub format; |
154 | sub title; |
155 | sub link; |
9a36f82c |
156 | sub self_link; |
0d5e38d1 |
157 | sub description; |
158 | sub language; |
973e1f9e |
159 | sub author; |
0d5e38d1 |
160 | sub copyright; |
161 | sub modified; |
162 | sub generator; |
973e1f9e |
163 | sub add_entry; |
0d5e38d1 |
164 | sub entries; |
973e1f9e |
165 | sub as_xml; |
813f78d8 |
166 | sub id; |
f2512641 |
167 | sub image; |
0d5e38d1 |
168 | |
973e1f9e |
169 | sub tagline { shift->description(@_) } |
0d5e38d1 |
170 | sub items { $_[0]->entries } |
171 | |
02de1790 |
172 | # RFC 5005 |
173 | sub first_link; |
174 | sub last_link; |
175 | sub previous_link; |
176 | sub next_link; |
177 | sub current_link; |
178 | sub prev_archive_link; |
179 | sub next_archive_link; |
180 | |
0d5e38d1 |
181 | 1; |
182 | __END__ |
183 | |
184 | =head1 NAME |
185 | |
186 | XML::Feed - Syndication feed parser and auto-discovery |
187 | |
188 | =head1 SYNOPSIS |
189 | |
190 | use XML::Feed; |
191 | my $feed = XML::Feed->parse(URI->new('http://example.com/atom.xml')) |
192 | or die XML::Feed->errstr; |
193 | print $feed->title, "\n"; |
194 | for my $entry ($feed->entries) { |
195 | } |
196 | |
197 | ## Find all of the syndication feeds on a given page, using |
198 | ## auto-discovery. |
199 | my @feeds = XML::Feed->find_feeds('http://example.com/'); |
200 | |
201 | =head1 DESCRIPTION |
202 | |
203 | I<XML::Feed> is a syndication feed parser for both RSS and Atom feeds. It |
204 | also implements feed auto-discovery for finding feeds, given a URI. |
205 | |
206 | I<XML::Feed> supports the following syndication feed formats: |
207 | |
208 | =over 4 |
209 | |
210 | =item * RSS 0.91 |
211 | |
212 | =item * RSS 1.0 |
213 | |
214 | =item * RSS 2.0 |
215 | |
216 | =item * Atom |
217 | |
218 | =back |
219 | |
220 | The goal of I<XML::Feed> is to provide a unified API for parsing and using |
221 | the various syndication formats. The different flavors of RSS and Atom |
222 | handle data in different ways: date handling; summaries and content; |
223 | escaping and quoting; etc. This module attempts to remove those differences |
224 | by providing a wrapper around the formats and the classes implementing |
200ce863 |
225 | those formats (L<XML::RSS> and L<XML::Atom::Feed>). For example, dates are |
0d5e38d1 |
226 | handled differently in each of the above formats. To provide a unified API for |
227 | date handling, I<XML::Feed> converts all date formats transparently into |
200ce863 |
228 | L<DateTime> objects, which it then returns to the caller. |
0d5e38d1 |
229 | |
230 | =head1 USAGE |
231 | |
973e1f9e |
232 | =head2 XML::Feed->new($format) |
233 | |
234 | Creates a new empty I<XML::Feed> object using the format I<$format>. |
235 | |
813f78d8 |
236 | $feed = XML::Feed->new('Atom'); |
237 | $feed = XML::Feed->new('RSS'); |
238 | $feed = XML::Feed->new('RSS', version => '0.91'); |
239 | |
0d5e38d1 |
240 | =head2 XML::Feed->parse($stream) |
241 | |
41e8c132 |
242 | =head2 XML::Feed->parse($stream, $format) |
243 | |
200ce863 |
244 | Parses a syndication feed identified by I<$stream> and returns an |
245 | I<XML::Feed> obhect. I<$stream> can be any |
0d5e38d1 |
246 | one of the following: |
247 | |
248 | =over 4 |
249 | |
250 | =item * Scalar reference |
251 | |
252 | A reference to string containing the XML body of the feed. |
253 | |
254 | =item * Filehandle |
255 | |
256 | An open filehandle from which the feed XML will be read. |
257 | |
258 | =item * File name |
259 | |
260 | The name of a file containing the feed XML. |
261 | |
262 | =item * URI object |
263 | |
264 | A URI from which the feed XML will be retrieved. |
265 | |
266 | =back |
267 | |
8c30ad3d |
268 | I<$format> allows you to override format guessing. |
41e8c132 |
269 | |
0d5e38d1 |
270 | =head2 XML::Feed->find_feeds($uri) |
271 | |
272 | Given a URI I<$uri>, use auto-discovery to find all of the feeds linked |
273 | from that page (using I<E<lt>linkE<gt>> tags). |
274 | |
275 | Returns a list of feed URIs. |
276 | |
8c30ad3d |
277 | =head2 XML::Feed->identify_format($xml) |
278 | |
279 | Given the xml of a feed return what format it is in (C<Atom>, or some version of C<RSS>). |
280 | |
973e1f9e |
281 | =head2 $feed->convert($format) |
282 | |
283 | Converts the I<XML::Feed> object into the I<$format> format, and returns |
284 | the new object. |
285 | |
23103173 |
286 | =head2 $feed->splice($other_feed) |
287 | |
288 | Splices in all of the entries from the feed I<$other_feed> into I<$feed>, |
289 | skipping posts that are already in I<$feed>. |
290 | |
0d5e38d1 |
291 | =head2 $feed->format |
292 | |
293 | Returns the format of the feed (C<Atom>, or some version of C<RSS>). |
294 | |
973e1f9e |
295 | =head2 $feed->title([ $title ]) |
0d5e38d1 |
296 | |
297 | The title of the feed/channel. |
298 | |
5383a560 |
299 | =head2 $feed->base([ $base ]) |
300 | |
301 | The url base of the feed/channel. |
302 | |
973e1f9e |
303 | =head2 $feed->link([ $uri ]) |
0d5e38d1 |
304 | |
305 | The permalink of the feed/channel. |
306 | |
973e1f9e |
307 | =head2 $feed->tagline([ $tagline ]) |
0d5e38d1 |
308 | |
309 | The description or tagline of the feed/channel. |
310 | |
973e1f9e |
311 | =head2 $feed->description([ $description ]) |
0d5e38d1 |
312 | |
313 | Alias for I<$feed-E<gt>tagline>. |
314 | |
973e1f9e |
315 | =head2 $feed->author([ $author ]) |
316 | |
317 | The author of the feed/channel. |
318 | |
319 | =head2 $feed->language([ $language ]) |
0d5e38d1 |
320 | |
321 | The language of the feed. |
322 | |
973e1f9e |
323 | =head2 $feed->copyright([ $copyright ]) |
0d5e38d1 |
324 | |
325 | The copyright notice of the feed. |
326 | |
973e1f9e |
327 | =head2 $feed->modified([ $modified ]) |
0d5e38d1 |
328 | |
329 | A I<DateTime> object representing the last-modified date of the feed. |
330 | |
973e1f9e |
331 | If present, I<$modified> should be a I<DateTime> object. |
332 | |
333 | =head2 $feed->generator([ $generator ]) |
0d5e38d1 |
334 | |
335 | The generator of the feed. |
336 | |
9a36f82c |
337 | =head2 $feed->self_link ([ $uri ]) |
338 | |
339 | The Atom Self-link of the feed: |
340 | |
341 | L<http://validator.w3.org/feed/docs/warning/MissingAtomSelfLink.html> |
342 | |
343 | A string. |
344 | |
0d5e38d1 |
345 | =head2 $feed->entries |
346 | |
347 | A list of the entries/items in the feed. Returns an array containing |
200ce863 |
348 | L<XML::Feed::Entry> objects. |
0d5e38d1 |
349 | |
8c30ad3d |
350 | =head2 $feed->items |
351 | |
200ce863 |
352 | A synonym (alias) for <$feed-E<gt>entries>. |
8c30ad3d |
353 | |
973e1f9e |
354 | =head2 $feed->add_entry($entry) |
355 | |
200ce863 |
356 | Adds an entry to the feed. I<$entry> should be an L<XML::Feed::Entry> |
973e1f9e |
357 | object in the correct format for the feed. |
358 | |
359 | =head2 $feed->as_xml |
360 | |
361 | Returns an XML representation of the feed, in the format determined by |
362 | the current format of the I<$feed> object. |
363 | |
02de1790 |
364 | =head2 $feed->first_link ([ $uri ]) |
365 | |
366 | The Atom First-link for feed paging and archiving (RFC 5005). |
367 | |
368 | L<http://tools.ietf.org/html/rfc5005> |
369 | |
370 | =head2 $feed->last_link ([ $uri ]) |
371 | |
372 | The Atom Last-link for feed paging and archiving. |
373 | |
374 | =head2 $feed->next_link ([ $uri ]) |
375 | |
376 | The Atom Next-link for feed paging and archiving. |
377 | |
378 | =head2 $feed->previous_link ([ $uri ]) |
379 | |
380 | The Atom Previous-link for feed paging and archiving. |
381 | |
382 | =head2 $feed->current_link ([ $uri ]) |
383 | |
384 | The Atom Current-link for feed paging and archiving. |
385 | |
386 | =head2 $feed->next_archive_link ([ $uri ]) |
387 | |
388 | The Atom Next-link for feed paging and archiving. |
389 | |
390 | =head2 $feed->prev_archive_link ([ $uri ]) |
391 | |
392 | The Atom Prev-Archive-link for feed paging and archiving. |
393 | |
973e1f9e |
394 | =head1 PACKAGE VARIABLES |
395 | |
396 | =over 4 |
397 | |
7b5fccb1 |
398 | =item C<$XML::Feed::Format::RSS::PREFERRED_PARSER> |
973e1f9e |
399 | |
400 | If you want to use another RSS parser class than XML::RSS (default), you can |
7b5fccb1 |
401 | change the class by setting C<$PREFERRED_PARSER> variable in the |
402 | XML::Feed::Format::RSS package. |
973e1f9e |
403 | |
7b5fccb1 |
404 | $XML::Feed::Format::RSS::PREFERRED_PARSER = "XML::RSS::LibXML"; |
973e1f9e |
405 | |
406 | B<Note:> this will only work for parsing feeds, not creating feeds. |
407 | |
0c201593 |
408 | B<Note:> Only C<XML::RSS::LibXML> version 0.3004 is known to work at the moment. |
409 | |
b1aa7a62 |
410 | =item C<$XML::Feed::MULTIPLE_ENCLOSURES> |
411 | |
412 | Although the RSS specification states that there can be at most one enclosure per item |
413 | some feeds break this rule. |
414 | |
415 | If this variable is set then C<XML::Feed> captures all of them and makes them available as a list. |
416 | |
417 | Otherwise it returns the last enclosure parsed. |
418 | |
419 | B<Note:> C<XML::RSS> version 1.44 is needed for this to work. |
420 | |
973e1f9e |
421 | =back |
422 | |
b1aa7a62 |
423 | =cut |
424 | |
b8bc97f3 |
425 | =head1 VALID FEEDS |
426 | |
427 | For reference, this cgi script will create valid, albeit nonsensical feeds |
428 | (according to C<http://feedvalidator.org> anyway) for Atom 1.0 and RSS 0.90, |
429 | 0.91, 1.0 and 2.0. |
430 | |
431 | #!perl -w |
432 | |
433 | use strict; |
434 | use CGI; |
435 | use CGI::Carp qw(fatalsToBrowser); |
436 | use DateTime; |
437 | use XML::Feed; |
438 | |
439 | my $cgi = CGI->new; |
440 | my @args = ( $cgi->param('format') || "Atom" ); |
441 | push @args, ( version => $cgi->param('version') ) if $cgi->param('version'); |
442 | |
443 | my $feed = XML::Feed->new(@args); |
444 | $feed->id("http://".time.rand()."/"); |
445 | $feed->title('Test Feed'); |
446 | $feed->link($cgi->url); |
447 | $feed->self_link($cgi->url( -query => 1, -full => 1, -rewrite => 1) ); |
448 | $feed->modified(DateTime->now); |
449 | |
450 | my $entry = XML::Feed::Entry->new(); |
451 | $entry->id("http://".time.rand()."/"); |
452 | $entry->link("http://example.com"); |
453 | $entry->title("Test entry"); |
454 | $entry->summary("Test summary"); |
455 | $entry->content("Foo"); |
456 | $entry->modified(DateTime->now); |
457 | $entry->author('test@example.com (Testy McTesterson)'); |
458 | $feed->add_entry($entry); |
459 | |
460 | my $mime = ("Atom" eq $feed->format) ? "application/atom+xml" : "application/rss+xml"; |
461 | print $cgi->header($mime); |
462 | print $feed->as_xml; |
463 | |
464 | |
0d5e38d1 |
465 | =head1 LICENSE |
466 | |
467 | I<XML::Feed> is free software; you may redistribute it and/or modify it |
468 | under the same terms as Perl itself. |
469 | |
470 | =head1 AUTHOR & COPYRIGHT |
471 | |
70f935cc |
472 | Except where otherwise noted, I<XML::Feed> is Copyright 2004-2008 |
e933a73f |
473 | Six Apart. All rights reserved. |
0d5e38d1 |
474 | |
e933a73f |
475 | =head1 SUPPORT |
476 | |
477 | For support contact the XML::Feed mailing list - xml-feed@perlhacks.com. |
478 | |
479 | =head1 SOURCE CODE |
70f935cc |
480 | |
481 | The latest version of I<XML::Feed> can be found at |
482 | |
e933a73f |
483 | http://github.com/davorg/XML-Feed |
70f935cc |
484 | |
0d5e38d1 |
485 | =cut |