Commit | Line | Data |
b67ffc2e |
1 | package BackPAN::Index; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6 | our $VERSION = '0.39'; |
7 | |
8 | use autodie; |
9 | use App::Cache 0.37; |
10 | use CPAN::DistnameInfo 0.09; |
11 | use LWP::Simple qw(getstore head is_success); |
12 | use Archive::Extract; |
13 | use Path::Class (); |
14 | use File::stat; |
15 | use BackPAN::Index::Schema; |
16 | |
17 | use parent qw( Class::Accessor::Fast ); |
18 | |
19 | __PACKAGE__->mk_accessors(qw( |
20 | update |
21 | cache_ttl |
22 | debug |
23 | releases_only_from_authors |
24 | cache_dir |
25 | backpan_index_url |
26 | |
27 | backpan_index schema cache |
28 | )); |
29 | |
30 | my %Defaults = ( |
31 | backpan_index_url => "http://www.astray.com/tmp/backpan.txt.gz", |
32 | releases_only_from_authors => 1, |
33 | debug => 0, |
34 | cache_ttl => 60 * 60, |
35 | ); |
36 | |
37 | |
38 | sub new { |
39 | my $class = shift; |
40 | my $options = shift; |
41 | |
42 | $options ||= {}; |
43 | |
44 | # Apply defaults |
45 | %$options = ( %Defaults, %$options ); |
46 | |
47 | my $self = $class->SUPER::new($options); |
48 | |
49 | my %cache_opts; |
50 | $cache_opts{ttl} = $self->cache_ttl; |
51 | $cache_opts{directory} = $self->cache_dir if $self->cache_dir; |
52 | $cache_opts{enabled} = !$self->update; |
53 | |
54 | my $cache = App::Cache->new( \%cache_opts ); |
55 | $self->cache($cache); |
56 | |
57 | $self->_update_database(); |
58 | |
59 | return $self; |
60 | } |
61 | |
62 | sub _dbh { |
63 | my $self = shift; |
64 | return $self->schema->storage->dbh; |
65 | } |
66 | |
67 | sub _log { |
68 | my $self = shift; |
69 | return unless $self->debug; |
2593d57c |
70 | print STDOUT @_, "\n"; |
b67ffc2e |
71 | } |
72 | |
73 | sub _update_database { |
74 | my $self = shift; |
75 | |
76 | my $cache = $self->cache; |
77 | my $db_file = Path::Class::file($cache->directory, "backpan.sqlite"); |
78 | |
79 | my $should_update_db; |
80 | if( ! -e $db_file ) { |
81 | $should_update_db = 1; |
82 | } |
83 | elsif( defined $self->update ) { |
84 | $should_update_db = $self->update; |
85 | } |
86 | else { |
87 | # Check the database file before we connect to it. Connecting will create |
88 | # the file. |
89 | # XXX Should probably just put a timestamp in the DB |
90 | my $db_mtime = $db_file->stat->mtime; |
91 | my $db_age = time - $db_mtime; |
92 | $should_update_db = ($db_age > $cache->ttl); |
93 | |
94 | # No matter what, update the DB if we got a new index file. |
95 | my $archive_mtime = -e $self->_backpan_index_archive ? $self->_backpan_index_archive->stat->mtime : 0; |
96 | $should_update_db = 1 if $db_mtime < $archive_mtime; |
97 | } |
98 | |
99 | unlink $db_file if -e $db_file and $should_update_db; |
100 | |
101 | $self->schema( BackPAN::Index::Schema->connect("dbi:SQLite:dbname=$db_file") ); |
102 | return unless $should_update_db; |
103 | |
104 | # Delay loading it into memory until we need it |
105 | $self->_log("Fetching BackPAN index..."); |
106 | $self->_get_backpan_index; |
107 | $self->_log("Done."); |
108 | |
109 | $self->_setup_database; |
110 | |
b67ffc2e |
111 | my $dbh = $self->_dbh; |
112 | |
113 | $self->_log("Populating database..."); |
114 | $dbh->begin_work; |
115 | |
116 | # Get it out of the hot loop. |
117 | my $only_authors = $self->releases_only_from_authors; |
118 | |
119 | my $insert_file_sth = $dbh->prepare(q[ |
120 | INSERT INTO files |
121 | (path, date, size) |
122 | VALUES (?, ?, ? ) |
123 | ]); |
124 | |
125 | my $insert_release_sth = $dbh->prepare(q[ |
126 | INSERT INTO releases |
127 | (path, dist, version, date, size, maturity, cpanid, distvname) |
128 | VALUES (?, ?, ?, ?, ?, ?, ?, ? ) |
129 | ]); |
130 | |
131 | my $insert_dist_sth = $dbh->prepare(q[ |
132 | INSERT INTO dists |
133 | (name, num_releases, |
134 | first_release, first_date, first_author, |
135 | latest_release, latest_date, latest_author) |
136 | VALUES (?, ?, |
137 | ?, ?, ?, |
138 | ?, ?, ?) |
139 | ]); |
140 | |
141 | my %dists; |
142 | my %files; |
143 | open my $fh, $self->_backpan_index_file; |
144 | while( my $line = <$fh> ) { |
145 | chomp $line; |
146 | my ( $path, $date, $size, @junk ) = split ' ', $line; |
147 | |
148 | if( $files{$path}++ ) { |
149 | $self->_log("Duplicate file $path in index, ignoring"); |
150 | next; |
151 | } |
152 | |
153 | if( !defined $path or !defined $date or !defined $size or @junk ) { |
154 | $self->_log("Bad data read at line $.: $line"); |
155 | next; |
156 | } |
157 | |
158 | next unless $size; |
159 | next if $only_authors and $path !~ m{^authors/}; |
160 | |
161 | $insert_file_sth->execute($path, $date, $size); |
162 | |
163 | next if $path =~ /\.(readme|meta)$/; |
164 | |
165 | my $i = CPAN::DistnameInfo->new( $path ); |
166 | |
167 | my $dist = $i->dist; |
168 | next unless $i->dist; |
169 | |
170 | $insert_release_sth->execute( |
171 | $path, |
172 | $dist, |
173 | $i->version || '', |
174 | $date, |
175 | $size, |
176 | $i->maturity, |
177 | $i->cpanid, |
178 | $i->distvname, |
179 | ); |
180 | |
181 | |
182 | # Update aggregate data about dists |
183 | my $distdata = ($dists{$dist} ||= { name => $dist }); |
184 | |
185 | if( !defined $distdata->{first_release} || |
186 | $date < $distdata->{first_date} ) |
187 | { |
188 | $distdata->{first_release} = $path; |
189 | $distdata->{first_author} = $i->cpanid; |
190 | $distdata->{first_date} = $date; |
191 | } |
192 | |
193 | if( !defined $distdata->{latest_release} || |
194 | $date > $distdata->{latest_date} ) |
195 | { |
196 | $distdata->{latest_release} = $path; |
197 | $distdata->{latest_author} = $i->cpanid; |
198 | $distdata->{latest_date} = $date; |
199 | } |
200 | |
201 | $distdata->{num_releases}++; |
202 | } |
203 | |
204 | for my $dist (values %dists) { |
205 | $insert_dist_sth->execute( |
206 | @{$dist} |
207 | {qw(name num_releases |
208 | first_release first_date first_author |
209 | latest_release latest_date latest_author |
210 | )} |
211 | ); |
212 | } |
213 | |
214 | # Add indexes after inserting so as not to slow down the inserts |
215 | $self->_add_indexes; |
216 | |
217 | $dbh->commit; |
218 | |
219 | $self->_log("Done."); |
220 | |
221 | return; |
222 | } |
223 | |
224 | |
225 | sub _database_is_empty { |
226 | my $self = shift; |
227 | |
228 | return 1 unless $self->files->count; |
229 | return 1 unless $self->releases->count; |
230 | return 0; |
231 | } |
232 | |
233 | |
234 | # This is denormalized for performance, its read-only anyway |
235 | sub _setup_database { |
236 | my $self = shift; |
237 | |
238 | my %create_for = ( |
239 | files => <<'SQL', |
240 | CREATE TABLE IF NOT EXISTS files ( |
241 | path TEXT NOT NULL PRIMARY KEY, |
242 | date INTEGER NOT NULL, |
243 | size INTEGER NOT NULL CHECK ( size >= 0 ) |
244 | ) |
245 | SQL |
246 | releases => <<'SQL', |
247 | CREATE TABLE IF NOT EXISTS releases ( |
248 | path TEXT NOT NULL PRIMARY KEY REFERENCES files, |
249 | dist TEXT NOT NULL REFERENCES dists, |
250 | date INTEGER NOT NULL, |
251 | size TEXT NOT NULL, |
252 | version TEXT NOT NULL, |
253 | maturity TEXT NOT NULL, |
254 | distvname TEXT NOT NULL, |
255 | cpanid TEXT NOT NULL |
256 | ) |
257 | SQL |
258 | |
259 | dists => <<'SQL', |
260 | CREATE TABLE IF NOT EXISTS dists ( |
261 | name TEXT NOT NULL PRIMARY KEY, |
262 | first_release TEXT NOT NULL REFERENCES releases, |
263 | latest_release TEXT NOT NULL REFERENCES releases, |
264 | first_date INTEGER NOT NULL, |
265 | latest_date INTEGER NOT NULL, |
266 | first_author TEXT NOT NULL, |
267 | latest_author TEXT NOT NULL, |
268 | num_releases INTEGER NOT NULL |
269 | ) |
270 | SQL |
271 | ); |
272 | |
273 | my $dbh = $self->_dbh; |
274 | for my $sql (values %create_for) { |
275 | $dbh->do($sql); |
276 | } |
277 | |
278 | $self->schema->rescan; |
279 | |
280 | return; |
281 | } |
282 | |
283 | |
284 | sub _add_indexes { |
285 | my $self = shift; |
286 | |
287 | my @indexes = ( |
288 | # Speed up dists_by several orders of magnitude |
289 | "CREATE INDEX IF NOT EXISTS dists_by ON releases (cpanid, dist)", |
290 | |
291 | # Speed up files_by a lot |
292 | "CREATE INDEX IF NOT EXISTS files_by ON releases (cpanid, path)", |
293 | |
294 | # Let us order releases by date quickly |
295 | "CREATE INDEX IF NOT EXISTS releases_by_date ON releases (date, dist)", |
296 | ); |
297 | my $dbh = $self->_dbh; |
298 | for my $sql (@indexes) { |
299 | $dbh->do($sql); |
300 | } |
301 | } |
302 | |
303 | |
304 | sub _get_backpan_index { |
305 | my $self = shift; |
306 | |
307 | my $url = $self->backpan_index_url; |
308 | |
309 | return if !$self->_backpan_index_has_changed; |
310 | |
311 | my $status = getstore($url, $self->_backpan_index_archive.''); |
312 | die "Error fetching $url: $status" unless is_success($status); |
313 | |
314 | # Faster |
315 | local $Archive::Extract::PREFER_BIN = 1; |
316 | |
317 | # Archive::Extract is vulnerable to the ORS. |
318 | local $\; |
319 | |
320 | my $ae = Archive::Extract->new( archive => $self->_backpan_index_archive ); |
321 | $ae->extract( to => $self->_backpan_index_file ) |
322 | or die "Problem extracting @{[ $self->_backpan_index_archive ]}: @{[ $ae->error ]}"; |
323 | |
324 | # If the backpan index age is older than the TTL this prevents us |
325 | # from immediately looking again. |
326 | # XXX Should probably use a "last checked" semaphore file |
327 | $self->_backpan_index_file->touch; |
328 | |
329 | return; |
330 | } |
331 | |
332 | |
333 | sub _backpan_index_archive { |
334 | my $self = shift; |
335 | |
336 | my $file = URI->new($self->backpan_index_url)->path; |
337 | $file = Path::Class::file($file)->basename; |
338 | return Path::Class::file($file)->absolute($self->cache->directory); |
339 | } |
340 | |
341 | |
342 | sub _backpan_index_file { |
343 | my $self = shift; |
344 | |
345 | my $file = $self->_backpan_index_archive; |
346 | $file =~ s{\.[^.]+$}{}; |
347 | |
348 | return Path::Class::file($file); |
349 | } |
350 | |
351 | |
352 | sub _backpan_index_has_changed { |
353 | my $self = shift; |
354 | |
355 | my $file = $self->_backpan_index_file; |
356 | return 1 unless -e $file; |
357 | |
358 | my $local_mod_time = stat($file)->mtime; |
359 | my $local_age = time - $local_mod_time; |
360 | return 0 unless $local_age > $self->cache->ttl; |
361 | |
362 | # We looked, don't have to look again until the ttl is up. |
363 | $self->_backpan_index_file->touch; |
364 | |
365 | my(undef, undef, $remote_mod_time) = head($self->backpan_index_url); |
55f9d76f |
366 | return defined $remote_mod_time && $remote_mod_time > $local_mod_time; |
b67ffc2e |
367 | } |
368 | |
369 | |
370 | sub files { |
371 | my $self = shift; |
372 | return $self->schema->resultset('File'); |
373 | } |
374 | |
375 | |
376 | sub dist { |
377 | my($self, $dist) = @_; |
378 | |
379 | return $self->dists->single({ name => $dist }); |
380 | } |
381 | |
382 | |
383 | sub releases { |
384 | my($self, $dist) = @_; |
385 | |
386 | return $self->schema->resultset("Release") unless defined $dist; |
387 | return $self->schema->resultset("Release")->search({ dist => $dist }); |
388 | } |
389 | |
390 | |
391 | sub release { |
392 | my($self, $dist, $version) = @_; |
393 | |
394 | return $self->releases($dist)->single({ version => $version }); |
395 | } |
396 | |
397 | |
398 | sub dists { |
399 | my $self = shift; |
400 | |
401 | return $self->schema->resultset("Dist"); |
402 | } |
403 | |
404 | |
405 | =head1 NAME |
406 | |
407 | BackPAN::Index - An interface to the BackPAN index |
408 | |
409 | =head1 SYNOPSIS |
410 | |
411 | use BackPAN::Index; |
412 | my $backpan = BackPAN::Index->new; |
413 | |
414 | # These are all DBIx::Class::ResultSet's |
415 | my $files = $backpan->files; |
416 | my $dists = $backpan->dists; |
417 | my $releases = $backpan->releases("Acme-Pony"); |
418 | |
419 | # Use DBIx::Class::ResultSet methods on them |
420 | my $release = $releases->single({ version => '1.23' }); |
421 | |
422 | my $dist = $backpan->dist("Test-Simple"); |
423 | my $releases = $dist->releases; |
424 | |
425 | =head1 DESCRIPTION |
426 | |
427 | This downloads, caches and parses the BackPAN index into a local |
428 | database for efficient querying. |
429 | |
430 | Its a pretty thin wrapper around DBIx::Class returning |
431 | L<DBIx::Class::ResultSet> objects which makes it efficient and |
432 | flexible. |
433 | |
434 | The Comprehensive Perl Archive Network (CPAN) is a very useful |
435 | collection of Perl code. However, in order to keep CPAN relatively |
436 | small, authors of modules can delete older versions of modules to only |
437 | let CPAN have the latest version of a module. BackPAN is where these |
438 | deleted modules are backed up. It's more like a full CPAN mirror, only |
439 | without the deletions. This module provides an index of BackPAN and |
440 | some handy methods. |
441 | |
442 | =head1 METHODS |
443 | |
444 | =head2 new |
445 | |
446 | my $backpan = BackPAN::Index->new(\%options); |
447 | |
448 | Create a new object representing the BackPAN index. |
449 | |
450 | It will, if necessary, download the BackPAN index and compile it into |
451 | a database for efficient storage. Initial creation is slow, but it |
452 | will be cached. |
453 | |
454 | new() takes some options |
455 | |
456 | =head3 update |
457 | |
458 | Because it is rather large, BackPAN::Index caches a copy of the |
459 | BackPAN index and builds a local database to speed access. This flag |
460 | controls if the local index is updated. |
461 | |
462 | If true, forces an update of the BACKPAN index. |
463 | |
464 | If false, the index will never be updated even if the cache is |
465 | expired. It will always create a new index if one does not exist. |
466 | |
467 | By default the index is cached and checked for updates according to |
468 | C<<$backpan->cache_ttl>>. |
469 | |
470 | =head3 cache_ttl |
471 | |
472 | How many seconds before checking for an updated index. |
473 | |
474 | Defaults to an hour. |
475 | |
476 | =head3 debug |
477 | |
478 | If true, debug messages will be printed. |
479 | |
480 | Defaults to false. |
481 | |
482 | =head3 releases_only_from_authors |
483 | |
484 | If true, only files in the C<authors> directory will be considered as |
485 | releases. If false any file in the index may be considered for a |
486 | release. |
487 | |
488 | Defaults to true. |
489 | |
490 | =head3 cache_dir |
491 | |
492 | Location of the cache directory. |
493 | |
494 | Defaults to whatever L<App::Cache> does. |
495 | |
496 | =head3 backpan_index_url |
497 | |
498 | URL to the BackPAN index. |
499 | |
500 | Defaults to a sensible location. |
501 | |
502 | |
503 | =head2 files |
504 | |
505 | my $files = $backpan->files; |
506 | |
507 | Returns a ResultSet representing all the files on BackPAN. |
508 | |
509 | =head2 files_by |
510 | |
511 | my $files = $backpan->files_by($cpanid); |
512 | my @files = $backpan->files_by($cpanid); |
513 | |
514 | Returns all the files by a given $cpanid. |
515 | |
516 | Returns either a list of BackPAN::Index::Files or a ResultSet. |
517 | |
518 | =cut |
519 | |
520 | sub files_by { |
521 | my $self = shift; |
522 | my $cpanid = shift; |
523 | |
524 | return $self->files->search({ "releases.cpanid" => $cpanid }, { join => "releases" }); |
525 | } |
526 | |
527 | =head2 dists |
528 | |
529 | my $dists = $backpan->dists; |
530 | |
531 | Returns a ResultSet representing all the distributions on BackPAN. |
532 | |
533 | =head2 dist |
534 | |
535 | my $dists = $backpan->dist($dist_name); |
536 | |
537 | Returns a single BackPAN::Index::Dist object for $dist_name. |
538 | |
539 | =head2 dists_by |
540 | |
541 | my $dists = $backpan->dists_by($cpanid); |
542 | my @dists = $backpan->dists_by($cpanid); |
543 | |
544 | Returns the dists which contain at least one release by the given |
545 | $cpanid. |
546 | |
547 | Returns either a ResultSet or a list of the Dists. |
548 | |
549 | =cut |
550 | |
551 | sub dists_by { |
552 | my $self = shift; |
553 | my $cpanid = shift; |
554 | |
555 | return $self->dists->search({ "releases.cpanid" => $cpanid }, { join => "releases", distinct => 1 }); |
556 | } |
557 | |
558 | |
559 | =head2 dists_changed_since |
560 | |
561 | my $dists = $backpan->dists_changed_since($time); |
562 | |
563 | Returns a ResultSet of distributions which have had releases at or after after $time. |
564 | |
565 | =cut |
566 | |
567 | sub dists_changed_since { |
568 | my $self = shift; |
569 | my $time = shift; |
570 | |
571 | return $self->dists->search( latest_date => \">= $time" ); |
572 | } |
573 | |
574 | =head2 releases |
575 | |
576 | my $all_releases = $backpan->releases(); |
577 | my $dist_releases = $backpan->releases($dist_name); |
578 | |
579 | Returns a ResultSet representing all the releases on BackPAN. If a |
580 | $dist_name is given it returns the releases of just one distribution. |
581 | |
582 | =head2 release |
583 | |
584 | my $release = $backpan->release($dist_name, $version); |
585 | |
586 | Returns a single BackPAN::Index::Release object for the given |
587 | $dist_name and $version. |
588 | |
589 | =head2 releases_by |
590 | |
591 | my $releases = $backpan->releases_by($cpanid); |
592 | my @releases = $backpan->releases_by($cpanid); |
593 | |
594 | Returns all the releases of a single author. |
595 | |
596 | Returns either a list of Releases or a ResultSet representing those releases. |
597 | |
598 | =cut |
599 | |
600 | sub releases_by { |
601 | my $self = shift; |
602 | my $cpanid = shift; |
603 | |
604 | return $self->releases->search({ cpanid => $cpanid }); |
605 | } |
606 | |
607 | |
608 | =head2 releases_since |
609 | |
610 | my $releases = $backpan->releases_since($time); |
611 | |
612 | Returns a ResultSet of releases which were released at or after $time. |
613 | |
614 | =cut |
615 | |
616 | sub releases_since { |
617 | my $self = shift; |
618 | my $time = shift; |
619 | |
620 | return $self->releases->search( date => \">= $time" ); |
621 | } |
622 | |
623 | |
624 | =head1 EXAMPLES |
625 | |
626 | The real power of BackPAN::Index comes from L<DBIx::Class::ResultSet>. |
627 | Its very flexible and very powerful but not always obvious how to get |
628 | it to do things. Here's some examples. |
629 | |
630 | # How many files are on BackPAN? |
631 | my $count = $backpan->files->count; |
632 | |
633 | # How big is BackPAN? |
634 | my $size = $backpan->files->get_column("size")->sum; |
635 | |
636 | # What are the names of all the distributions? |
637 | my @names = $backpan->dists->get_column("name")->all; |
638 | |
639 | # What path contains this release? |
640 | my $path = $backpan->release("Acme-Pony", 1.01)->path; |
641 | |
642 | # Get all the releases of Moose ordered by version |
643 | my @releases = $backpan->dist("Moose")->releases |
644 | ->search(undef, { order_by => "version" }); |
645 | |
646 | =head1 AUTHOR |
647 | |
648 | Michael G Schwern <schwern@pobox.com> |
649 | |
650 | =head1 COPYRIGHT |
651 | |
652 | Copyright 2009, Michael G Schwern |
653 | |
654 | =head1 LICENSE |
655 | |
656 | This module is free software; you can redistribute it or modify it under |
657 | the same terms as Perl itself. |
658 | |
659 | =head1 SEE ALSO |
660 | |
661 | L<DBIx::Class::ResultSet>, L<BackPAN::Index::File>, |
662 | L<BackPAN::Index::Release>, L<BackPAN::Index::Dist> |
663 | |
664 | Repository: L<http://github.com/acme/parse-backpan-packages> |
665 | Bugs: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Parse-BACKPAN-Packages> |
666 | |
667 | =cut |
668 | |
669 | 1; |