Commit | Line | Data |
b67ffc2e |
1 | package BackPAN::Index; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6 | our $VERSION = '0.39'; |
7 | |
8 | use autodie; |
9 | use App::Cache 0.37; |
10 | use CPAN::DistnameInfo 0.09; |
11 | use LWP::Simple qw(getstore head is_success); |
12 | use Archive::Extract; |
13 | use Path::Class (); |
14 | use File::stat; |
15 | use BackPAN::Index::Schema; |
16 | |
17 | use parent qw( Class::Accessor::Fast ); |
18 | |
19 | __PACKAGE__->mk_accessors(qw( |
20 | update |
21 | cache_ttl |
22 | debug |
23 | releases_only_from_authors |
24 | cache_dir |
25 | backpan_index_url |
26 | |
27 | backpan_index schema cache |
28 | )); |
29 | |
30 | my %Defaults = ( |
31 | backpan_index_url => "http://www.astray.com/tmp/backpan.txt.gz", |
32 | releases_only_from_authors => 1, |
33 | debug => 0, |
34 | cache_ttl => 60 * 60, |
35 | ); |
36 | |
37 | |
38 | sub new { |
39 | my $class = shift; |
40 | my $options = shift; |
41 | |
42 | $options ||= {}; |
43 | |
44 | # Apply defaults |
45 | %$options = ( %Defaults, %$options ); |
46 | |
47 | my $self = $class->SUPER::new($options); |
48 | |
49 | my %cache_opts; |
50 | $cache_opts{ttl} = $self->cache_ttl; |
51 | $cache_opts{directory} = $self->cache_dir if $self->cache_dir; |
52 | $cache_opts{enabled} = !$self->update; |
53 | |
54 | my $cache = App::Cache->new( \%cache_opts ); |
55 | $self->cache($cache); |
56 | |
57 | $self->_update_database(); |
58 | |
59 | return $self; |
60 | } |
61 | |
62 | sub _dbh { |
63 | my $self = shift; |
64 | return $self->schema->storage->dbh; |
65 | } |
66 | |
67 | sub _log { |
68 | my $self = shift; |
69 | return unless $self->debug; |
2593d57c |
70 | print STDOUT @_, "\n"; |
b67ffc2e |
71 | } |
72 | |
73 | sub _update_database { |
74 | my $self = shift; |
75 | |
76 | my $cache = $self->cache; |
77 | my $db_file = Path::Class::file($cache->directory, "backpan.sqlite"); |
78 | |
79 | my $should_update_db; |
80 | if( ! -e $db_file ) { |
81 | $should_update_db = 1; |
82 | } |
83 | elsif( defined $self->update ) { |
84 | $should_update_db = $self->update; |
85 | } |
86 | else { |
87 | # Check the database file before we connect to it. Connecting will create |
88 | # the file. |
89 | # XXX Should probably just put a timestamp in the DB |
90 | my $db_mtime = $db_file->stat->mtime; |
91 | my $db_age = time - $db_mtime; |
92 | $should_update_db = ($db_age > $cache->ttl); |
93 | |
94 | # No matter what, update the DB if we got a new index file. |
95 | my $archive_mtime = -e $self->_backpan_index_archive ? $self->_backpan_index_archive->stat->mtime : 0; |
96 | $should_update_db = 1 if $db_mtime < $archive_mtime; |
97 | } |
98 | |
b67ffc2e |
99 | $self->schema( BackPAN::Index::Schema->connect("dbi:SQLite:dbname=$db_file") ); |
100 | return unless $should_update_db; |
101 | |
102 | # Delay loading it into memory until we need it |
103 | $self->_log("Fetching BackPAN index..."); |
104 | $self->_get_backpan_index; |
105 | $self->_log("Done."); |
106 | |
107 | $self->_setup_database; |
108 | |
b67ffc2e |
109 | my $dbh = $self->_dbh; |
110 | |
111 | $self->_log("Populating database..."); |
112 | $dbh->begin_work; |
113 | |
114 | # Get it out of the hot loop. |
115 | my $only_authors = $self->releases_only_from_authors; |
116 | |
117 | my $insert_file_sth = $dbh->prepare(q[ |
118 | INSERT INTO files |
119 | (path, date, size) |
120 | VALUES (?, ?, ? ) |
121 | ]); |
122 | |
123 | my $insert_release_sth = $dbh->prepare(q[ |
124 | INSERT INTO releases |
125 | (path, dist, version, date, size, maturity, cpanid, distvname) |
126 | VALUES (?, ?, ?, ?, ?, ?, ?, ? ) |
127 | ]); |
128 | |
129 | my $insert_dist_sth = $dbh->prepare(q[ |
130 | INSERT INTO dists |
131 | (name, num_releases, |
132 | first_release, first_date, first_author, |
133 | latest_release, latest_date, latest_author) |
134 | VALUES (?, ?, |
135 | ?, ?, ?, |
136 | ?, ?, ?) |
137 | ]); |
138 | |
139 | my %dists; |
140 | my %files; |
141 | open my $fh, $self->_backpan_index_file; |
142 | while( my $line = <$fh> ) { |
143 | chomp $line; |
144 | my ( $path, $date, $size, @junk ) = split ' ', $line; |
145 | |
146 | if( $files{$path}++ ) { |
147 | $self->_log("Duplicate file $path in index, ignoring"); |
148 | next; |
149 | } |
150 | |
151 | if( !defined $path or !defined $date or !defined $size or @junk ) { |
152 | $self->_log("Bad data read at line $.: $line"); |
153 | next; |
154 | } |
155 | |
156 | next unless $size; |
157 | next if $only_authors and $path !~ m{^authors/}; |
158 | |
159 | $insert_file_sth->execute($path, $date, $size); |
160 | |
161 | next if $path =~ /\.(readme|meta)$/; |
162 | |
163 | my $i = CPAN::DistnameInfo->new( $path ); |
164 | |
165 | my $dist = $i->dist; |
166 | next unless $i->dist; |
167 | |
168 | $insert_release_sth->execute( |
169 | $path, |
170 | $dist, |
171 | $i->version || '', |
172 | $date, |
173 | $size, |
174 | $i->maturity, |
175 | $i->cpanid, |
176 | $i->distvname, |
177 | ); |
178 | |
179 | |
180 | # Update aggregate data about dists |
181 | my $distdata = ($dists{$dist} ||= { name => $dist }); |
182 | |
183 | if( !defined $distdata->{first_release} || |
184 | $date < $distdata->{first_date} ) |
185 | { |
186 | $distdata->{first_release} = $path; |
187 | $distdata->{first_author} = $i->cpanid; |
188 | $distdata->{first_date} = $date; |
189 | } |
190 | |
191 | if( !defined $distdata->{latest_release} || |
192 | $date > $distdata->{latest_date} ) |
193 | { |
194 | $distdata->{latest_release} = $path; |
195 | $distdata->{latest_author} = $i->cpanid; |
196 | $distdata->{latest_date} = $date; |
197 | } |
198 | |
199 | $distdata->{num_releases}++; |
200 | } |
201 | |
202 | for my $dist (values %dists) { |
203 | $insert_dist_sth->execute( |
204 | @{$dist} |
205 | {qw(name num_releases |
206 | first_release first_date first_author |
207 | latest_release latest_date latest_author |
208 | )} |
209 | ); |
210 | } |
211 | |
212 | # Add indexes after inserting so as not to slow down the inserts |
213 | $self->_add_indexes; |
214 | |
215 | $dbh->commit; |
216 | |
217 | $self->_log("Done."); |
218 | |
219 | return; |
220 | } |
221 | |
222 | |
223 | sub _database_is_empty { |
224 | my $self = shift; |
225 | |
226 | return 1 unless $self->files->count; |
227 | return 1 unless $self->releases->count; |
228 | return 0; |
229 | } |
230 | |
231 | |
232 | # This is denormalized for performance, its read-only anyway |
233 | sub _setup_database { |
234 | my $self = shift; |
235 | |
236 | my %create_for = ( |
237 | files => <<'SQL', |
238 | CREATE TABLE IF NOT EXISTS files ( |
239 | path TEXT NOT NULL PRIMARY KEY, |
240 | date INTEGER NOT NULL, |
241 | size INTEGER NOT NULL CHECK ( size >= 0 ) |
242 | ) |
243 | SQL |
244 | releases => <<'SQL', |
245 | CREATE TABLE IF NOT EXISTS releases ( |
246 | path TEXT NOT NULL PRIMARY KEY REFERENCES files, |
247 | dist TEXT NOT NULL REFERENCES dists, |
248 | date INTEGER NOT NULL, |
249 | size TEXT NOT NULL, |
250 | version TEXT NOT NULL, |
251 | maturity TEXT NOT NULL, |
252 | distvname TEXT NOT NULL, |
253 | cpanid TEXT NOT NULL |
254 | ) |
255 | SQL |
256 | |
257 | dists => <<'SQL', |
258 | CREATE TABLE IF NOT EXISTS dists ( |
259 | name TEXT NOT NULL PRIMARY KEY, |
260 | first_release TEXT NOT NULL REFERENCES releases, |
261 | latest_release TEXT NOT NULL REFERENCES releases, |
262 | first_date INTEGER NOT NULL, |
263 | latest_date INTEGER NOT NULL, |
264 | first_author TEXT NOT NULL, |
265 | latest_author TEXT NOT NULL, |
266 | num_releases INTEGER NOT NULL |
267 | ) |
268 | SQL |
269 | ); |
c4e00ace |
270 | my %delete_for = ( |
271 | files => 'DELETE FROM files', |
272 | releases => 'DELETE FROM releases', |
273 | dists => 'DELETE FROM dists', |
274 | ); |
b67ffc2e |
275 | |
276 | my $dbh = $self->_dbh; |
c4e00ace |
277 | for my $table_name (keys %create_for) { |
278 | $dbh->do($create_for{$table_name}); |
279 | $dbh->do($delete_for{$table_name}); |
b67ffc2e |
280 | } |
281 | |
282 | $self->schema->rescan; |
283 | |
284 | return; |
285 | } |
286 | |
287 | |
288 | sub _add_indexes { |
289 | my $self = shift; |
290 | |
291 | my @indexes = ( |
292 | # Speed up dists_by several orders of magnitude |
293 | "CREATE INDEX IF NOT EXISTS dists_by ON releases (cpanid, dist)", |
294 | |
295 | # Speed up files_by a lot |
296 | "CREATE INDEX IF NOT EXISTS files_by ON releases (cpanid, path)", |
297 | |
298 | # Let us order releases by date quickly |
299 | "CREATE INDEX IF NOT EXISTS releases_by_date ON releases (date, dist)", |
300 | ); |
301 | my $dbh = $self->_dbh; |
302 | for my $sql (@indexes) { |
303 | $dbh->do($sql); |
304 | } |
305 | } |
306 | |
307 | |
308 | sub _get_backpan_index { |
309 | my $self = shift; |
310 | |
311 | my $url = $self->backpan_index_url; |
312 | |
313 | return if !$self->_backpan_index_has_changed; |
314 | |
315 | my $status = getstore($url, $self->_backpan_index_archive.''); |
316 | die "Error fetching $url: $status" unless is_success($status); |
317 | |
318 | # Faster |
319 | local $Archive::Extract::PREFER_BIN = 1; |
320 | |
321 | # Archive::Extract is vulnerable to the ORS. |
322 | local $\; |
323 | |
324 | my $ae = Archive::Extract->new( archive => $self->_backpan_index_archive ); |
325 | $ae->extract( to => $self->_backpan_index_file ) |
326 | or die "Problem extracting @{[ $self->_backpan_index_archive ]}: @{[ $ae->error ]}"; |
327 | |
328 | # If the backpan index age is older than the TTL this prevents us |
329 | # from immediately looking again. |
330 | # XXX Should probably use a "last checked" semaphore file |
331 | $self->_backpan_index_file->touch; |
332 | |
333 | return; |
334 | } |
335 | |
336 | |
337 | sub _backpan_index_archive { |
338 | my $self = shift; |
339 | |
340 | my $file = URI->new($self->backpan_index_url)->path; |
341 | $file = Path::Class::file($file)->basename; |
342 | return Path::Class::file($file)->absolute($self->cache->directory); |
343 | } |
344 | |
345 | |
346 | sub _backpan_index_file { |
347 | my $self = shift; |
348 | |
349 | my $file = $self->_backpan_index_archive; |
350 | $file =~ s{\.[^.]+$}{}; |
351 | |
352 | return Path::Class::file($file); |
353 | } |
354 | |
355 | |
356 | sub _backpan_index_has_changed { |
357 | my $self = shift; |
358 | |
359 | my $file = $self->_backpan_index_file; |
360 | return 1 unless -e $file; |
361 | |
362 | my $local_mod_time = stat($file)->mtime; |
363 | my $local_age = time - $local_mod_time; |
364 | return 0 unless $local_age > $self->cache->ttl; |
365 | |
366 | # We looked, don't have to look again until the ttl is up. |
367 | $self->_backpan_index_file->touch; |
368 | |
369 | my(undef, undef, $remote_mod_time) = head($self->backpan_index_url); |
55f9d76f |
370 | return defined $remote_mod_time && $remote_mod_time > $local_mod_time; |
b67ffc2e |
371 | } |
372 | |
373 | |
374 | sub files { |
375 | my $self = shift; |
376 | return $self->schema->resultset('File'); |
377 | } |
378 | |
379 | |
380 | sub dist { |
381 | my($self, $dist) = @_; |
382 | |
383 | return $self->dists->single({ name => $dist }); |
384 | } |
385 | |
386 | |
387 | sub releases { |
388 | my($self, $dist) = @_; |
389 | |
390 | return $self->schema->resultset("Release") unless defined $dist; |
391 | return $self->schema->resultset("Release")->search({ dist => $dist }); |
392 | } |
393 | |
394 | |
395 | sub release { |
396 | my($self, $dist, $version) = @_; |
397 | |
398 | return $self->releases($dist)->single({ version => $version }); |
399 | } |
400 | |
401 | |
402 | sub dists { |
403 | my $self = shift; |
404 | |
405 | return $self->schema->resultset("Dist"); |
406 | } |
407 | |
408 | |
409 | =head1 NAME |
410 | |
411 | BackPAN::Index - An interface to the BackPAN index |
412 | |
413 | =head1 SYNOPSIS |
414 | |
415 | use BackPAN::Index; |
416 | my $backpan = BackPAN::Index->new; |
417 | |
418 | # These are all DBIx::Class::ResultSet's |
419 | my $files = $backpan->files; |
420 | my $dists = $backpan->dists; |
421 | my $releases = $backpan->releases("Acme-Pony"); |
422 | |
423 | # Use DBIx::Class::ResultSet methods on them |
424 | my $release = $releases->single({ version => '1.23' }); |
425 | |
426 | my $dist = $backpan->dist("Test-Simple"); |
427 | my $releases = $dist->releases; |
428 | |
429 | =head1 DESCRIPTION |
430 | |
431 | This downloads, caches and parses the BackPAN index into a local |
432 | database for efficient querying. |
433 | |
434 | Its a pretty thin wrapper around DBIx::Class returning |
435 | L<DBIx::Class::ResultSet> objects which makes it efficient and |
436 | flexible. |
437 | |
438 | The Comprehensive Perl Archive Network (CPAN) is a very useful |
439 | collection of Perl code. However, in order to keep CPAN relatively |
440 | small, authors of modules can delete older versions of modules to only |
441 | let CPAN have the latest version of a module. BackPAN is where these |
442 | deleted modules are backed up. It's more like a full CPAN mirror, only |
443 | without the deletions. This module provides an index of BackPAN and |
444 | some handy methods. |
445 | |
446 | =head1 METHODS |
447 | |
448 | =head2 new |
449 | |
450 | my $backpan = BackPAN::Index->new(\%options); |
451 | |
452 | Create a new object representing the BackPAN index. |
453 | |
454 | It will, if necessary, download the BackPAN index and compile it into |
455 | a database for efficient storage. Initial creation is slow, but it |
456 | will be cached. |
457 | |
458 | new() takes some options |
459 | |
460 | =head3 update |
461 | |
462 | Because it is rather large, BackPAN::Index caches a copy of the |
463 | BackPAN index and builds a local database to speed access. This flag |
464 | controls if the local index is updated. |
465 | |
466 | If true, forces an update of the BACKPAN index. |
467 | |
468 | If false, the index will never be updated even if the cache is |
469 | expired. It will always create a new index if one does not exist. |
470 | |
471 | By default the index is cached and checked for updates according to |
472 | C<<$backpan->cache_ttl>>. |
473 | |
474 | =head3 cache_ttl |
475 | |
476 | How many seconds before checking for an updated index. |
477 | |
478 | Defaults to an hour. |
479 | |
480 | =head3 debug |
481 | |
482 | If true, debug messages will be printed. |
483 | |
484 | Defaults to false. |
485 | |
486 | =head3 releases_only_from_authors |
487 | |
488 | If true, only files in the C<authors> directory will be considered as |
489 | releases. If false any file in the index may be considered for a |
490 | release. |
491 | |
492 | Defaults to true. |
493 | |
494 | =head3 cache_dir |
495 | |
496 | Location of the cache directory. |
497 | |
498 | Defaults to whatever L<App::Cache> does. |
499 | |
500 | =head3 backpan_index_url |
501 | |
502 | URL to the BackPAN index. |
503 | |
504 | Defaults to a sensible location. |
505 | |
506 | |
507 | =head2 files |
508 | |
509 | my $files = $backpan->files; |
510 | |
511 | Returns a ResultSet representing all the files on BackPAN. |
512 | |
513 | =head2 files_by |
514 | |
515 | my $files = $backpan->files_by($cpanid); |
516 | my @files = $backpan->files_by($cpanid); |
517 | |
518 | Returns all the files by a given $cpanid. |
519 | |
520 | Returns either a list of BackPAN::Index::Files or a ResultSet. |
521 | |
522 | =cut |
523 | |
524 | sub files_by { |
525 | my $self = shift; |
526 | my $cpanid = shift; |
527 | |
528 | return $self->files->search({ "releases.cpanid" => $cpanid }, { join => "releases" }); |
529 | } |
530 | |
531 | =head2 dists |
532 | |
533 | my $dists = $backpan->dists; |
534 | |
535 | Returns a ResultSet representing all the distributions on BackPAN. |
536 | |
537 | =head2 dist |
538 | |
539 | my $dists = $backpan->dist($dist_name); |
540 | |
541 | Returns a single BackPAN::Index::Dist object for $dist_name. |
542 | |
543 | =head2 dists_by |
544 | |
545 | my $dists = $backpan->dists_by($cpanid); |
546 | my @dists = $backpan->dists_by($cpanid); |
547 | |
548 | Returns the dists which contain at least one release by the given |
549 | $cpanid. |
550 | |
551 | Returns either a ResultSet or a list of the Dists. |
552 | |
553 | =cut |
554 | |
555 | sub dists_by { |
556 | my $self = shift; |
557 | my $cpanid = shift; |
558 | |
559 | return $self->dists->search({ "releases.cpanid" => $cpanid }, { join => "releases", distinct => 1 }); |
560 | } |
561 | |
562 | |
563 | =head2 dists_changed_since |
564 | |
565 | my $dists = $backpan->dists_changed_since($time); |
566 | |
567 | Returns a ResultSet of distributions which have had releases at or after after $time. |
568 | |
569 | =cut |
570 | |
571 | sub dists_changed_since { |
572 | my $self = shift; |
573 | my $time = shift; |
574 | |
575 | return $self->dists->search( latest_date => \">= $time" ); |
576 | } |
577 | |
578 | =head2 releases |
579 | |
580 | my $all_releases = $backpan->releases(); |
581 | my $dist_releases = $backpan->releases($dist_name); |
582 | |
583 | Returns a ResultSet representing all the releases on BackPAN. If a |
584 | $dist_name is given it returns the releases of just one distribution. |
585 | |
586 | =head2 release |
587 | |
588 | my $release = $backpan->release($dist_name, $version); |
589 | |
590 | Returns a single BackPAN::Index::Release object for the given |
591 | $dist_name and $version. |
592 | |
593 | =head2 releases_by |
594 | |
595 | my $releases = $backpan->releases_by($cpanid); |
596 | my @releases = $backpan->releases_by($cpanid); |
597 | |
598 | Returns all the releases of a single author. |
599 | |
600 | Returns either a list of Releases or a ResultSet representing those releases. |
601 | |
602 | =cut |
603 | |
604 | sub releases_by { |
605 | my $self = shift; |
606 | my $cpanid = shift; |
607 | |
608 | return $self->releases->search({ cpanid => $cpanid }); |
609 | } |
610 | |
611 | |
612 | =head2 releases_since |
613 | |
614 | my $releases = $backpan->releases_since($time); |
615 | |
616 | Returns a ResultSet of releases which were released at or after $time. |
617 | |
618 | =cut |
619 | |
620 | sub releases_since { |
621 | my $self = shift; |
622 | my $time = shift; |
623 | |
624 | return $self->releases->search( date => \">= $time" ); |
625 | } |
626 | |
627 | |
628 | =head1 EXAMPLES |
629 | |
630 | The real power of BackPAN::Index comes from L<DBIx::Class::ResultSet>. |
631 | Its very flexible and very powerful but not always obvious how to get |
632 | it to do things. Here's some examples. |
633 | |
634 | # How many files are on BackPAN? |
635 | my $count = $backpan->files->count; |
636 | |
637 | # How big is BackPAN? |
638 | my $size = $backpan->files->get_column("size")->sum; |
639 | |
640 | # What are the names of all the distributions? |
641 | my @names = $backpan->dists->get_column("name")->all; |
642 | |
643 | # What path contains this release? |
644 | my $path = $backpan->release("Acme-Pony", 1.01)->path; |
645 | |
646 | # Get all the releases of Moose ordered by version |
647 | my @releases = $backpan->dist("Moose")->releases |
648 | ->search(undef, { order_by => "version" }); |
649 | |
650 | =head1 AUTHOR |
651 | |
652 | Michael G Schwern <schwern@pobox.com> |
653 | |
654 | =head1 COPYRIGHT |
655 | |
656 | Copyright 2009, Michael G Schwern |
657 | |
658 | =head1 LICENSE |
659 | |
660 | This module is free software; you can redistribute it or modify it under |
661 | the same terms as Perl itself. |
662 | |
663 | =head1 SEE ALSO |
664 | |
665 | L<DBIx::Class::ResultSet>, L<BackPAN::Index::File>, |
666 | L<BackPAN::Index::Release>, L<BackPAN::Index::Dist> |
667 | |
668 | Repository: L<http://github.com/acme/parse-backpan-packages> |
669 | Bugs: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Parse-BACKPAN-Packages> |
670 | |
671 | =cut |
672 | |
673 | 1; |