1 package BackPAN::Index;
10 use CPAN::DistnameInfo 0.09;
11 use LWP::Simple qw(getstore head is_success);
15 use BackPAN::Index::Schema;
17 use parent qw( Class::Accessor::Fast );
19 __PACKAGE__->mk_accessors(qw(
23 releases_only_from_authors
27 backpan_index schema cache
31 backpan_index_url => "http://www.astray.com/tmp/backpan.txt.gz",
32 releases_only_from_authors => 1,
45 %$options = ( %Defaults, %$options );
47 my $self = $class->SUPER::new($options);
50 $cache_opts{ttl} = $self->cache_ttl;
51 $cache_opts{directory} = $self->cache_dir if $self->cache_dir;
52 $cache_opts{enabled} = !$self->update;
54 my $cache = App::Cache->new( \%cache_opts );
57 $self->_update_database();
64 return $self->schema->storage->dbh;
69 return unless $self->debug;
70 print STDOUT @_, "\n";
73 sub _update_database {
76 my $cache = $self->cache;
77 my $db_file = Path::Class::file($cache->directory, "backpan.sqlite");
81 $should_update_db = 1;
83 elsif( defined $self->update ) {
84 $should_update_db = $self->update;
87 # Check the database file before we connect to it. Connecting will create
89 # XXX Should probably just put a timestamp in the DB
90 my $db_mtime = $db_file->stat->mtime;
91 my $db_age = time - $db_mtime;
92 $should_update_db = ($db_age > $cache->ttl);
94 # No matter what, update the DB if we got a new index file.
95 my $archive_mtime = -e $self->_backpan_index_archive ? $self->_backpan_index_archive->stat->mtime : 0;
96 $should_update_db = 1 if $db_mtime < $archive_mtime;
99 unlink $db_file if -e $db_file and $should_update_db;
101 $self->schema( BackPAN::Index::Schema->connect("dbi:SQLite:dbname=$db_file") );
102 return unless $should_update_db;
104 # Delay loading it into memory until we need it
105 $self->_log("Fetching BackPAN index...");
106 $self->_get_backpan_index;
107 $self->_log("Done.");
109 $self->_setup_database;
111 my $dbh = $self->_dbh;
113 $self->_log("Populating database...");
116 # Get it out of the hot loop.
117 my $only_authors = $self->releases_only_from_authors;
119 my $insert_file_sth = $dbh->prepare(q[
125 my $insert_release_sth = $dbh->prepare(q[
127 (path, dist, version, date, size, maturity, cpanid, distvname)
128 VALUES (?, ?, ?, ?, ?, ?, ?, ? )
131 my $insert_dist_sth = $dbh->prepare(q[
134 first_release, first_date, first_author,
135 latest_release, latest_date, latest_author)
143 open my $fh, $self->_backpan_index_file;
144 while( my $line = <$fh> ) {
146 my ( $path, $date, $size, @junk ) = split ' ', $line;
148 if( $files{$path}++ ) {
149 $self->_log("Duplicate file $path in index, ignoring");
153 if( !defined $path or !defined $date or !defined $size or @junk ) {
154 $self->_log("Bad data read at line $.: $line");
159 next if $only_authors and $path !~ m{^authors/};
161 $insert_file_sth->execute($path, $date, $size);
163 next if $path =~ /\.(readme|meta)$/;
165 my $i = CPAN::DistnameInfo->new( $path );
168 next unless $i->dist;
170 $insert_release_sth->execute(
182 # Update aggregate data about dists
183 my $distdata = ($dists{$dist} ||= { name => $dist });
185 if( !defined $distdata->{first_release} ||
186 $date < $distdata->{first_date} )
188 $distdata->{first_release} = $path;
189 $distdata->{first_author} = $i->cpanid;
190 $distdata->{first_date} = $date;
193 if( !defined $distdata->{latest_release} ||
194 $date > $distdata->{latest_date} )
196 $distdata->{latest_release} = $path;
197 $distdata->{latest_author} = $i->cpanid;
198 $distdata->{latest_date} = $date;
201 $distdata->{num_releases}++;
204 for my $dist (values %dists) {
205 $insert_dist_sth->execute(
207 {qw(name num_releases
208 first_release first_date first_author
209 latest_release latest_date latest_author
214 # Add indexes after inserting so as not to slow down the inserts
219 $self->_log("Done.");
225 sub _database_is_empty {
228 return 1 unless $self->files->count;
229 return 1 unless $self->releases->count;
234 # This is denormalized for performance, its read-only anyway
235 sub _setup_database {
240 CREATE TABLE IF NOT EXISTS files (
241 path TEXT NOT NULL PRIMARY KEY,
242 date INTEGER NOT NULL,
243 size INTEGER NOT NULL CHECK ( size >= 0 )
247 CREATE TABLE IF NOT EXISTS releases (
248 path TEXT NOT NULL PRIMARY KEY REFERENCES files,
249 dist TEXT NOT NULL REFERENCES dists,
250 date INTEGER NOT NULL,
252 version TEXT NOT NULL,
253 maturity TEXT NOT NULL,
254 distvname TEXT NOT NULL,
260 CREATE TABLE IF NOT EXISTS dists (
261 name TEXT NOT NULL PRIMARY KEY,
262 first_release TEXT NOT NULL REFERENCES releases,
263 latest_release TEXT NOT NULL REFERENCES releases,
264 first_date INTEGER NOT NULL,
265 latest_date INTEGER NOT NULL,
266 first_author TEXT NOT NULL,
267 latest_author TEXT NOT NULL,
268 num_releases INTEGER NOT NULL
273 my $dbh = $self->_dbh;
274 for my $sql (values %create_for) {
278 $self->schema->rescan;
288 # Speed up dists_by several orders of magnitude
289 "CREATE INDEX IF NOT EXISTS dists_by ON releases (cpanid, dist)",
291 # Speed up files_by a lot
292 "CREATE INDEX IF NOT EXISTS files_by ON releases (cpanid, path)",
294 # Let us order releases by date quickly
295 "CREATE INDEX IF NOT EXISTS releases_by_date ON releases (date, dist)",
297 my $dbh = $self->_dbh;
298 for my $sql (@indexes) {
304 sub _get_backpan_index {
307 my $url = $self->backpan_index_url;
309 return if !$self->_backpan_index_has_changed;
311 my $status = getstore($url, $self->_backpan_index_archive.'');
312 die "Error fetching $url: $status" unless is_success($status);
315 local $Archive::Extract::PREFER_BIN = 1;
317 # Archive::Extract is vulnerable to the ORS.
320 my $ae = Archive::Extract->new( archive => $self->_backpan_index_archive );
321 $ae->extract( to => $self->_backpan_index_file )
322 or die "Problem extracting @{[ $self->_backpan_index_archive ]}: @{[ $ae->error ]}";
324 # If the backpan index age is older than the TTL this prevents us
325 # from immediately looking again.
326 # XXX Should probably use a "last checked" semaphore file
327 $self->_backpan_index_file->touch;
333 sub _backpan_index_archive {
336 my $file = URI->new($self->backpan_index_url)->path;
337 $file = Path::Class::file($file)->basename;
338 return Path::Class::file($file)->absolute($self->cache->directory);
342 sub _backpan_index_file {
345 my $file = $self->_backpan_index_archive;
346 $file =~ s{\.[^.]+$}{};
348 return Path::Class::file($file);
352 sub _backpan_index_has_changed {
355 my $file = $self->_backpan_index_file;
356 return 1 unless -e $file;
358 my $local_mod_time = stat($file)->mtime;
359 my $local_age = time - $local_mod_time;
360 return 0 unless $local_age > $self->cache->ttl;
362 # We looked, don't have to look again until the ttl is up.
363 $self->_backpan_index_file->touch;
365 my(undef, undef, $remote_mod_time) = head($self->backpan_index_url);
366 return defined $remote_mod_time && $remote_mod_time > $local_mod_time;
372 return $self->schema->resultset('File');
377 my($self, $dist) = @_;
379 return $self->dists->single({ name => $dist });
384 my($self, $dist) = @_;
386 return $self->schema->resultset("Release") unless defined $dist;
387 return $self->schema->resultset("Release")->search({ dist => $dist });
392 my($self, $dist, $version) = @_;
394 return $self->releases($dist)->single({ version => $version });
401 return $self->schema->resultset("Dist");
407 BackPAN::Index - An interface to the BackPAN index
412 my $backpan = BackPAN::Index->new;
414 # These are all DBIx::Class::ResultSet's
415 my $files = $backpan->files;
416 my $dists = $backpan->dists;
417 my $releases = $backpan->releases("Acme-Pony");
419 # Use DBIx::Class::ResultSet methods on them
420 my $release = $releases->single({ version => '1.23' });
422 my $dist = $backpan->dist("Test-Simple");
423 my $releases = $dist->releases;
427 This downloads, caches and parses the BackPAN index into a local
428 database for efficient querying.
430 Its a pretty thin wrapper around DBIx::Class returning
431 L<DBIx::Class::ResultSet> objects which makes it efficient and
434 The Comprehensive Perl Archive Network (CPAN) is a very useful
435 collection of Perl code. However, in order to keep CPAN relatively
436 small, authors of modules can delete older versions of modules to only
437 let CPAN have the latest version of a module. BackPAN is where these
438 deleted modules are backed up. It's more like a full CPAN mirror, only
439 without the deletions. This module provides an index of BackPAN and
446 my $backpan = BackPAN::Index->new(\%options);
448 Create a new object representing the BackPAN index.
450 It will, if necessary, download the BackPAN index and compile it into
451 a database for efficient storage. Initial creation is slow, but it
454 new() takes some options
458 Because it is rather large, BackPAN::Index caches a copy of the
459 BackPAN index and builds a local database to speed access. This flag
460 controls if the local index is updated.
462 If true, forces an update of the BACKPAN index.
464 If false, the index will never be updated even if the cache is
465 expired. It will always create a new index if one does not exist.
467 By default the index is cached and checked for updates according to
468 C<<$backpan->cache_ttl>>.
472 How many seconds before checking for an updated index.
478 If true, debug messages will be printed.
482 =head3 releases_only_from_authors
484 If true, only files in the C<authors> directory will be considered as
485 releases. If false any file in the index may be considered for a
492 Location of the cache directory.
494 Defaults to whatever L<App::Cache> does.
496 =head3 backpan_index_url
498 URL to the BackPAN index.
500 Defaults to a sensible location.
505 my $files = $backpan->files;
507 Returns a ResultSet representing all the files on BackPAN.
511 my $files = $backpan->files_by($cpanid);
512 my @files = $backpan->files_by($cpanid);
514 Returns all the files by a given $cpanid.
516 Returns either a list of BackPAN::Index::Files or a ResultSet.
524 return $self->files->search({ "releases.cpanid" => $cpanid }, { join => "releases" });
529 my $dists = $backpan->dists;
531 Returns a ResultSet representing all the distributions on BackPAN.
535 my $dists = $backpan->dist($dist_name);
537 Returns a single BackPAN::Index::Dist object for $dist_name.
541 my $dists = $backpan->dists_by($cpanid);
542 my @dists = $backpan->dists_by($cpanid);
544 Returns the dists which contain at least one release by the given
547 Returns either a ResultSet or a list of the Dists.
555 return $self->dists->search({ "releases.cpanid" => $cpanid }, { join => "releases", distinct => 1 });
559 =head2 dists_changed_since
561 my $dists = $backpan->dists_changed_since($time);
563 Returns a ResultSet of distributions which have had releases at or after after $time.
567 sub dists_changed_since {
571 return $self->dists->search( latest_date => \">= $time" );
576 my $all_releases = $backpan->releases();
577 my $dist_releases = $backpan->releases($dist_name);
579 Returns a ResultSet representing all the releases on BackPAN. If a
580 $dist_name is given it returns the releases of just one distribution.
584 my $release = $backpan->release($dist_name, $version);
586 Returns a single BackPAN::Index::Release object for the given
587 $dist_name and $version.
591 my $releases = $backpan->releases_by($cpanid);
592 my @releases = $backpan->releases_by($cpanid);
594 Returns all the releases of a single author.
596 Returns either a list of Releases or a ResultSet representing those releases.
604 return $self->releases->search({ cpanid => $cpanid });
608 =head2 releases_since
610 my $releases = $backpan->releases_since($time);
612 Returns a ResultSet of releases which were released at or after $time.
620 return $self->releases->search( date => \">= $time" );
626 The real power of BackPAN::Index comes from L<DBIx::Class::ResultSet>.
627 Its very flexible and very powerful but not always obvious how to get
628 it to do things. Here's some examples.
630 # How many files are on BackPAN?
631 my $count = $backpan->files->count;
633 # How big is BackPAN?
634 my $size = $backpan->files->get_column("size")->sum;
636 # What are the names of all the distributions?
637 my @names = $backpan->dists->get_column("name")->all;
639 # What path contains this release?
640 my $path = $backpan->release("Acme-Pony", 1.01)->path;
642 # Get all the releases of Moose ordered by version
643 my @releases = $backpan->dist("Moose")->releases
644 ->search(undef, { order_by => "version" });
648 Michael G Schwern <schwern@pobox.com>
652 Copyright 2009, Michael G Schwern
656 This module is free software; you can redistribute it or modify it under
657 the same terms as Perl itself.
661 L<DBIx::Class::ResultSet>, L<BackPAN::Index::File>,
662 L<BackPAN::Index::Release>, L<BackPAN::Index::Dist>
664 Repository: L<http://github.com/acme/parse-backpan-packages>
665 Bugs: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Parse-BACKPAN-Packages>