1 package BackPAN::Index;
10 use CPAN::DistnameInfo 0.09;
11 use LWP::Simple qw(getstore head is_success);
15 use BackPAN::Index::Schema;
17 use parent qw( Class::Accessor::Fast );
19 __PACKAGE__->mk_accessors(qw(
23 releases_only_from_authors
27 backpan_index schema cache
31 backpan_index_url => "http://www.astray.com/tmp/backpan.txt.gz",
32 releases_only_from_authors => 1,
45 %$options = ( %Defaults, %$options );
47 my $self = $class->SUPER::new($options);
50 $cache_opts{ttl} = $self->cache_ttl;
51 $cache_opts{directory} = $self->cache_dir if $self->cache_dir;
52 $cache_opts{enabled} = !$self->update;
54 my $cache = App::Cache->new( \%cache_opts );
57 $self->_update_database();
64 return $self->schema->storage->dbh;
69 return unless $self->debug;
70 print STDOUT @_, "\n";
73 sub _update_database {
76 my $cache = $self->cache;
77 my $db_file = Path::Class::file($cache->directory, "backpan.sqlite");
81 $should_update_db = 1;
83 elsif( defined $self->update ) {
84 $should_update_db = $self->update;
87 # Check the database file before we connect to it. Connecting will create
89 # XXX Should probably just put a timestamp in the DB
90 my $db_mtime = $db_file->stat->mtime;
91 my $db_age = time - $db_mtime;
92 $should_update_db = ($db_age > $cache->ttl);
94 # No matter what, update the DB if we got a new index file.
95 my $archive_mtime = -e $self->_backpan_index_archive ? $self->_backpan_index_archive->stat->mtime : 0;
96 $should_update_db = 1 if $db_mtime < $archive_mtime;
99 $self->schema( BackPAN::Index::Schema->connect("dbi:SQLite:dbname=$db_file") );
100 return unless $should_update_db;
102 # Delay loading it into memory until we need it
103 $self->_log("Fetching BackPAN index...");
104 $self->_get_backpan_index;
105 $self->_log("Done.");
107 $self->_setup_database;
109 my $dbh = $self->_dbh;
111 $self->_log("Populating database...");
114 # Get it out of the hot loop.
115 my $only_authors = $self->releases_only_from_authors;
117 my $insert_file_sth = $dbh->prepare(q[
123 my $insert_release_sth = $dbh->prepare(q[
125 (path, dist, version, date, size, maturity, cpanid, distvname)
126 VALUES (?, ?, ?, ?, ?, ?, ?, ? )
129 my $insert_dist_sth = $dbh->prepare(q[
132 first_release, first_date, first_author,
133 latest_release, latest_date, latest_author)
141 open my $fh, $self->_backpan_index_file;
142 while( my $line = <$fh> ) {
144 my ( $path, $date, $size, @junk ) = split ' ', $line;
146 if( $files{$path}++ ) {
147 $self->_log("Duplicate file $path in index, ignoring");
151 if( !defined $path or !defined $date or !defined $size or @junk ) {
152 $self->_log("Bad data read at line $.: $line");
157 next if $only_authors and $path !~ m{^authors/};
159 $insert_file_sth->execute($path, $date, $size);
161 next if $path =~ /\.(readme|meta)$/;
163 my $i = CPAN::DistnameInfo->new( $path );
166 next unless $i->dist;
168 $insert_release_sth->execute(
180 # Update aggregate data about dists
181 my $distdata = ($dists{$dist} ||= { name => $dist });
183 if( !defined $distdata->{first_release} ||
184 $date < $distdata->{first_date} )
186 $distdata->{first_release} = $path;
187 $distdata->{first_author} = $i->cpanid;
188 $distdata->{first_date} = $date;
191 if( !defined $distdata->{latest_release} ||
192 $date > $distdata->{latest_date} )
194 $distdata->{latest_release} = $path;
195 $distdata->{latest_author} = $i->cpanid;
196 $distdata->{latest_date} = $date;
199 $distdata->{num_releases}++;
202 for my $dist (values %dists) {
203 $insert_dist_sth->execute(
205 {qw(name num_releases
206 first_release first_date first_author
207 latest_release latest_date latest_author
212 # Add indexes after inserting so as not to slow down the inserts
217 $self->_log("Done.");
223 sub _database_is_empty {
226 return 1 unless $self->files->count;
227 return 1 unless $self->releases->count;
232 # This is denormalized for performance, its read-only anyway
233 sub _setup_database {
238 CREATE TABLE IF NOT EXISTS files (
239 path TEXT NOT NULL PRIMARY KEY,
240 date INTEGER NOT NULL,
241 size INTEGER NOT NULL CHECK ( size >= 0 )
245 CREATE TABLE IF NOT EXISTS releases (
246 path TEXT NOT NULL PRIMARY KEY REFERENCES files,
247 dist TEXT NOT NULL REFERENCES dists,
248 date INTEGER NOT NULL,
250 version TEXT NOT NULL,
251 maturity TEXT NOT NULL,
252 distvname TEXT NOT NULL,
258 CREATE TABLE IF NOT EXISTS dists (
259 name TEXT NOT NULL PRIMARY KEY,
260 first_release TEXT NOT NULL REFERENCES releases,
261 latest_release TEXT NOT NULL REFERENCES releases,
262 first_date INTEGER NOT NULL,
263 latest_date INTEGER NOT NULL,
264 first_author TEXT NOT NULL,
265 latest_author TEXT NOT NULL,
266 num_releases INTEGER NOT NULL
271 files => 'DELETE FROM files',
272 releases => 'DELETE FROM releases',
273 dists => 'DELETE FROM dists',
276 my $dbh = $self->_dbh;
277 for my $table_name (keys %create_for) {
278 $dbh->do($create_for{$table_name});
279 $dbh->do($delete_for{$table_name});
282 $self->schema->rescan;
292 # Speed up dists_by several orders of magnitude
293 "CREATE INDEX IF NOT EXISTS dists_by ON releases (cpanid, dist)",
295 # Speed up files_by a lot
296 "CREATE INDEX IF NOT EXISTS files_by ON releases (cpanid, path)",
298 # Let us order releases by date quickly
299 "CREATE INDEX IF NOT EXISTS releases_by_date ON releases (date, dist)",
301 my $dbh = $self->_dbh;
302 for my $sql (@indexes) {
308 sub _get_backpan_index {
311 my $url = $self->backpan_index_url;
313 return if !$self->_backpan_index_has_changed;
315 my $status = getstore($url, $self->_backpan_index_archive.'');
316 die "Error fetching $url: $status" unless is_success($status);
319 local $Archive::Extract::PREFER_BIN = 1;
321 # Archive::Extract is vulnerable to the ORS.
324 my $ae = Archive::Extract->new( archive => $self->_backpan_index_archive );
325 $ae->extract( to => $self->_backpan_index_file )
326 or die "Problem extracting @{[ $self->_backpan_index_archive ]}: @{[ $ae->error ]}";
328 # If the backpan index age is older than the TTL this prevents us
329 # from immediately looking again.
330 # XXX Should probably use a "last checked" semaphore file
331 $self->_backpan_index_file->touch;
337 sub _backpan_index_archive {
340 my $file = URI->new($self->backpan_index_url)->path;
341 $file = Path::Class::file($file)->basename;
342 return Path::Class::file($file)->absolute($self->cache->directory);
346 sub _backpan_index_file {
349 my $file = $self->_backpan_index_archive;
350 $file =~ s{\.[^.]+$}{};
352 return Path::Class::file($file);
356 sub _backpan_index_has_changed {
359 my $file = $self->_backpan_index_file;
360 return 1 unless -e $file;
362 my $local_mod_time = stat($file)->mtime;
363 my $local_age = time - $local_mod_time;
364 return 0 unless $local_age > $self->cache->ttl;
366 # We looked, don't have to look again until the ttl is up.
367 $self->_backpan_index_file->touch;
369 my(undef, undef, $remote_mod_time) = head($self->backpan_index_url);
370 return defined $remote_mod_time && $remote_mod_time > $local_mod_time;
376 return $self->schema->resultset('File');
381 my($self, $dist) = @_;
383 return $self->dists->single({ name => $dist });
388 my($self, $dist) = @_;
390 return $self->schema->resultset("Release") unless defined $dist;
391 return $self->schema->resultset("Release")->search({ dist => $dist });
396 my($self, $dist, $version) = @_;
398 return $self->releases($dist)->single({ version => $version });
405 return $self->schema->resultset("Dist");
411 BackPAN::Index - An interface to the BackPAN index
416 my $backpan = BackPAN::Index->new;
418 # These are all DBIx::Class::ResultSet's
419 my $files = $backpan->files;
420 my $dists = $backpan->dists;
421 my $releases = $backpan->releases("Acme-Pony");
423 # Use DBIx::Class::ResultSet methods on them
424 my $release = $releases->single({ version => '1.23' });
426 my $dist = $backpan->dist("Test-Simple");
427 my $releases = $dist->releases;
431 This downloads, caches and parses the BackPAN index into a local
432 database for efficient querying.
434 Its a pretty thin wrapper around DBIx::Class returning
435 L<DBIx::Class::ResultSet> objects which makes it efficient and
438 The Comprehensive Perl Archive Network (CPAN) is a very useful
439 collection of Perl code. However, in order to keep CPAN relatively
440 small, authors of modules can delete older versions of modules to only
441 let CPAN have the latest version of a module. BackPAN is where these
442 deleted modules are backed up. It's more like a full CPAN mirror, only
443 without the deletions. This module provides an index of BackPAN and
450 my $backpan = BackPAN::Index->new(\%options);
452 Create a new object representing the BackPAN index.
454 It will, if necessary, download the BackPAN index and compile it into
455 a database for efficient storage. Initial creation is slow, but it
458 new() takes some options
462 Because it is rather large, BackPAN::Index caches a copy of the
463 BackPAN index and builds a local database to speed access. This flag
464 controls if the local index is updated.
466 If true, forces an update of the BACKPAN index.
468 If false, the index will never be updated even if the cache is
469 expired. It will always create a new index if one does not exist.
471 By default the index is cached and checked for updates according to
472 C<<$backpan->cache_ttl>>.
476 How many seconds before checking for an updated index.
482 If true, debug messages will be printed.
486 =head3 releases_only_from_authors
488 If true, only files in the C<authors> directory will be considered as
489 releases. If false any file in the index may be considered for a
496 Location of the cache directory.
498 Defaults to whatever L<App::Cache> does.
500 =head3 backpan_index_url
502 URL to the BackPAN index.
504 Defaults to a sensible location.
509 my $files = $backpan->files;
511 Returns a ResultSet representing all the files on BackPAN.
515 my $files = $backpan->files_by($cpanid);
516 my @files = $backpan->files_by($cpanid);
518 Returns all the files by a given $cpanid.
520 Returns either a list of BackPAN::Index::Files or a ResultSet.
528 return $self->files->search({ "releases.cpanid" => $cpanid }, { join => "releases" });
533 my $dists = $backpan->dists;
535 Returns a ResultSet representing all the distributions on BackPAN.
539 my $dists = $backpan->dist($dist_name);
541 Returns a single BackPAN::Index::Dist object for $dist_name.
545 my $dists = $backpan->dists_by($cpanid);
546 my @dists = $backpan->dists_by($cpanid);
548 Returns the dists which contain at least one release by the given
551 Returns either a ResultSet or a list of the Dists.
559 return $self->dists->search({ "releases.cpanid" => $cpanid }, { join => "releases", distinct => 1 });
563 =head2 dists_changed_since
565 my $dists = $backpan->dists_changed_since($time);
567 Returns a ResultSet of distributions which have had releases at or after after $time.
571 sub dists_changed_since {
575 return $self->dists->search( latest_date => \">= $time" );
580 my $all_releases = $backpan->releases();
581 my $dist_releases = $backpan->releases($dist_name);
583 Returns a ResultSet representing all the releases on BackPAN. If a
584 $dist_name is given it returns the releases of just one distribution.
588 my $release = $backpan->release($dist_name, $version);
590 Returns a single BackPAN::Index::Release object for the given
591 $dist_name and $version.
595 my $releases = $backpan->releases_by($cpanid);
596 my @releases = $backpan->releases_by($cpanid);
598 Returns all the releases of a single author.
600 Returns either a list of Releases or a ResultSet representing those releases.
608 return $self->releases->search({ cpanid => $cpanid });
612 =head2 releases_since
614 my $releases = $backpan->releases_since($time);
616 Returns a ResultSet of releases which were released at or after $time.
624 return $self->releases->search( date => \">= $time" );
630 The real power of BackPAN::Index comes from L<DBIx::Class::ResultSet>.
631 Its very flexible and very powerful but not always obvious how to get
632 it to do things. Here's some examples.
634 # How many files are on BackPAN?
635 my $count = $backpan->files->count;
637 # How big is BackPAN?
638 my $size = $backpan->files->get_column("size")->sum;
640 # What are the names of all the distributions?
641 my @names = $backpan->dists->get_column("name")->all;
643 # What path contains this release?
644 my $path = $backpan->release("Acme-Pony", 1.01)->path;
646 # Get all the releases of Moose ordered by version
647 my @releases = $backpan->dist("Moose")->releases
648 ->search(undef, { order_by => "version" });
652 Michael G Schwern <schwern@pobox.com>
656 Copyright 2009, Michael G Schwern
660 This module is free software; you can redistribute it or modify it under
661 the same terms as Perl itself.
665 L<DBIx::Class::ResultSet>, L<BackPAN::Index::File>,
666 L<BackPAN::Index::Release>, L<BackPAN::Index::Dist>
668 Repository: L<http://github.com/acme/parse-backpan-packages>
669 Bugs: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Parse-BACKPAN-Packages>