Update with latest fixes
[catagits/BackPAN-Web.git] / lib / BackPAN / Index.pm
CommitLineData
b67ffc2e 1package BackPAN::Index;
2
3use strict;
4use warnings;
5
6our $VERSION = '0.39';
7
8use autodie;
9use App::Cache 0.37;
10use CPAN::DistnameInfo 0.09;
11use LWP::Simple qw(getstore head is_success);
12use Archive::Extract;
13use Path::Class ();
14use File::stat;
15use BackPAN::Index::Schema;
16
17use parent qw( Class::Accessor::Fast );
18
19__PACKAGE__->mk_accessors(qw(
20 update
21 cache_ttl
22 debug
23 releases_only_from_authors
24 cache_dir
25 backpan_index_url
26
27 backpan_index schema cache
28));
29
30my %Defaults = (
31 backpan_index_url => "http://www.astray.com/tmp/backpan.txt.gz",
32 releases_only_from_authors => 1,
33 debug => 0,
34 cache_ttl => 60 * 60,
35);
36
37
38sub new {
39 my $class = shift;
40 my $options = shift;
41
42 $options ||= {};
43
44 # Apply defaults
45 %$options = ( %Defaults, %$options );
46
47 my $self = $class->SUPER::new($options);
48
49 my %cache_opts;
50 $cache_opts{ttl} = $self->cache_ttl;
51 $cache_opts{directory} = $self->cache_dir if $self->cache_dir;
52 $cache_opts{enabled} = !$self->update;
53
54 my $cache = App::Cache->new( \%cache_opts );
55 $self->cache($cache);
56
57 $self->_update_database();
58
59 return $self;
60}
61
62sub _dbh {
63 my $self = shift;
64 return $self->schema->storage->dbh;
65}
66
67sub _log {
68 my $self = shift;
69 return unless $self->debug;
2593d57c 70 print STDOUT @_, "\n";
b67ffc2e 71}
72
73sub _update_database {
74 my $self = shift;
75
76 my $cache = $self->cache;
77 my $db_file = Path::Class::file($cache->directory, "backpan.sqlite");
78
79 my $should_update_db;
80 if( ! -e $db_file ) {
81 $should_update_db = 1;
82 }
83 elsif( defined $self->update ) {
84 $should_update_db = $self->update;
85 }
86 else {
87 # Check the database file before we connect to it. Connecting will create
88 # the file.
89 # XXX Should probably just put a timestamp in the DB
90 my $db_mtime = $db_file->stat->mtime;
91 my $db_age = time - $db_mtime;
92 $should_update_db = ($db_age > $cache->ttl);
93
94 # No matter what, update the DB if we got a new index file.
95 my $archive_mtime = -e $self->_backpan_index_archive ? $self->_backpan_index_archive->stat->mtime : 0;
96 $should_update_db = 1 if $db_mtime < $archive_mtime;
97 }
98
b67ffc2e 99 $self->schema( BackPAN::Index::Schema->connect("dbi:SQLite:dbname=$db_file") );
100 return unless $should_update_db;
101
102 # Delay loading it into memory until we need it
103 $self->_log("Fetching BackPAN index...");
104 $self->_get_backpan_index;
105 $self->_log("Done.");
106
107 $self->_setup_database;
108
b67ffc2e 109 my $dbh = $self->_dbh;
110
111 $self->_log("Populating database...");
112 $dbh->begin_work;
113
114 # Get it out of the hot loop.
115 my $only_authors = $self->releases_only_from_authors;
116
117 my $insert_file_sth = $dbh->prepare(q[
118 INSERT INTO files
119 (path, date, size)
120 VALUES (?, ?, ? )
121 ]);
122
123 my $insert_release_sth = $dbh->prepare(q[
124 INSERT INTO releases
125 (path, dist, version, date, size, maturity, cpanid, distvname)
126 VALUES (?, ?, ?, ?, ?, ?, ?, ? )
127 ]);
128
129 my $insert_dist_sth = $dbh->prepare(q[
130 INSERT INTO dists
131 (name, num_releases,
132 first_release, first_date, first_author,
133 latest_release, latest_date, latest_author)
134 VALUES (?, ?,
135 ?, ?, ?,
136 ?, ?, ?)
137 ]);
138
139 my %dists;
140 my %files;
141 open my $fh, $self->_backpan_index_file;
142 while( my $line = <$fh> ) {
143 chomp $line;
144 my ( $path, $date, $size, @junk ) = split ' ', $line;
145
146 if( $files{$path}++ ) {
147 $self->_log("Duplicate file $path in index, ignoring");
148 next;
149 }
150
151 if( !defined $path or !defined $date or !defined $size or @junk ) {
152 $self->_log("Bad data read at line $.: $line");
153 next;
154 }
155
156 next unless $size;
157 next if $only_authors and $path !~ m{^authors/};
158
159 $insert_file_sth->execute($path, $date, $size);
160
161 next if $path =~ /\.(readme|meta)$/;
162
163 my $i = CPAN::DistnameInfo->new( $path );
164
165 my $dist = $i->dist;
166 next unless $i->dist;
167
168 $insert_release_sth->execute(
169 $path,
170 $dist,
171 $i->version || '',
172 $date,
173 $size,
174 $i->maturity,
175 $i->cpanid,
176 $i->distvname,
177 );
178
179
180 # Update aggregate data about dists
181 my $distdata = ($dists{$dist} ||= { name => $dist });
182
183 if( !defined $distdata->{first_release} ||
184 $date < $distdata->{first_date} )
185 {
186 $distdata->{first_release} = $path;
187 $distdata->{first_author} = $i->cpanid;
188 $distdata->{first_date} = $date;
189 }
190
191 if( !defined $distdata->{latest_release} ||
192 $date > $distdata->{latest_date} )
193 {
194 $distdata->{latest_release} = $path;
195 $distdata->{latest_author} = $i->cpanid;
196 $distdata->{latest_date} = $date;
197 }
198
199 $distdata->{num_releases}++;
200 }
201
202 for my $dist (values %dists) {
203 $insert_dist_sth->execute(
204 @{$dist}
205 {qw(name num_releases
206 first_release first_date first_author
207 latest_release latest_date latest_author
208 )}
209 );
210 }
211
212 # Add indexes after inserting so as not to slow down the inserts
213 $self->_add_indexes;
214
215 $dbh->commit;
216
217 $self->_log("Done.");
218
219 return;
220}
221
222
223sub _database_is_empty {
224 my $self = shift;
225
226 return 1 unless $self->files->count;
227 return 1 unless $self->releases->count;
228 return 0;
229}
230
231
232# This is denormalized for performance, its read-only anyway
233sub _setup_database {
234 my $self = shift;
235
236 my %create_for = (
237 files => <<'SQL',
238CREATE TABLE IF NOT EXISTS files (
239 path TEXT NOT NULL PRIMARY KEY,
240 date INTEGER NOT NULL,
241 size INTEGER NOT NULL CHECK ( size >= 0 )
242)
243SQL
244 releases => <<'SQL',
245CREATE TABLE IF NOT EXISTS releases (
246 path TEXT NOT NULL PRIMARY KEY REFERENCES files,
247 dist TEXT NOT NULL REFERENCES dists,
248 date INTEGER NOT NULL,
249 size TEXT NOT NULL,
250 version TEXT NOT NULL,
251 maturity TEXT NOT NULL,
252 distvname TEXT NOT NULL,
253 cpanid TEXT NOT NULL
254)
255SQL
256
257 dists => <<'SQL',
258CREATE TABLE IF NOT EXISTS dists (
259 name TEXT NOT NULL PRIMARY KEY,
260 first_release TEXT NOT NULL REFERENCES releases,
261 latest_release TEXT NOT NULL REFERENCES releases,
262 first_date INTEGER NOT NULL,
263 latest_date INTEGER NOT NULL,
264 first_author TEXT NOT NULL,
265 latest_author TEXT NOT NULL,
266 num_releases INTEGER NOT NULL
267)
268SQL
269);
c4e00ace 270 my %delete_for = (
271 files => 'DELETE FROM files',
272 releases => 'DELETE FROM releases',
273 dists => 'DELETE FROM dists',
274 );
b67ffc2e 275
276 my $dbh = $self->_dbh;
c4e00ace 277 for my $table_name (keys %create_for) {
278 $dbh->do($create_for{$table_name});
279 $dbh->do($delete_for{$table_name});
b67ffc2e 280 }
281
282 $self->schema->rescan;
283
284 return;
285}
286
287
288sub _add_indexes {
289 my $self = shift;
290
291 my @indexes = (
292 # Speed up dists_by several orders of magnitude
293 "CREATE INDEX IF NOT EXISTS dists_by ON releases (cpanid, dist)",
294
295 # Speed up files_by a lot
296 "CREATE INDEX IF NOT EXISTS files_by ON releases (cpanid, path)",
297
298 # Let us order releases by date quickly
299 "CREATE INDEX IF NOT EXISTS releases_by_date ON releases (date, dist)",
300 );
301 my $dbh = $self->_dbh;
302 for my $sql (@indexes) {
303 $dbh->do($sql);
304 }
305}
306
307
308sub _get_backpan_index {
309 my $self = shift;
310
311 my $url = $self->backpan_index_url;
312
313 return if !$self->_backpan_index_has_changed;
314
315 my $status = getstore($url, $self->_backpan_index_archive.'');
316 die "Error fetching $url: $status" unless is_success($status);
317
318 # Faster
319 local $Archive::Extract::PREFER_BIN = 1;
320
321 # Archive::Extract is vulnerable to the ORS.
322 local $\;
323
324 my $ae = Archive::Extract->new( archive => $self->_backpan_index_archive );
325 $ae->extract( to => $self->_backpan_index_file )
326 or die "Problem extracting @{[ $self->_backpan_index_archive ]}: @{[ $ae->error ]}";
327
328 # If the backpan index age is older than the TTL this prevents us
329 # from immediately looking again.
330 # XXX Should probably use a "last checked" semaphore file
331 $self->_backpan_index_file->touch;
332
333 return;
334}
335
336
337sub _backpan_index_archive {
338 my $self = shift;
339
340 my $file = URI->new($self->backpan_index_url)->path;
341 $file = Path::Class::file($file)->basename;
342 return Path::Class::file($file)->absolute($self->cache->directory);
343}
344
345
346sub _backpan_index_file {
347 my $self = shift;
348
349 my $file = $self->_backpan_index_archive;
350 $file =~ s{\.[^.]+$}{};
351
352 return Path::Class::file($file);
353}
354
355
356sub _backpan_index_has_changed {
357 my $self = shift;
358
359 my $file = $self->_backpan_index_file;
360 return 1 unless -e $file;
361
362 my $local_mod_time = stat($file)->mtime;
363 my $local_age = time - $local_mod_time;
364 return 0 unless $local_age > $self->cache->ttl;
365
366 # We looked, don't have to look again until the ttl is up.
367 $self->_backpan_index_file->touch;
368
369 my(undef, undef, $remote_mod_time) = head($self->backpan_index_url);
55f9d76f 370 return defined $remote_mod_time && $remote_mod_time > $local_mod_time;
b67ffc2e 371}
372
373
374sub files {
375 my $self = shift;
376 return $self->schema->resultset('File');
377}
378
379
380sub dist {
381 my($self, $dist) = @_;
382
383 return $self->dists->single({ name => $dist });
384}
385
386
387sub releases {
388 my($self, $dist) = @_;
389
390 return $self->schema->resultset("Release") unless defined $dist;
391 return $self->schema->resultset("Release")->search({ dist => $dist });
392}
393
394
395sub release {
396 my($self, $dist, $version) = @_;
397
398 return $self->releases($dist)->single({ version => $version });
399}
400
401
402sub dists {
403 my $self = shift;
404
405 return $self->schema->resultset("Dist");
406}
407
408
409=head1 NAME
410
411BackPAN::Index - An interface to the BackPAN index
412
413=head1 SYNOPSIS
414
415 use BackPAN::Index;
416 my $backpan = BackPAN::Index->new;
417
418 # These are all DBIx::Class::ResultSet's
419 my $files = $backpan->files;
420 my $dists = $backpan->dists;
421 my $releases = $backpan->releases("Acme-Pony");
422
423 # Use DBIx::Class::ResultSet methods on them
424 my $release = $releases->single({ version => '1.23' });
425
426 my $dist = $backpan->dist("Test-Simple");
427 my $releases = $dist->releases;
428
429=head1 DESCRIPTION
430
431This downloads, caches and parses the BackPAN index into a local
432database for efficient querying.
433
434Its a pretty thin wrapper around DBIx::Class returning
435L<DBIx::Class::ResultSet> objects which makes it efficient and
436flexible.
437
438The Comprehensive Perl Archive Network (CPAN) is a very useful
439collection of Perl code. However, in order to keep CPAN relatively
440small, authors of modules can delete older versions of modules to only
441let CPAN have the latest version of a module. BackPAN is where these
442deleted modules are backed up. It's more like a full CPAN mirror, only
443without the deletions. This module provides an index of BackPAN and
444some handy methods.
445
446=head1 METHODS
447
448=head2 new
449
450 my $backpan = BackPAN::Index->new(\%options);
451
452Create a new object representing the BackPAN index.
453
454It will, if necessary, download the BackPAN index and compile it into
455a database for efficient storage. Initial creation is slow, but it
456will be cached.
457
458new() takes some options
459
460=head3 update
461
462Because it is rather large, BackPAN::Index caches a copy of the
463BackPAN index and builds a local database to speed access. This flag
464controls if the local index is updated.
465
466If true, forces an update of the BACKPAN index.
467
468If false, the index will never be updated even if the cache is
469expired. It will always create a new index if one does not exist.
470
471By default the index is cached and checked for updates according to
472C<<$backpan->cache_ttl>>.
473
474=head3 cache_ttl
475
476How many seconds before checking for an updated index.
477
478Defaults to an hour.
479
480=head3 debug
481
482If true, debug messages will be printed.
483
484Defaults to false.
485
486=head3 releases_only_from_authors
487
488If true, only files in the C<authors> directory will be considered as
489releases. If false any file in the index may be considered for a
490release.
491
492Defaults to true.
493
494=head3 cache_dir
495
496Location of the cache directory.
497
498Defaults to whatever L<App::Cache> does.
499
500=head3 backpan_index_url
501
502URL to the BackPAN index.
503
504Defaults to a sensible location.
505
506
507=head2 files
508
509 my $files = $backpan->files;
510
511Returns a ResultSet representing all the files on BackPAN.
512
513=head2 files_by
514
515 my $files = $backpan->files_by($cpanid);
516 my @files = $backpan->files_by($cpanid);
517
518Returns all the files by a given $cpanid.
519
520Returns either a list of BackPAN::Index::Files or a ResultSet.
521
522=cut
523
524sub files_by {
525 my $self = shift;
526 my $cpanid = shift;
527
528 return $self->files->search({ "releases.cpanid" => $cpanid }, { join => "releases" });
529}
530
531=head2 dists
532
533 my $dists = $backpan->dists;
534
535Returns a ResultSet representing all the distributions on BackPAN.
536
537=head2 dist
538
539 my $dists = $backpan->dist($dist_name);
540
541Returns a single BackPAN::Index::Dist object for $dist_name.
542
543=head2 dists_by
544
545 my $dists = $backpan->dists_by($cpanid);
546 my @dists = $backpan->dists_by($cpanid);
547
548Returns the dists which contain at least one release by the given
549$cpanid.
550
551Returns either a ResultSet or a list of the Dists.
552
553=cut
554
555sub dists_by {
556 my $self = shift;
557 my $cpanid = shift;
558
559 return $self->dists->search({ "releases.cpanid" => $cpanid }, { join => "releases", distinct => 1 });
560}
561
562
563=head2 dists_changed_since
564
565 my $dists = $backpan->dists_changed_since($time);
566
567Returns a ResultSet of distributions which have had releases at or after after $time.
568
569=cut
570
571sub dists_changed_since {
572 my $self = shift;
573 my $time = shift;
574
575 return $self->dists->search( latest_date => \">= $time" );
576}
577
578=head2 releases
579
580 my $all_releases = $backpan->releases();
581 my $dist_releases = $backpan->releases($dist_name);
582
583Returns a ResultSet representing all the releases on BackPAN. If a
584$dist_name is given it returns the releases of just one distribution.
585
586=head2 release
587
588 my $release = $backpan->release($dist_name, $version);
589
590Returns a single BackPAN::Index::Release object for the given
591$dist_name and $version.
592
593=head2 releases_by
594
595 my $releases = $backpan->releases_by($cpanid);
596 my @releases = $backpan->releases_by($cpanid);
597
598Returns all the releases of a single author.
599
600Returns either a list of Releases or a ResultSet representing those releases.
601
602=cut
603
604sub releases_by {
605 my $self = shift;
606 my $cpanid = shift;
607
608 return $self->releases->search({ cpanid => $cpanid });
609}
610
611
612=head2 releases_since
613
614 my $releases = $backpan->releases_since($time);
615
616Returns a ResultSet of releases which were released at or after $time.
617
618=cut
619
620sub releases_since {
621 my $self = shift;
622 my $time = shift;
623
624 return $self->releases->search( date => \">= $time" );
625}
626
627
628=head1 EXAMPLES
629
630The real power of BackPAN::Index comes from L<DBIx::Class::ResultSet>.
631Its very flexible and very powerful but not always obvious how to get
632it to do things. Here's some examples.
633
634 # How many files are on BackPAN?
635 my $count = $backpan->files->count;
636
637 # How big is BackPAN?
638 my $size = $backpan->files->get_column("size")->sum;
639
640 # What are the names of all the distributions?
641 my @names = $backpan->dists->get_column("name")->all;
642
643 # What path contains this release?
644 my $path = $backpan->release("Acme-Pony", 1.01)->path;
645
646 # Get all the releases of Moose ordered by version
647 my @releases = $backpan->dist("Moose")->releases
648 ->search(undef, { order_by => "version" });
649
650=head1 AUTHOR
651
652Michael G Schwern <schwern@pobox.com>
653
654=head1 COPYRIGHT
655
656Copyright 2009, Michael G Schwern
657
658=head1 LICENSE
659
660This module is free software; you can redistribute it or modify it under
661the same terms as Perl itself.
662
663=head1 SEE ALSO
664
665L<DBIx::Class::ResultSet>, L<BackPAN::Index::File>,
666L<BackPAN::Index::Release>, L<BackPAN::Index::Dist>
667
668Repository: L<http://github.com/acme/parse-backpan-packages>
669Bugs: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Parse-BACKPAN-Packages>
670
671=cut
672
6731;