BackPAN::Index logs to STDOUT now
[catagits/BackPAN-Web.git] / lib / BackPAN / Index.pm
CommitLineData
b67ffc2e 1package BackPAN::Index;
2
3use strict;
4use warnings;
5
6our $VERSION = '0.39';
7
8use autodie;
9use App::Cache 0.37;
10use CPAN::DistnameInfo 0.09;
11use LWP::Simple qw(getstore head is_success);
12use Archive::Extract;
13use Path::Class ();
14use File::stat;
15use BackPAN::Index::Schema;
16
17use parent qw( Class::Accessor::Fast );
18
19__PACKAGE__->mk_accessors(qw(
20 update
21 cache_ttl
22 debug
23 releases_only_from_authors
24 cache_dir
25 backpan_index_url
26
27 backpan_index schema cache
28));
29
30my %Defaults = (
31 backpan_index_url => "http://www.astray.com/tmp/backpan.txt.gz",
32 releases_only_from_authors => 1,
33 debug => 0,
34 cache_ttl => 60 * 60,
35);
36
37
38sub new {
39 my $class = shift;
40 my $options = shift;
41
42 $options ||= {};
43
44 # Apply defaults
45 %$options = ( %Defaults, %$options );
46
47 my $self = $class->SUPER::new($options);
48
49 my %cache_opts;
50 $cache_opts{ttl} = $self->cache_ttl;
51 $cache_opts{directory} = $self->cache_dir if $self->cache_dir;
52 $cache_opts{enabled} = !$self->update;
53
54 my $cache = App::Cache->new( \%cache_opts );
55 $self->cache($cache);
56
57 $self->_update_database();
58
59 return $self;
60}
61
62sub _dbh {
63 my $self = shift;
64 return $self->schema->storage->dbh;
65}
66
67sub _log {
68 my $self = shift;
69 return unless $self->debug;
2593d57c 70 print STDOUT @_, "\n";
b67ffc2e 71}
72
73sub _update_database {
74 my $self = shift;
75
76 my $cache = $self->cache;
77 my $db_file = Path::Class::file($cache->directory, "backpan.sqlite");
78
79 my $should_update_db;
80 if( ! -e $db_file ) {
81 $should_update_db = 1;
82 }
83 elsif( defined $self->update ) {
84 $should_update_db = $self->update;
85 }
86 else {
87 # Check the database file before we connect to it. Connecting will create
88 # the file.
89 # XXX Should probably just put a timestamp in the DB
90 my $db_mtime = $db_file->stat->mtime;
91 my $db_age = time - $db_mtime;
92 $should_update_db = ($db_age > $cache->ttl);
93
94 # No matter what, update the DB if we got a new index file.
95 my $archive_mtime = -e $self->_backpan_index_archive ? $self->_backpan_index_archive->stat->mtime : 0;
96 $should_update_db = 1 if $db_mtime < $archive_mtime;
97 }
98
99 unlink $db_file if -e $db_file and $should_update_db;
100
101 $self->schema( BackPAN::Index::Schema->connect("dbi:SQLite:dbname=$db_file") );
102 return unless $should_update_db;
103
104 # Delay loading it into memory until we need it
105 $self->_log("Fetching BackPAN index...");
106 $self->_get_backpan_index;
107 $self->_log("Done.");
108
109 $self->_setup_database;
110
111 $should_update_db = 1 if $self->_database_is_empty;
112
113 return unless $should_update_db;
114
115 my $dbh = $self->_dbh;
116
117 $self->_log("Populating database...");
118 $dbh->begin_work;
119
120 # Get it out of the hot loop.
121 my $only_authors = $self->releases_only_from_authors;
122
123 my $insert_file_sth = $dbh->prepare(q[
124 INSERT INTO files
125 (path, date, size)
126 VALUES (?, ?, ? )
127 ]);
128
129 my $insert_release_sth = $dbh->prepare(q[
130 INSERT INTO releases
131 (path, dist, version, date, size, maturity, cpanid, distvname)
132 VALUES (?, ?, ?, ?, ?, ?, ?, ? )
133 ]);
134
135 my $insert_dist_sth = $dbh->prepare(q[
136 INSERT INTO dists
137 (name, num_releases,
138 first_release, first_date, first_author,
139 latest_release, latest_date, latest_author)
140 VALUES (?, ?,
141 ?, ?, ?,
142 ?, ?, ?)
143 ]);
144
145 my %dists;
146 my %files;
147 open my $fh, $self->_backpan_index_file;
148 while( my $line = <$fh> ) {
149 chomp $line;
150 my ( $path, $date, $size, @junk ) = split ' ', $line;
151
152 if( $files{$path}++ ) {
153 $self->_log("Duplicate file $path in index, ignoring");
154 next;
155 }
156
157 if( !defined $path or !defined $date or !defined $size or @junk ) {
158 $self->_log("Bad data read at line $.: $line");
159 next;
160 }
161
162 next unless $size;
163 next if $only_authors and $path !~ m{^authors/};
164
165 $insert_file_sth->execute($path, $date, $size);
166
167 next if $path =~ /\.(readme|meta)$/;
168
169 my $i = CPAN::DistnameInfo->new( $path );
170
171 my $dist = $i->dist;
172 next unless $i->dist;
173
174 $insert_release_sth->execute(
175 $path,
176 $dist,
177 $i->version || '',
178 $date,
179 $size,
180 $i->maturity,
181 $i->cpanid,
182 $i->distvname,
183 );
184
185
186 # Update aggregate data about dists
187 my $distdata = ($dists{$dist} ||= { name => $dist });
188
189 if( !defined $distdata->{first_release} ||
190 $date < $distdata->{first_date} )
191 {
192 $distdata->{first_release} = $path;
193 $distdata->{first_author} = $i->cpanid;
194 $distdata->{first_date} = $date;
195 }
196
197 if( !defined $distdata->{latest_release} ||
198 $date > $distdata->{latest_date} )
199 {
200 $distdata->{latest_release} = $path;
201 $distdata->{latest_author} = $i->cpanid;
202 $distdata->{latest_date} = $date;
203 }
204
205 $distdata->{num_releases}++;
206 }
207
208 for my $dist (values %dists) {
209 $insert_dist_sth->execute(
210 @{$dist}
211 {qw(name num_releases
212 first_release first_date first_author
213 latest_release latest_date latest_author
214 )}
215 );
216 }
217
218 # Add indexes after inserting so as not to slow down the inserts
219 $self->_add_indexes;
220
221 $dbh->commit;
222
223 $self->_log("Done.");
224
225 return;
226}
227
228
229sub _database_is_empty {
230 my $self = shift;
231
232 return 1 unless $self->files->count;
233 return 1 unless $self->releases->count;
234 return 0;
235}
236
237
238# This is denormalized for performance, its read-only anyway
239sub _setup_database {
240 my $self = shift;
241
242 my %create_for = (
243 files => <<'SQL',
244CREATE TABLE IF NOT EXISTS files (
245 path TEXT NOT NULL PRIMARY KEY,
246 date INTEGER NOT NULL,
247 size INTEGER NOT NULL CHECK ( size >= 0 )
248)
249SQL
250 releases => <<'SQL',
251CREATE TABLE IF NOT EXISTS releases (
252 path TEXT NOT NULL PRIMARY KEY REFERENCES files,
253 dist TEXT NOT NULL REFERENCES dists,
254 date INTEGER NOT NULL,
255 size TEXT NOT NULL,
256 version TEXT NOT NULL,
257 maturity TEXT NOT NULL,
258 distvname TEXT NOT NULL,
259 cpanid TEXT NOT NULL
260)
261SQL
262
263 dists => <<'SQL',
264CREATE TABLE IF NOT EXISTS dists (
265 name TEXT NOT NULL PRIMARY KEY,
266 first_release TEXT NOT NULL REFERENCES releases,
267 latest_release TEXT NOT NULL REFERENCES releases,
268 first_date INTEGER NOT NULL,
269 latest_date INTEGER NOT NULL,
270 first_author TEXT NOT NULL,
271 latest_author TEXT NOT NULL,
272 num_releases INTEGER NOT NULL
273)
274SQL
275);
276
277 my $dbh = $self->_dbh;
278 for my $sql (values %create_for) {
279 $dbh->do($sql);
280 }
281
282 $self->schema->rescan;
283
284 return;
285}
286
287
288sub _add_indexes {
289 my $self = shift;
290
291 my @indexes = (
292 # Speed up dists_by several orders of magnitude
293 "CREATE INDEX IF NOT EXISTS dists_by ON releases (cpanid, dist)",
294
295 # Speed up files_by a lot
296 "CREATE INDEX IF NOT EXISTS files_by ON releases (cpanid, path)",
297
298 # Let us order releases by date quickly
299 "CREATE INDEX IF NOT EXISTS releases_by_date ON releases (date, dist)",
300 );
301 my $dbh = $self->_dbh;
302 for my $sql (@indexes) {
303 $dbh->do($sql);
304 }
305}
306
307
308sub _get_backpan_index {
309 my $self = shift;
310
311 my $url = $self->backpan_index_url;
312
313 return if !$self->_backpan_index_has_changed;
314
315 my $status = getstore($url, $self->_backpan_index_archive.'');
316 die "Error fetching $url: $status" unless is_success($status);
317
318 # Faster
319 local $Archive::Extract::PREFER_BIN = 1;
320
321 # Archive::Extract is vulnerable to the ORS.
322 local $\;
323
324 my $ae = Archive::Extract->new( archive => $self->_backpan_index_archive );
325 $ae->extract( to => $self->_backpan_index_file )
326 or die "Problem extracting @{[ $self->_backpan_index_archive ]}: @{[ $ae->error ]}";
327
328 # If the backpan index age is older than the TTL this prevents us
329 # from immediately looking again.
330 # XXX Should probably use a "last checked" semaphore file
331 $self->_backpan_index_file->touch;
332
333 return;
334}
335
336
337sub _backpan_index_archive {
338 my $self = shift;
339
340 my $file = URI->new($self->backpan_index_url)->path;
341 $file = Path::Class::file($file)->basename;
342 return Path::Class::file($file)->absolute($self->cache->directory);
343}
344
345
346sub _backpan_index_file {
347 my $self = shift;
348
349 my $file = $self->_backpan_index_archive;
350 $file =~ s{\.[^.]+$}{};
351
352 return Path::Class::file($file);
353}
354
355
356sub _backpan_index_has_changed {
357 my $self = shift;
358
359 my $file = $self->_backpan_index_file;
360 return 1 unless -e $file;
361
362 my $local_mod_time = stat($file)->mtime;
363 my $local_age = time - $local_mod_time;
364 return 0 unless $local_age > $self->cache->ttl;
365
366 # We looked, don't have to look again until the ttl is up.
367 $self->_backpan_index_file->touch;
368
369 my(undef, undef, $remote_mod_time) = head($self->backpan_index_url);
370 return $remote_mod_time > $local_mod_time;
371}
372
373
374sub files {
375 my $self = shift;
376 return $self->schema->resultset('File');
377}
378
379
380sub dist {
381 my($self, $dist) = @_;
382
383 return $self->dists->single({ name => $dist });
384}
385
386
387sub releases {
388 my($self, $dist) = @_;
389
390 return $self->schema->resultset("Release") unless defined $dist;
391 return $self->schema->resultset("Release")->search({ dist => $dist });
392}
393
394
395sub release {
396 my($self, $dist, $version) = @_;
397
398 return $self->releases($dist)->single({ version => $version });
399}
400
401
402sub dists {
403 my $self = shift;
404
405 return $self->schema->resultset("Dist");
406}
407
408
409=head1 NAME
410
411BackPAN::Index - An interface to the BackPAN index
412
413=head1 SYNOPSIS
414
415 use BackPAN::Index;
416 my $backpan = BackPAN::Index->new;
417
418 # These are all DBIx::Class::ResultSet's
419 my $files = $backpan->files;
420 my $dists = $backpan->dists;
421 my $releases = $backpan->releases("Acme-Pony");
422
423 # Use DBIx::Class::ResultSet methods on them
424 my $release = $releases->single({ version => '1.23' });
425
426 my $dist = $backpan->dist("Test-Simple");
427 my $releases = $dist->releases;
428
429=head1 DESCRIPTION
430
431This downloads, caches and parses the BackPAN index into a local
432database for efficient querying.
433
434Its a pretty thin wrapper around DBIx::Class returning
435L<DBIx::Class::ResultSet> objects which makes it efficient and
436flexible.
437
438The Comprehensive Perl Archive Network (CPAN) is a very useful
439collection of Perl code. However, in order to keep CPAN relatively
440small, authors of modules can delete older versions of modules to only
441let CPAN have the latest version of a module. BackPAN is where these
442deleted modules are backed up. It's more like a full CPAN mirror, only
443without the deletions. This module provides an index of BackPAN and
444some handy methods.
445
446=head1 METHODS
447
448=head2 new
449
450 my $backpan = BackPAN::Index->new(\%options);
451
452Create a new object representing the BackPAN index.
453
454It will, if necessary, download the BackPAN index and compile it into
455a database for efficient storage. Initial creation is slow, but it
456will be cached.
457
458new() takes some options
459
460=head3 update
461
462Because it is rather large, BackPAN::Index caches a copy of the
463BackPAN index and builds a local database to speed access. This flag
464controls if the local index is updated.
465
466If true, forces an update of the BACKPAN index.
467
468If false, the index will never be updated even if the cache is
469expired. It will always create a new index if one does not exist.
470
471By default the index is cached and checked for updates according to
472C<<$backpan->cache_ttl>>.
473
474=head3 cache_ttl
475
476How many seconds before checking for an updated index.
477
478Defaults to an hour.
479
480=head3 debug
481
482If true, debug messages will be printed.
483
484Defaults to false.
485
486=head3 releases_only_from_authors
487
488If true, only files in the C<authors> directory will be considered as
489releases. If false any file in the index may be considered for a
490release.
491
492Defaults to true.
493
494=head3 cache_dir
495
496Location of the cache directory.
497
498Defaults to whatever L<App::Cache> does.
499
500=head3 backpan_index_url
501
502URL to the BackPAN index.
503
504Defaults to a sensible location.
505
506
507=head2 files
508
509 my $files = $backpan->files;
510
511Returns a ResultSet representing all the files on BackPAN.
512
513=head2 files_by
514
515 my $files = $backpan->files_by($cpanid);
516 my @files = $backpan->files_by($cpanid);
517
518Returns all the files by a given $cpanid.
519
520Returns either a list of BackPAN::Index::Files or a ResultSet.
521
522=cut
523
524sub files_by {
525 my $self = shift;
526 my $cpanid = shift;
527
528 return $self->files->search({ "releases.cpanid" => $cpanid }, { join => "releases" });
529}
530
531=head2 dists
532
533 my $dists = $backpan->dists;
534
535Returns a ResultSet representing all the distributions on BackPAN.
536
537=head2 dist
538
539 my $dists = $backpan->dist($dist_name);
540
541Returns a single BackPAN::Index::Dist object for $dist_name.
542
543=head2 dists_by
544
545 my $dists = $backpan->dists_by($cpanid);
546 my @dists = $backpan->dists_by($cpanid);
547
548Returns the dists which contain at least one release by the given
549$cpanid.
550
551Returns either a ResultSet or a list of the Dists.
552
553=cut
554
555sub dists_by {
556 my $self = shift;
557 my $cpanid = shift;
558
559 return $self->dists->search({ "releases.cpanid" => $cpanid }, { join => "releases", distinct => 1 });
560}
561
562
563=head2 dists_changed_since
564
565 my $dists = $backpan->dists_changed_since($time);
566
567Returns a ResultSet of distributions which have had releases at or after after $time.
568
569=cut
570
571sub dists_changed_since {
572 my $self = shift;
573 my $time = shift;
574
575 return $self->dists->search( latest_date => \">= $time" );
576}
577
578=head2 releases
579
580 my $all_releases = $backpan->releases();
581 my $dist_releases = $backpan->releases($dist_name);
582
583Returns a ResultSet representing all the releases on BackPAN. If a
584$dist_name is given it returns the releases of just one distribution.
585
586=head2 release
587
588 my $release = $backpan->release($dist_name, $version);
589
590Returns a single BackPAN::Index::Release object for the given
591$dist_name and $version.
592
593=head2 releases_by
594
595 my $releases = $backpan->releases_by($cpanid);
596 my @releases = $backpan->releases_by($cpanid);
597
598Returns all the releases of a single author.
599
600Returns either a list of Releases or a ResultSet representing those releases.
601
602=cut
603
604sub releases_by {
605 my $self = shift;
606 my $cpanid = shift;
607
608 return $self->releases->search({ cpanid => $cpanid });
609}
610
611
612=head2 releases_since
613
614 my $releases = $backpan->releases_since($time);
615
616Returns a ResultSet of releases which were released at or after $time.
617
618=cut
619
620sub releases_since {
621 my $self = shift;
622 my $time = shift;
623
624 return $self->releases->search( date => \">= $time" );
625}
626
627
628=head1 EXAMPLES
629
630The real power of BackPAN::Index comes from L<DBIx::Class::ResultSet>.
631Its very flexible and very powerful but not always obvious how to get
632it to do things. Here's some examples.
633
634 # How many files are on BackPAN?
635 my $count = $backpan->files->count;
636
637 # How big is BackPAN?
638 my $size = $backpan->files->get_column("size")->sum;
639
640 # What are the names of all the distributions?
641 my @names = $backpan->dists->get_column("name")->all;
642
643 # What path contains this release?
644 my $path = $backpan->release("Acme-Pony", 1.01)->path;
645
646 # Get all the releases of Moose ordered by version
647 my @releases = $backpan->dist("Moose")->releases
648 ->search(undef, { order_by => "version" });
649
650=head1 AUTHOR
651
652Michael G Schwern <schwern@pobox.com>
653
654=head1 COPYRIGHT
655
656Copyright 2009, Michael G Schwern
657
658=head1 LICENSE
659
660This module is free software; you can redistribute it or modify it under
661the same terms as Perl itself.
662
663=head1 SEE ALSO
664
665L<DBIx::Class::ResultSet>, L<BackPAN::Index::File>,
666L<BackPAN::Index::Release>, L<BackPAN::Index::Dist>
667
668Repository: L<http://github.com/acme/parse-backpan-packages>
669Bugs: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Parse-BACKPAN-Packages>
670
671=cut
672
6731;