Bump
[catagits/BackPAN-Web.git] / lib / BackPAN / Index.pm
CommitLineData
b67ffc2e 1package BackPAN::Index;
2
3use strict;
4use warnings;
5
6our $VERSION = '0.39';
7
8use autodie;
9use App::Cache 0.37;
10use CPAN::DistnameInfo 0.09;
11use LWP::Simple qw(getstore head is_success);
12use Archive::Extract;
13use Path::Class ();
14use File::stat;
15use BackPAN::Index::Schema;
16
17use parent qw( Class::Accessor::Fast );
18
19__PACKAGE__->mk_accessors(qw(
20 update
21 cache_ttl
22 debug
23 releases_only_from_authors
24 cache_dir
25 backpan_index_url
26
27 backpan_index schema cache
28));
29
30my %Defaults = (
31 backpan_index_url => "http://www.astray.com/tmp/backpan.txt.gz",
32 releases_only_from_authors => 1,
33 debug => 0,
34 cache_ttl => 60 * 60,
35);
36
37
38sub new {
39 my $class = shift;
40 my $options = shift;
41
42 $options ||= {};
43
44 # Apply defaults
45 %$options = ( %Defaults, %$options );
46
47 my $self = $class->SUPER::new($options);
48
49 my %cache_opts;
50 $cache_opts{ttl} = $self->cache_ttl;
51 $cache_opts{directory} = $self->cache_dir if $self->cache_dir;
52 $cache_opts{enabled} = !$self->update;
53
54 my $cache = App::Cache->new( \%cache_opts );
55 $self->cache($cache);
56
57 $self->_update_database();
58
59 return $self;
60}
61
62sub _dbh {
63 my $self = shift;
64 return $self->schema->storage->dbh;
65}
66
67sub _log {
68 my $self = shift;
69 return unless $self->debug;
2593d57c 70 print STDOUT @_, "\n";
b67ffc2e 71}
72
73sub _update_database {
74 my $self = shift;
75
76 my $cache = $self->cache;
77 my $db_file = Path::Class::file($cache->directory, "backpan.sqlite");
78
79 my $should_update_db;
80 if( ! -e $db_file ) {
81 $should_update_db = 1;
82 }
83 elsif( defined $self->update ) {
84 $should_update_db = $self->update;
85 }
86 else {
87 # Check the database file before we connect to it. Connecting will create
88 # the file.
89 # XXX Should probably just put a timestamp in the DB
90 my $db_mtime = $db_file->stat->mtime;
91 my $db_age = time - $db_mtime;
92 $should_update_db = ($db_age > $cache->ttl);
93
94 # No matter what, update the DB if we got a new index file.
95 my $archive_mtime = -e $self->_backpan_index_archive ? $self->_backpan_index_archive->stat->mtime : 0;
96 $should_update_db = 1 if $db_mtime < $archive_mtime;
97 }
98
99 unlink $db_file if -e $db_file and $should_update_db;
100
101 $self->schema( BackPAN::Index::Schema->connect("dbi:SQLite:dbname=$db_file") );
102 return unless $should_update_db;
103
104 # Delay loading it into memory until we need it
105 $self->_log("Fetching BackPAN index...");
106 $self->_get_backpan_index;
107 $self->_log("Done.");
108
109 $self->_setup_database;
110
b67ffc2e 111 my $dbh = $self->_dbh;
112
113 $self->_log("Populating database...");
114 $dbh->begin_work;
115
116 # Get it out of the hot loop.
117 my $only_authors = $self->releases_only_from_authors;
118
119 my $insert_file_sth = $dbh->prepare(q[
120 INSERT INTO files
121 (path, date, size)
122 VALUES (?, ?, ? )
123 ]);
124
125 my $insert_release_sth = $dbh->prepare(q[
126 INSERT INTO releases
127 (path, dist, version, date, size, maturity, cpanid, distvname)
128 VALUES (?, ?, ?, ?, ?, ?, ?, ? )
129 ]);
130
131 my $insert_dist_sth = $dbh->prepare(q[
132 INSERT INTO dists
133 (name, num_releases,
134 first_release, first_date, first_author,
135 latest_release, latest_date, latest_author)
136 VALUES (?, ?,
137 ?, ?, ?,
138 ?, ?, ?)
139 ]);
140
141 my %dists;
142 my %files;
143 open my $fh, $self->_backpan_index_file;
144 while( my $line = <$fh> ) {
145 chomp $line;
146 my ( $path, $date, $size, @junk ) = split ' ', $line;
147
148 if( $files{$path}++ ) {
149 $self->_log("Duplicate file $path in index, ignoring");
150 next;
151 }
152
153 if( !defined $path or !defined $date or !defined $size or @junk ) {
154 $self->_log("Bad data read at line $.: $line");
155 next;
156 }
157
158 next unless $size;
159 next if $only_authors and $path !~ m{^authors/};
160
161 $insert_file_sth->execute($path, $date, $size);
162
163 next if $path =~ /\.(readme|meta)$/;
164
165 my $i = CPAN::DistnameInfo->new( $path );
166
167 my $dist = $i->dist;
168 next unless $i->dist;
169
170 $insert_release_sth->execute(
171 $path,
172 $dist,
173 $i->version || '',
174 $date,
175 $size,
176 $i->maturity,
177 $i->cpanid,
178 $i->distvname,
179 );
180
181
182 # Update aggregate data about dists
183 my $distdata = ($dists{$dist} ||= { name => $dist });
184
185 if( !defined $distdata->{first_release} ||
186 $date < $distdata->{first_date} )
187 {
188 $distdata->{first_release} = $path;
189 $distdata->{first_author} = $i->cpanid;
190 $distdata->{first_date} = $date;
191 }
192
193 if( !defined $distdata->{latest_release} ||
194 $date > $distdata->{latest_date} )
195 {
196 $distdata->{latest_release} = $path;
197 $distdata->{latest_author} = $i->cpanid;
198 $distdata->{latest_date} = $date;
199 }
200
201 $distdata->{num_releases}++;
202 }
203
204 for my $dist (values %dists) {
205 $insert_dist_sth->execute(
206 @{$dist}
207 {qw(name num_releases
208 first_release first_date first_author
209 latest_release latest_date latest_author
210 )}
211 );
212 }
213
214 # Add indexes after inserting so as not to slow down the inserts
215 $self->_add_indexes;
216
217 $dbh->commit;
218
219 $self->_log("Done.");
220
221 return;
222}
223
224
225sub _database_is_empty {
226 my $self = shift;
227
228 return 1 unless $self->files->count;
229 return 1 unless $self->releases->count;
230 return 0;
231}
232
233
234# This is denormalized for performance, its read-only anyway
235sub _setup_database {
236 my $self = shift;
237
238 my %create_for = (
239 files => <<'SQL',
240CREATE TABLE IF NOT EXISTS files (
241 path TEXT NOT NULL PRIMARY KEY,
242 date INTEGER NOT NULL,
243 size INTEGER NOT NULL CHECK ( size >= 0 )
244)
245SQL
246 releases => <<'SQL',
247CREATE TABLE IF NOT EXISTS releases (
248 path TEXT NOT NULL PRIMARY KEY REFERENCES files,
249 dist TEXT NOT NULL REFERENCES dists,
250 date INTEGER NOT NULL,
251 size TEXT NOT NULL,
252 version TEXT NOT NULL,
253 maturity TEXT NOT NULL,
254 distvname TEXT NOT NULL,
255 cpanid TEXT NOT NULL
256)
257SQL
258
259 dists => <<'SQL',
260CREATE TABLE IF NOT EXISTS dists (
261 name TEXT NOT NULL PRIMARY KEY,
262 first_release TEXT NOT NULL REFERENCES releases,
263 latest_release TEXT NOT NULL REFERENCES releases,
264 first_date INTEGER NOT NULL,
265 latest_date INTEGER NOT NULL,
266 first_author TEXT NOT NULL,
267 latest_author TEXT NOT NULL,
268 num_releases INTEGER NOT NULL
269)
270SQL
271);
272
273 my $dbh = $self->_dbh;
274 for my $sql (values %create_for) {
275 $dbh->do($sql);
276 }
277
278 $self->schema->rescan;
279
280 return;
281}
282
283
284sub _add_indexes {
285 my $self = shift;
286
287 my @indexes = (
288 # Speed up dists_by several orders of magnitude
289 "CREATE INDEX IF NOT EXISTS dists_by ON releases (cpanid, dist)",
290
291 # Speed up files_by a lot
292 "CREATE INDEX IF NOT EXISTS files_by ON releases (cpanid, path)",
293
294 # Let us order releases by date quickly
295 "CREATE INDEX IF NOT EXISTS releases_by_date ON releases (date, dist)",
296 );
297 my $dbh = $self->_dbh;
298 for my $sql (@indexes) {
299 $dbh->do($sql);
300 }
301}
302
303
304sub _get_backpan_index {
305 my $self = shift;
306
307 my $url = $self->backpan_index_url;
308
309 return if !$self->_backpan_index_has_changed;
310
311 my $status = getstore($url, $self->_backpan_index_archive.'');
312 die "Error fetching $url: $status" unless is_success($status);
313
314 # Faster
315 local $Archive::Extract::PREFER_BIN = 1;
316
317 # Archive::Extract is vulnerable to the ORS.
318 local $\;
319
320 my $ae = Archive::Extract->new( archive => $self->_backpan_index_archive );
321 $ae->extract( to => $self->_backpan_index_file )
322 or die "Problem extracting @{[ $self->_backpan_index_archive ]}: @{[ $ae->error ]}";
323
324 # If the backpan index age is older than the TTL this prevents us
325 # from immediately looking again.
326 # XXX Should probably use a "last checked" semaphore file
327 $self->_backpan_index_file->touch;
328
329 return;
330}
331
332
333sub _backpan_index_archive {
334 my $self = shift;
335
336 my $file = URI->new($self->backpan_index_url)->path;
337 $file = Path::Class::file($file)->basename;
338 return Path::Class::file($file)->absolute($self->cache->directory);
339}
340
341
342sub _backpan_index_file {
343 my $self = shift;
344
345 my $file = $self->_backpan_index_archive;
346 $file =~ s{\.[^.]+$}{};
347
348 return Path::Class::file($file);
349}
350
351
352sub _backpan_index_has_changed {
353 my $self = shift;
354
355 my $file = $self->_backpan_index_file;
356 return 1 unless -e $file;
357
358 my $local_mod_time = stat($file)->mtime;
359 my $local_age = time - $local_mod_time;
360 return 0 unless $local_age > $self->cache->ttl;
361
362 # We looked, don't have to look again until the ttl is up.
363 $self->_backpan_index_file->touch;
364
365 my(undef, undef, $remote_mod_time) = head($self->backpan_index_url);
55f9d76f 366 return defined $remote_mod_time && $remote_mod_time > $local_mod_time;
b67ffc2e 367}
368
369
370sub files {
371 my $self = shift;
372 return $self->schema->resultset('File');
373}
374
375
376sub dist {
377 my($self, $dist) = @_;
378
379 return $self->dists->single({ name => $dist });
380}
381
382
383sub releases {
384 my($self, $dist) = @_;
385
386 return $self->schema->resultset("Release") unless defined $dist;
387 return $self->schema->resultset("Release")->search({ dist => $dist });
388}
389
390
391sub release {
392 my($self, $dist, $version) = @_;
393
394 return $self->releases($dist)->single({ version => $version });
395}
396
397
398sub dists {
399 my $self = shift;
400
401 return $self->schema->resultset("Dist");
402}
403
404
405=head1 NAME
406
407BackPAN::Index - An interface to the BackPAN index
408
409=head1 SYNOPSIS
410
411 use BackPAN::Index;
412 my $backpan = BackPAN::Index->new;
413
414 # These are all DBIx::Class::ResultSet's
415 my $files = $backpan->files;
416 my $dists = $backpan->dists;
417 my $releases = $backpan->releases("Acme-Pony");
418
419 # Use DBIx::Class::ResultSet methods on them
420 my $release = $releases->single({ version => '1.23' });
421
422 my $dist = $backpan->dist("Test-Simple");
423 my $releases = $dist->releases;
424
425=head1 DESCRIPTION
426
427This downloads, caches and parses the BackPAN index into a local
428database for efficient querying.
429
430Its a pretty thin wrapper around DBIx::Class returning
431L<DBIx::Class::ResultSet> objects which makes it efficient and
432flexible.
433
434The Comprehensive Perl Archive Network (CPAN) is a very useful
435collection of Perl code. However, in order to keep CPAN relatively
436small, authors of modules can delete older versions of modules to only
437let CPAN have the latest version of a module. BackPAN is where these
438deleted modules are backed up. It's more like a full CPAN mirror, only
439without the deletions. This module provides an index of BackPAN and
440some handy methods.
441
442=head1 METHODS
443
444=head2 new
445
446 my $backpan = BackPAN::Index->new(\%options);
447
448Create a new object representing the BackPAN index.
449
450It will, if necessary, download the BackPAN index and compile it into
451a database for efficient storage. Initial creation is slow, but it
452will be cached.
453
454new() takes some options
455
456=head3 update
457
458Because it is rather large, BackPAN::Index caches a copy of the
459BackPAN index and builds a local database to speed access. This flag
460controls if the local index is updated.
461
462If true, forces an update of the BACKPAN index.
463
464If false, the index will never be updated even if the cache is
465expired. It will always create a new index if one does not exist.
466
467By default the index is cached and checked for updates according to
468C<<$backpan->cache_ttl>>.
469
470=head3 cache_ttl
471
472How many seconds before checking for an updated index.
473
474Defaults to an hour.
475
476=head3 debug
477
478If true, debug messages will be printed.
479
480Defaults to false.
481
482=head3 releases_only_from_authors
483
484If true, only files in the C<authors> directory will be considered as
485releases. If false any file in the index may be considered for a
486release.
487
488Defaults to true.
489
490=head3 cache_dir
491
492Location of the cache directory.
493
494Defaults to whatever L<App::Cache> does.
495
496=head3 backpan_index_url
497
498URL to the BackPAN index.
499
500Defaults to a sensible location.
501
502
503=head2 files
504
505 my $files = $backpan->files;
506
507Returns a ResultSet representing all the files on BackPAN.
508
509=head2 files_by
510
511 my $files = $backpan->files_by($cpanid);
512 my @files = $backpan->files_by($cpanid);
513
514Returns all the files by a given $cpanid.
515
516Returns either a list of BackPAN::Index::Files or a ResultSet.
517
518=cut
519
520sub files_by {
521 my $self = shift;
522 my $cpanid = shift;
523
524 return $self->files->search({ "releases.cpanid" => $cpanid }, { join => "releases" });
525}
526
527=head2 dists
528
529 my $dists = $backpan->dists;
530
531Returns a ResultSet representing all the distributions on BackPAN.
532
533=head2 dist
534
535 my $dists = $backpan->dist($dist_name);
536
537Returns a single BackPAN::Index::Dist object for $dist_name.
538
539=head2 dists_by
540
541 my $dists = $backpan->dists_by($cpanid);
542 my @dists = $backpan->dists_by($cpanid);
543
544Returns the dists which contain at least one release by the given
545$cpanid.
546
547Returns either a ResultSet or a list of the Dists.
548
549=cut
550
551sub dists_by {
552 my $self = shift;
553 my $cpanid = shift;
554
555 return $self->dists->search({ "releases.cpanid" => $cpanid }, { join => "releases", distinct => 1 });
556}
557
558
559=head2 dists_changed_since
560
561 my $dists = $backpan->dists_changed_since($time);
562
563Returns a ResultSet of distributions which have had releases at or after after $time.
564
565=cut
566
567sub dists_changed_since {
568 my $self = shift;
569 my $time = shift;
570
571 return $self->dists->search( latest_date => \">= $time" );
572}
573
574=head2 releases
575
576 my $all_releases = $backpan->releases();
577 my $dist_releases = $backpan->releases($dist_name);
578
579Returns a ResultSet representing all the releases on BackPAN. If a
580$dist_name is given it returns the releases of just one distribution.
581
582=head2 release
583
584 my $release = $backpan->release($dist_name, $version);
585
586Returns a single BackPAN::Index::Release object for the given
587$dist_name and $version.
588
589=head2 releases_by
590
591 my $releases = $backpan->releases_by($cpanid);
592 my @releases = $backpan->releases_by($cpanid);
593
594Returns all the releases of a single author.
595
596Returns either a list of Releases or a ResultSet representing those releases.
597
598=cut
599
600sub releases_by {
601 my $self = shift;
602 my $cpanid = shift;
603
604 return $self->releases->search({ cpanid => $cpanid });
605}
606
607
608=head2 releases_since
609
610 my $releases = $backpan->releases_since($time);
611
612Returns a ResultSet of releases which were released at or after $time.
613
614=cut
615
616sub releases_since {
617 my $self = shift;
618 my $time = shift;
619
620 return $self->releases->search( date => \">= $time" );
621}
622
623
624=head1 EXAMPLES
625
626The real power of BackPAN::Index comes from L<DBIx::Class::ResultSet>.
627Its very flexible and very powerful but not always obvious how to get
628it to do things. Here's some examples.
629
630 # How many files are on BackPAN?
631 my $count = $backpan->files->count;
632
633 # How big is BackPAN?
634 my $size = $backpan->files->get_column("size")->sum;
635
636 # What are the names of all the distributions?
637 my @names = $backpan->dists->get_column("name")->all;
638
639 # What path contains this release?
640 my $path = $backpan->release("Acme-Pony", 1.01)->path;
641
642 # Get all the releases of Moose ordered by version
643 my @releases = $backpan->dist("Moose")->releases
644 ->search(undef, { order_by => "version" });
645
646=head1 AUTHOR
647
648Michael G Schwern <schwern@pobox.com>
649
650=head1 COPYRIGHT
651
652Copyright 2009, Michael G Schwern
653
654=head1 LICENSE
655
656This module is free software; you can redistribute it or modify it under
657the same terms as Perl itself.
658
659=head1 SEE ALSO
660
661L<DBIx::Class::ResultSet>, L<BackPAN::Index::File>,
662L<BackPAN::Index::Release>, L<BackPAN::Index::Dist>
663
664Repository: L<http://github.com/acme/parse-backpan-packages>
665Bugs: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Parse-BACKPAN-Packages>
666
667=cut
668
6691;