Update to check all URLs in the source, including ftp and ignore known example URLs
Leon Brocard [Mon, 28 Sep 2009 12:07:38 +0000 (13:07 +0100)]
Porting/checkURL.pl

index db55c49..5dfe65f 100644 (file)
-#!/usr/bin/perl
-
+#!perl
 use strict;
-use warnings 'all';
+use warnings;
+use autodie;
+use feature qw(say);
+use File::Find::Rule;
+use File::Slurp;
+use File::Spec;
+use IO::Socket::SSL;
+use List::Util qw(sum);
+use LWP::UserAgent;
+use Net::FTP;
+use Parallel::Fork::BossWorkerAsync;
+use Term::ProgressBar::Simple;
+use URI::Find::Simple qw( list_uris );
+$| = 1;
+
+my %ignore;
+while ( my $line = <main::DATA> ) {
+    chomp $line;
+    next if $line =~ /^#/;
+    next unless $line;
+    $ignore{$line} = 1;
+}
 
-use LWP::Simple qw /$ua getstore/;
+my $ua = LWP::UserAgent->new;
+$ua->timeout(58);
+$ua->env_proxy;
 
-my %urls;
+my @filenames = @ARGV;
+@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
+    unless @filenames;
+
+my $total_bytes = sum map {-s} @filenames;
+
+my $extract_progress = Term::ProgressBar::Simple->new(
+    {   count => $total_bytes,
+        name  => 'Extracting URIs',
+    }
+);
 
-my @dummy = qw(
-          http://something.here
-          http://www.pvhp.com
-             );
-my %dummy;
+my %uris;
+foreach my $filename (@filenames) {
+    next if $filename =~ /uris\.txt/;
+    next if $filename =~ /check_uris/;
+    next if $filename =~ /\.patch$/;
+    my $contents = read_file($filename);
+    my @uris     = list_uris($contents);
+    foreach my $uri (@uris) {
+        next unless $uri =~ /^(http|ftp)/;
+        next if $ignore{$uri};
 
-@dummy{@dummy} = ();
+        # no need to hit rt.perl.org
+        next
+            if $uri =~ m{^http://rt.perl.org/rt3/Ticket/Display.html?id=\d+$};
 
-foreach my $file (<*/*.pod */*/*.pod */*/*/*.pod README README.* INSTALL>) {
-    open my $fh => $file or die "Failed to open $file: $!\n";
-    while (<$fh>) {
-        if (m{(?:http|ftp)://(?:(?!\w<)[-\w~?@=.])+} && !exists $dummy{$&}) {
-            my $url = $&;
-            $url =~ s/\.$//;
-            $urls {$url} ||= { };
-            $urls {$url} {$file} = 1;
+        # no need to hit rt.cpan.org
+        next
+            if $uri =~ m{^http://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
+        push @{ $uris{$uri} }, $filename;
+    }
+    $extract_progress += -s $filename;
+}
+
+my $bw = Parallel::Fork::BossWorkerAsync->new(
+    work_handler   => \&work_alarmed,
+    global_timeout => 120,
+    worker_count   => 20,
+);
+
+foreach my $uri ( keys %uris ) {
+    my @filenames = @{ $uris{$uri} };
+    $bw->add_work( { uri => $uri, filenames => \@filenames } );
+}
+
+undef $extract_progress;
+
+my $fetch_progress = Term::ProgressBar::Simple->new(
+    {   count => scalar( keys %uris ),
+        name  => 'Fetching URIs',
+    }
+);
+
+my %filenames;
+while ( $bw->pending() ) {
+    my $response   = $bw->get_result();
+    my $uri        = $response->{uri};
+    my @filenames  = @{ $response->{filenames} };
+    my $is_success = $response->{is_success};
+    my $message    = $response->{message};
+
+    unless ($is_success) {
+        foreach my $filename (@filenames) {
+            push @{ $filenames{$filename} },
+                { uri => $uri, message => $message };
         }
     }
-    close $fh;
+    $fetch_progress++;
 }
+$bw->shut_down();
 
-sub fisher_yates_shuffle {
-    my $deck = shift;  # $deck is a reference to an array
-    my $i = @$deck;
-    while (--$i) {
-       my $j = int rand ($i+1);
-       @$deck[$i,$j] = @$deck[$j,$i];
+my $fh = IO::File->new('> uris.txt');
+foreach my $filename ( sort keys %filenames ) {
+    $fh->say("* $filename");
+    my @bits = @{ $filenames{$filename} };
+    foreach my $bit (@bits) {
+        my $uri     = $bit->{uri};
+        my $message = $bit->{message};
+        $fh->say("  $uri");
+        $fh->say("    $message");
     }
 }
+$fh->close;
 
-my @urls = keys %urls;
+say 'Finished, see uris.txt';
 
-fisher_yates_shuffle(\@urls);
+sub work_alarmed {
+    my $conf = shift;
+    eval {
+        local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
+        alarm 60;
+        $conf = work($conf);
+        alarm 0;
+    };
+    if ($@) {
+        $conf->{is_success} = 0;
+        $conf->{message}    = 'Timed out';
 
-sub todo {
-    warn "(", scalar @urls, " URLs)\n";
+    }
+    return $conf;
 }
 
-my $MAXPROC = 40;
-my $MAXURL  = 10;
-my $MAXFORK = $MAXPROC < $MAXURL ? 1 : $MAXPROC / $MAXURL;
+sub work {
+    my $conf      = shift;
+    my $uri       = $conf->{uri};
+    my @filenames = @{ $conf->{filenames} };
 
-select(STDERR); $| = 1;
-select(STDOUT); $| = 1;
+    if ( $uri =~ /^http/ ) {
+        my $uri_without_fragment = URI->new($uri);
+        my $fragment             = $uri_without_fragment->fragment(undef);
+        my $response             = $ua->head($uri_without_fragment);
 
-while (@urls) {
-    my @list;
-    my $pid;
-    my $i;
+        $conf->{is_success} = $response->is_success;
+        $conf->{message}    = $response->status_line;
+        return $conf;
+    } else {
 
-    todo();
+        my $uri_object = URI->new($uri);
+        my $host       = $uri_object->host;
+        my $path       = $uri_object->path;
+        my ( $volume, $directories, $filename )
+            = File::Spec->splitpath($path);
 
-    for ($i = 0; $i < $MAXFORK; $i++) {
-       $list[$i] = [ splice @urls, 0, $MAXURL ];
-       $pid = fork;
-       die "Failed to fork: $!\n" unless defined $pid;
-       last unless $pid; # Child.
-    }
+        my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
+        unless ($ftp) {
+            $conf->{is_succcess} = 0;
+            $conf->{message}     = "Can not connect to $host: $@";
+            return $conf;
+        }
 
-    if ($pid) {
-        # Parent.
-       warn "(waiting)\n";
-       1 until -1 == wait; # Reap.
-    } else {
-        # Child.
-        foreach my $url (@{$list[$i]}) {
-            my $code = getstore $url, "/dev/null";
-            next if $code == 200;
-            my $f = join ", " => keys %{$urls {$url}};
-            printf "%03d  %s: %s\n" => $code, $url, $f;
+        my $can_login = $ftp->login( "anonymous", '-anonymous@' );
+        unless ($can_login) {
+            $conf->{is_success} = 0;
+            $conf->{message} = "Can not login ", $ftp->message;
+            return $conf;
         }
 
-        exit;
+        my $can_binary = $ftp->binary();
+        unless ($can_binary) {
+            $conf->{is_success} = 0;
+            $conf->{message} = "Can not binary ", $ftp->message;
+            return $conf;
+        }
+
+        my $can_cwd = $ftp->cwd($directories);
+        unless ($can_cwd) {
+            $conf->{is_success} = 0;
+            $conf->{message} = "Can not cwd to $directories ", $ftp->message;
+            return $conf;
+        }
+
+        if ($filename) {
+            my $can_size = $ftp->size($filename);
+            unless ($can_size) {
+                $conf->{is_success} = 0;
+                $conf->{message}
+                    = "Can not size $filename in $directories",
+                    $ftp->message;
+                return $conf;
+            }
+        } else {
+            my ($can_dir) = $ftp->dir;
+            unless ($can_dir) {
+                my ($can_ls) = $ftp->ls;
+                unless ($can_ls) {
+                    $conf->{is_success} = 0;
+                    $conf->{message}
+                        = "Can not dir or ls in $directories ",
+                        $ftp->message;
+                    return $conf;
+                }
+            }
+        }
+
+        $conf->{is_success} = 1;
+        return $conf;
     }
 }
 
+__DATA__
+# these are fine but give errors
+ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
+ftp://ftp.stratus.com/pub/vos/utility/utility.html
+
+# this is missing, sigh
+ftp://ftp.sco.com/SLS/ptf7051e.Z
+http://perlmonks.thepen.com/42898.html
+
+# this are URI extraction bugs
+http://www.perl.org/E
+http://en.wikipedia.org/wiki/SREC_(file_format
+http://somewhere.else',-type=/
+ftp:passive-mode
+ftp:
+http:[-
+http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
+http://www.xray.mpe.mpg.de/mailing-lists/perl5-
+http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
+
+# these are used as an example
+http://example.com/
+http://something.here/
+http://users.perl5.git.perl.org/~yourlogin/
+http://github.com/USERNAME/perl/tree/orange
+http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
+http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
+http://somewhere.else$/
+http://somewhere.else$/
+http://somewhere.else/bin/foo&bar',-Type=
+http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
+http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
+http://www.perl.org/test.cgi
+http://cpan2.local/
+http://search.cpan.org/perldoc?
+http://cpan1.local/
+http://cpan.dev.local/CPAN
+http:///
+ftp://
+ftp://myurl/
+ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
+http://www14.software.ibm.com/webapp/download/downloadaz.jsp
+http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
+http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
+http://localhost/tmp/index.txt
+http://example.com/foo/bar.html
+http://example.com/Text-Bastardize-1.06.tar.gz
+ftp://example.com/sources/packages.txt
+http://example.com/sources/packages.txt
+http://example.com/sources
+ftp://example.com/sources
+http://some.where.com/dir/file.txt
+http://some.where.com/dir/a.txt
+http://foo.com/X.tgz
+ftp://foo.com/X.tgz
+http://foo/
+http://www.foo.com:8000/
+http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
+http://decoded/mirror/path
+http://a/b/c/d/e/f/g/h/i/j
+http://foo/bar.gz
+ftp://ftp.perl.org
+http://purl.org/rss/1.0/modules/taxonomy/
+ftp://ftp.sun.ac.za/CPAN/CPAN/
+ftp://ftp.cpan.org/pub/mirror/index.txt
+ftp://cpan.org/pub/mirror/index.txt
+http://example.com/~eh/
+http://plagger.org/.../rss
+http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
+http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
+http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
+http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
+http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
+http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip  
+http://module-build.sourceforge.net/META-spec-new.html
+http://module-build.sourceforge.net/META-spec-v1.4.html
+http://www.cs.vu.nl/~tmgil/vi.html
+http://perlcomposer.sourceforge.net/vperl.html
+http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
+http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
+http://world.std.com/~aep/ptkdb/
+http://www.castlelink.co.uk/object_system/
+http://www.fh-wedel.de/elvis/
+ftp://ftp.blarg.net/users/amol/zsh/
+ftp://ftp.funet.fi/pub/languages/perl/CPAN
+http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
+
+# these are used to generate or match URLs
+http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
+http://www.cpantesters.org/show/%s.yaml
+ftp://(.*?)/(.*)/(.*
+ftp://(.*?)/(.*)/(.*
+ftp://(.*?)/(.*)/(.*
+ftp://ftp.foo.bar/
+http://$host/
+http://wwwe%3C46/
+ftp:/
+
+# weird redirects that LWP doesn't like
+http://www.theperlreview.com/community_calendar
+http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
+http://groups.google.com/
+http://groups.google.com/group/comp.lang.perl.misc/topics
+http://groups.google.co.uk/group/perl.perl5.porters/browse_thread/thread/b4e2b1d88280ddff/630b667a66e3509f?#630b667a66e3509f
+http://groups.google.com/group/comp.sys.sgi.admin/msg/3ad8353bc4ce3cb0
+http://groups.google.com/group/perl.module.build/browse_thread/thread/c8065052b2e0d741
+http://groups.google.com/group/perl.perl5.porters/browse_thread/thread/adaffaaf939b042e/20dafc298df737f0%2320dafc298df737f0?sa=X&oi=groupsr&start=0&num=3
+
+# broken webserver that doesn't like HEAD requests
+http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
+
+# these have been reported upstream to CPAN authors
+http://www.gnu.org/manual/tar/html_node/tar_139.html
+http://www.w3.org/pub/WWW/TR/Wd-css-1.html
+http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
+http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
+http://search.cpan.org/search?query=Module::Build::Convert
+http://www.refcnt.org/papers/module-build-convert
+http://csrc.nist.gov/cryptval/shs.html
+http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
+http://www.debian.or.jp/~kubota/unicode-symbols.html.en
+http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
+http://www.debian.or.jp/~kubota/unicode-symbols.html.en
+http://rfc.net/rfc2781.html
+http://www.icu-project.org/charset/
+http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
+http://www.rfc-editor.org/
+http://www.rfc.net/
+http://www.oreilly.com/people/authors/lunde/cjk_inf.html
+http://www.oreilly.com/catalog/cjkvinfo/
+http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
+http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
+http://www.egt.ie/standards/iso3166/iso3166-1-en.html
+http://www.bsi-global.com/iso4217currency
+http://www.plover.com/~mjd/perl/Memoize/
+http://www.plover.com/~mjd/perl/MiniMemoize/
+http://www.sysadminmag.com/tpj/issues/vol5_5/
+ftp://ftp.tpc.int/tpc/server/UNIX/
+http://www.nara.gov/genealogy/
+http://home.utah-inter.net/kinsearch/Soundex.html
+http://www.nara.gov/genealogy/soundex/soundex.html
+http://rfc.net/rfc3461.html
+ftp://ftp.cs.pdx.edu/pub/elvis/
+http://www.fh-wedel.de/elvis/
+
 __END__
+
+=head1 NAME
+
+checkURL.pl - Check that all the URLs in the Perl source are valid
+
+=head1 DESCRIPTION
+
+This program checks that all the URLs in the Perl source are valid. It
+checks HTTP and FTP links in parallel and contains a list of known
+bad example links in its source. It takes 4 minutes to run on my
+machine. The results are written to 'uris.txt' and list the filename,
+the URL and the error:
+
+  * ext/Locale-Maketext/lib/Locale/Maketext.pod
+    http://sunsite.dk/RFC/rfc/rfc2277.html
+      404 Not Found
+  ...
+
+It should be run every so often and links fixed and upstream authors
+notified.
+
+Note that the web is unstable and some websites are temporarily down.