Commit | Line | Data |
23c4d79e |
1 | #!perl |
6e7dc4a9 |
2 | use strict; |
23c4d79e |
3 | use warnings; |
4 | use autodie; |
5 | use feature qw(say); |
6 | use File::Find::Rule; |
7 | use File::Slurp; |
8 | use File::Spec; |
9 | use IO::Socket::SSL; |
10 | use List::Util qw(sum); |
11 | use LWP::UserAgent; |
12 | use Net::FTP; |
13 | use Parallel::Fork::BossWorkerAsync; |
14 | use Term::ProgressBar::Simple; |
15 | use URI::Find::Simple qw( list_uris ); |
16 | $| = 1; |
17 | |
18 | my %ignore; |
19 | while ( my $line = <main::DATA> ) { |
20 | chomp $line; |
21 | next if $line =~ /^#/; |
22 | next unless $line; |
23 | $ignore{$line} = 1; |
24 | } |
6e7dc4a9 |
25 | |
23c4d79e |
26 | my $ua = LWP::UserAgent->new; |
27 | $ua->timeout(58); |
28 | $ua->env_proxy; |
6e7dc4a9 |
29 | |
23c4d79e |
30 | my @filenames = @ARGV; |
31 | @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.') |
32 | unless @filenames; |
33 | |
34 | my $total_bytes = sum map {-s} @filenames; |
35 | |
36 | my $extract_progress = Term::ProgressBar::Simple->new( |
37 | { count => $total_bytes, |
38 | name => 'Extracting URIs', |
39 | } |
40 | ); |
6e7dc4a9 |
41 | |
23c4d79e |
42 | my %uris; |
43 | foreach my $filename (@filenames) { |
44 | next if $filename =~ /uris\.txt/; |
45 | next if $filename =~ /check_uris/; |
46 | next if $filename =~ /\.patch$/; |
47 | my $contents = read_file($filename); |
48 | my @uris = list_uris($contents); |
49 | foreach my $uri (@uris) { |
50 | next unless $uri =~ /^(http|ftp)/; |
51 | next if $ignore{$uri}; |
6e7dc4a9 |
52 | |
23c4d79e |
53 | # no need to hit rt.perl.org |
54 | next |
55 | if $uri =~ m{^http://rt.perl.org/rt3/Ticket/Display.html?id=\d+$}; |
6e7dc4a9 |
56 | |
23c4d79e |
57 | # no need to hit rt.cpan.org |
58 | next |
59 | if $uri =~ m{^http://rt.cpan.org/Public/Bug/Display.html?id=\d+$}; |
60 | push @{ $uris{$uri} }, $filename; |
61 | } |
62 | $extract_progress += -s $filename; |
63 | } |
64 | |
65 | my $bw = Parallel::Fork::BossWorkerAsync->new( |
66 | work_handler => \&work_alarmed, |
67 | global_timeout => 120, |
68 | worker_count => 20, |
69 | ); |
70 | |
71 | foreach my $uri ( keys %uris ) { |
72 | my @filenames = @{ $uris{$uri} }; |
73 | $bw->add_work( { uri => $uri, filenames => \@filenames } ); |
74 | } |
75 | |
76 | undef $extract_progress; |
77 | |
78 | my $fetch_progress = Term::ProgressBar::Simple->new( |
79 | { count => scalar( keys %uris ), |
80 | name => 'Fetching URIs', |
81 | } |
82 | ); |
83 | |
84 | my %filenames; |
85 | while ( $bw->pending() ) { |
86 | my $response = $bw->get_result(); |
87 | my $uri = $response->{uri}; |
88 | my @filenames = @{ $response->{filenames} }; |
89 | my $is_success = $response->{is_success}; |
90 | my $message = $response->{message}; |
91 | |
92 | unless ($is_success) { |
93 | foreach my $filename (@filenames) { |
94 | push @{ $filenames{$filename} }, |
95 | { uri => $uri, message => $message }; |
6e7dc4a9 |
96 | } |
97 | } |
23c4d79e |
98 | $fetch_progress++; |
6e7dc4a9 |
99 | } |
23c4d79e |
100 | $bw->shut_down(); |
6e7dc4a9 |
101 | |
23c4d79e |
102 | my $fh = IO::File->new('> uris.txt'); |
103 | foreach my $filename ( sort keys %filenames ) { |
104 | $fh->say("* $filename"); |
105 | my @bits = @{ $filenames{$filename} }; |
106 | foreach my $bit (@bits) { |
107 | my $uri = $bit->{uri}; |
108 | my $message = $bit->{message}; |
109 | $fh->say(" $uri"); |
110 | $fh->say(" $message"); |
0d6d7233 |
111 | } |
112 | } |
23c4d79e |
113 | $fh->close; |
0d6d7233 |
114 | |
23c4d79e |
115 | say 'Finished, see uris.txt'; |
6e7dc4a9 |
116 | |
23c4d79e |
117 | sub work_alarmed { |
118 | my $conf = shift; |
119 | eval { |
120 | local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required |
121 | alarm 60; |
122 | $conf = work($conf); |
123 | alarm 0; |
124 | }; |
125 | if ($@) { |
126 | $conf->{is_success} = 0; |
127 | $conf->{message} = 'Timed out'; |
0d6d7233 |
128 | |
23c4d79e |
129 | } |
130 | return $conf; |
0d6d7233 |
131 | } |
132 | |
23c4d79e |
133 | sub work { |
134 | my $conf = shift; |
135 | my $uri = $conf->{uri}; |
136 | my @filenames = @{ $conf->{filenames} }; |
0d6d7233 |
137 | |
23c4d79e |
138 | if ( $uri =~ /^http/ ) { |
139 | my $uri_without_fragment = URI->new($uri); |
140 | my $fragment = $uri_without_fragment->fragment(undef); |
141 | my $response = $ua->head($uri_without_fragment); |
0d6d7233 |
142 | |
23c4d79e |
143 | $conf->{is_success} = $response->is_success; |
144 | $conf->{message} = $response->status_line; |
145 | return $conf; |
146 | } else { |
0d6d7233 |
147 | |
23c4d79e |
148 | my $uri_object = URI->new($uri); |
149 | my $host = $uri_object->host; |
150 | my $path = $uri_object->path; |
151 | my ( $volume, $directories, $filename ) |
152 | = File::Spec->splitpath($path); |
0d6d7233 |
153 | |
23c4d79e |
154 | my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 ); |
155 | unless ($ftp) { |
156 | $conf->{is_succcess} = 0; |
157 | $conf->{message} = "Can not connect to $host: $@"; |
158 | return $conf; |
159 | } |
0d6d7233 |
160 | |
23c4d79e |
161 | my $can_login = $ftp->login( "anonymous", '-anonymous@' ); |
162 | unless ($can_login) { |
163 | $conf->{is_success} = 0; |
164 | $conf->{message} = "Can not login ", $ftp->message; |
165 | return $conf; |
6e7dc4a9 |
166 | } |
167 | |
23c4d79e |
168 | my $can_binary = $ftp->binary(); |
169 | unless ($can_binary) { |
170 | $conf->{is_success} = 0; |
171 | $conf->{message} = "Can not binary ", $ftp->message; |
172 | return $conf; |
173 | } |
174 | |
175 | my $can_cwd = $ftp->cwd($directories); |
176 | unless ($can_cwd) { |
177 | $conf->{is_success} = 0; |
178 | $conf->{message} = "Can not cwd to $directories ", $ftp->message; |
179 | return $conf; |
180 | } |
181 | |
182 | if ($filename) { |
183 | my $can_size = $ftp->size($filename); |
184 | unless ($can_size) { |
185 | $conf->{is_success} = 0; |
186 | $conf->{message} |
187 | = "Can not size $filename in $directories", |
188 | $ftp->message; |
189 | return $conf; |
190 | } |
191 | } else { |
192 | my ($can_dir) = $ftp->dir; |
193 | unless ($can_dir) { |
194 | my ($can_ls) = $ftp->ls; |
195 | unless ($can_ls) { |
196 | $conf->{is_success} = 0; |
197 | $conf->{message} |
198 | = "Can not dir or ls in $directories ", |
199 | $ftp->message; |
200 | return $conf; |
201 | } |
202 | } |
203 | } |
204 | |
205 | $conf->{is_success} = 1; |
206 | return $conf; |
6e7dc4a9 |
207 | } |
208 | } |
209 | |
23c4d79e |
210 | __DATA__ |
211 | # these are fine but give errors |
212 | ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html |
213 | ftp://ftp.stratus.com/pub/vos/utility/utility.html |
214 | |
215 | # this is missing, sigh |
216 | ftp://ftp.sco.com/SLS/ptf7051e.Z |
217 | http://perlmonks.thepen.com/42898.html |
218 | |
219 | # this are URI extraction bugs |
220 | http://www.perl.org/E |
221 | http://en.wikipedia.org/wiki/SREC_(file_format |
222 | http://somewhere.else',-type=/ |
223 | ftp:passive-mode |
224 | ftp: |
225 | http:[- |
226 | http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell |
227 | http://www.xray.mpe.mpg.de/mailing-lists/perl5- |
228 | http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP: |
229 | |
230 | # these are used as an example |
231 | http://example.com/ |
232 | http://something.here/ |
233 | http://users.perl5.git.perl.org/~yourlogin/ |
234 | http://github.com/USERNAME/perl/tree/orange |
235 | http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi |
236 | http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar |
237 | http://somewhere.else$/ |
238 | http://somewhere.else$/ |
239 | http://somewhere.else/bin/foo&bar',-Type= |
240 | http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi |
241 | http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar |
242 | http://www.perl.org/test.cgi |
243 | http://cpan2.local/ |
244 | http://search.cpan.org/perldoc? |
245 | http://cpan1.local/ |
246 | http://cpan.dev.local/CPAN |
247 | http:/// |
248 | ftp:// |
249 | ftp://myurl/ |
250 | ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff |
251 | http://www14.software.ibm.com/webapp/download/downloadaz.jsp |
252 | http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz |
253 | http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT |
254 | http://localhost/tmp/index.txt |
255 | http://example.com/foo/bar.html |
256 | http://example.com/Text-Bastardize-1.06.tar.gz |
257 | ftp://example.com/sources/packages.txt |
258 | http://example.com/sources/packages.txt |
259 | http://example.com/sources |
260 | ftp://example.com/sources |
261 | http://some.where.com/dir/file.txt |
262 | http://some.where.com/dir/a.txt |
263 | http://foo.com/X.tgz |
264 | ftp://foo.com/X.tgz |
265 | http://foo/ |
266 | http://www.foo.com:8000/ |
267 | http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args |
268 | http://decoded/mirror/path |
269 | http://a/b/c/d/e/f/g/h/i/j |
270 | http://foo/bar.gz |
271 | ftp://ftp.perl.org |
272 | http://purl.org/rss/1.0/modules/taxonomy/ |
273 | ftp://ftp.sun.ac.za/CPAN/CPAN/ |
274 | ftp://ftp.cpan.org/pub/mirror/index.txt |
275 | ftp://cpan.org/pub/mirror/index.txt |
276 | http://example.com/~eh/ |
277 | http://plagger.org/.../rss |
278 | http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz |
279 | http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz |
280 | http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz |
281 | http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz |
282 | http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz |
283 | http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip |
284 | http://module-build.sourceforge.net/META-spec-new.html |
285 | http://module-build.sourceforge.net/META-spec-v1.4.html |
286 | http://www.cs.vu.nl/~tmgil/vi.html |
287 | http://perlcomposer.sourceforge.net/vperl.html |
288 | http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep |
289 | http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html |
290 | http://world.std.com/~aep/ptkdb/ |
291 | http://www.castlelink.co.uk/object_system/ |
292 | http://www.fh-wedel.de/elvis/ |
293 | ftp://ftp.blarg.net/users/amol/zsh/ |
294 | ftp://ftp.funet.fi/pub/languages/perl/CPAN |
295 | http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip |
296 | |
297 | # these are used to generate or match URLs |
298 | http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist |
299 | http://www.cpantesters.org/show/%s.yaml |
300 | ftp://(.*?)/(.*)/(.* |
301 | ftp://(.*?)/(.*)/(.* |
302 | ftp://(.*?)/(.*)/(.* |
303 | ftp://ftp.foo.bar/ |
304 | http://$host/ |
305 | http://wwwe%3C46/ |
306 | ftp:/ |
307 | |
308 | # weird redirects that LWP doesn't like |
309 | http://www.theperlreview.com/community_calendar |
310 | http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL |
311 | http://groups.google.com/ |
312 | http://groups.google.com/group/comp.lang.perl.misc/topics |
313 | http://groups.google.co.uk/group/perl.perl5.porters/browse_thread/thread/b4e2b1d88280ddff/630b667a66e3509f?#630b667a66e3509f |
314 | http://groups.google.com/group/comp.sys.sgi.admin/msg/3ad8353bc4ce3cb0 |
315 | http://groups.google.com/group/perl.module.build/browse_thread/thread/c8065052b2e0d741 |
316 | http://groups.google.com/group/perl.perl5.porters/browse_thread/thread/adaffaaf939b042e/20dafc298df737f0%2320dafc298df737f0?sa=X&oi=groupsr&start=0&num=3 |
317 | |
318 | # broken webserver that doesn't like HEAD requests |
319 | http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view |
320 | |
321 | # these have been reported upstream to CPAN authors |
322 | http://www.gnu.org/manual/tar/html_node/tar_139.html |
323 | http://www.w3.org/pub/WWW/TR/Wd-css-1.html |
324 | http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP |
325 | http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp |
326 | http://search.cpan.org/search?query=Module::Build::Convert |
327 | http://www.refcnt.org/papers/module-build-convert |
328 | http://csrc.nist.gov/cryptval/shs.html |
329 | http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp |
330 | http://www.debian.or.jp/~kubota/unicode-symbols.html.en |
331 | http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html |
332 | http://www.debian.or.jp/~kubota/unicode-symbols.html.en |
333 | http://rfc.net/rfc2781.html |
334 | http://www.icu-project.org/charset/ |
335 | http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html |
336 | http://www.rfc-editor.org/ |
337 | http://www.rfc.net/ |
338 | http://www.oreilly.com/people/authors/lunde/cjk_inf.html |
339 | http://www.oreilly.com/catalog/cjkvinfo/ |
340 | http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz |
341 | http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz |
342 | http://www.egt.ie/standards/iso3166/iso3166-1-en.html |
343 | http://www.bsi-global.com/iso4217currency |
344 | http://www.plover.com/~mjd/perl/Memoize/ |
345 | http://www.plover.com/~mjd/perl/MiniMemoize/ |
346 | http://www.sysadminmag.com/tpj/issues/vol5_5/ |
347 | ftp://ftp.tpc.int/tpc/server/UNIX/ |
348 | http://www.nara.gov/genealogy/ |
349 | http://home.utah-inter.net/kinsearch/Soundex.html |
350 | http://www.nara.gov/genealogy/soundex/soundex.html |
351 | http://rfc.net/rfc3461.html |
352 | ftp://ftp.cs.pdx.edu/pub/elvis/ |
353 | http://www.fh-wedel.de/elvis/ |
354 | |
6e7dc4a9 |
355 | __END__ |
23c4d79e |
356 | |
357 | =head1 NAME |
358 | |
359 | checkURL.pl - Check that all the URLs in the Perl source are valid |
360 | |
361 | =head1 DESCRIPTION |
362 | |
363 | This program checks that all the URLs in the Perl source are valid. It |
364 | checks HTTP and FTP links in parallel and contains a list of known |
365 | bad example links in its source. It takes 4 minutes to run on my |
366 | machine. The results are written to 'uris.txt' and list the filename, |
367 | the URL and the error: |
368 | |
369 | * ext/Locale-Maketext/lib/Locale/Maketext.pod |
370 | http://sunsite.dk/RFC/rfc/rfc2277.html |
371 | 404 Not Found |
372 | ... |
373 | |
374 | It should be run every so often and links fixed and upstream authors |
375 | notified. |
376 | |
377 | Note that the web is unstable and some websites are temporarily down. |