Commit | Line | Data |
0a753a76 |
1 | @rem = '--*-Perl-*--'; |
2 | @rem = ' |
3 | @echo off |
4 | perl -S %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9 |
5 | goto endofperl |
6 | @rem '; |
7 | #!/usr/local/bin/perl -w |
8 | |
9 | #- |
10 | #!/usr/local/bin/perl -w |
11 | $version = "951121.18"; |
12 | $comments = 'jfriedl@omron.co.jp'; |
13 | |
14 | ## |
15 | ## This is "webget" |
16 | ## |
17 | ## Jeffrey Friedl (jfriedl@omron.co.jp), July 1994. |
18 | ## Copyright 19.... ah hell, just take it. |
19 | ## Should work with either perl4 or perl5 |
20 | ## |
21 | ## BLURB: |
22 | ## Given a URL on the command line (HTTP and FTP supported at the moment), |
23 | ## webget fetches the named object (HTML text, images, audio, whatever the |
24 | ## object happens to be). Will automatically use a proxy if one is defined |
25 | ## in the environment, follow "this URL has moved" responses, and retry |
26 | ## "can't find host" responses from a proxy in case host lookup was slow). |
27 | ## Supports users & passwords (FTP), Basic Authorization (HTTP), update-if- |
28 | ## modified (HTTP), and much more. Works with perl4 or perl5. |
29 | |
30 | ## |
31 | ## More-detailed instructions in the comment block below the history list. |
32 | ## |
33 | |
34 | ## |
35 | ## To-do: |
36 | ## Add gopher support. |
37 | ## Fix up how error messages are passed among this and the libraries. |
38 | ## |
39 | |
40 | ## 951219.19 |
41 | ## Lost ftp connections now die with a bit more grace. |
42 | ## |
43 | ## 951121.18 |
44 | ## Add -nnab. |
45 | ## Brought the "usage" string in line with reality. |
46 | ## |
47 | ## 951114.17 |
48 | ## Added -head. |
49 | ## Added -update/-refresh/-IfNewerThan. If any URL was not pulled |
50 | ## because it was not out of date, an exit value of 2 is returned. |
51 | ## |
52 | ## 951031.16 |
53 | ## Added -timeout. Cleaned up (a bit) the exit value. Now exits |
54 | ## with 1 if all URLs had some error (timeout exits immediately with |
55 | ## code 3, though. This is subject to change). Exits with 0 if any |
56 | ## URL was brought over safely. |
57 | ## |
58 | ## 951017.15 |
59 | ## Neat -pf, -postfile idea from Lorrie Cranor |
60 | ## (http://www.ccrc.wustl.edu/~lorracks/) |
61 | ## |
62 | ## 950912.14 |
63 | ## Sigh, fixed a typo. |
64 | ## |
65 | ## 950911.13 |
66 | ## Added Basic Authorization support for http. See "PASSWORDS AND STUFF" |
67 | ## in the documentation. |
68 | ## |
69 | ## 950911.12 |
70 | ## Implemented a most-excellent suggestion by Anthony D'Atri |
71 | ## (aad@nwnet.net), to be able to automatically grab to a local file of |
72 | ## the same name as the URL. See the '-nab' flag. |
73 | ## |
74 | ## 950706.11 |
75 | ## Quelled small -w warning (thanks: Lars Rasmussen <gnort@daimi.aau.dk>) |
76 | ## |
77 | ## 950630.10 |
78 | ## Steve Campbell to the rescue again. FTP now works when supplied |
79 | ## with a userid & password (eg ftp://user:pass@foo.bar.com/index.txt). |
80 | ## |
81 | ## 950623.9 |
82 | ## Incorporated changes from Steve Campbell (steven_campbell@uk.ibm.com) |
83 | ## so that the ftp will work when no password is required of a user. |
84 | ## |
85 | ## 950530.8 |
86 | ## Minor changes: |
87 | ## Eliminate read-size warning message when size unknown. |
88 | ## Pseudo-debug/warning messages at the end of debug_read now go to |
89 | ## stderr. Some better error handling when trying to contact systems |
90 | ## that aren't really set up for ftp. Fixed a bug concerning FTP access |
91 | ## to a root directory. Added proxy documentation at head of file. |
92 | ## |
93 | ## 950426.6,7 |
94 | ## Complete Overhaul: |
95 | ## Renamed from httpget. Added ftp support (very sketchy at the moment). |
96 | ## Redid to work with new 'www.pl' library; chucked 'Www.pl' library. |
97 | ## More or less new and/or improved in many ways, but probably introduced |
98 | ## a few bugs along the way. |
99 | ## |
100 | ## 941227.5 |
101 | ## Added follow stuff (with -nofollow, etc.) |
102 | ## Added -updateme. Cool! |
103 | ## Some general tidying up. |
104 | ## |
105 | ## 941107.4 |
106 | ## Allowed for ^M ending a header line... PCs give those kind of headers. |
107 | ## |
108 | ## 940820.3 |
109 | ## First sorta'clean net release. |
110 | ## |
111 | ## |
112 | |
113 | ## |
114 | ##> |
115 | ## |
116 | ## Fetch http and/or ftp URL(s) given on the command line and spit to |
117 | ## STDOUT. |
118 | ## |
119 | ## Options include: |
120 | ## -V, -version |
121 | ## Print version information; exit. |
122 | ## |
123 | ## -p, -post |
124 | ## If the URL looks like a reply to a form (i.e. has a '?' in it), |
125 | ## the request is POST'ed instead of GET'ed. |
126 | ## |
127 | ## -head |
128 | ## Gets the header only (for HTTP). This might include such useful |
129 | ## things as 'Last-modified' and 'Content-length' fields |
130 | ## (a lack of a 'Last-modified' might be a good indication that it's |
131 | ## a CGI). |
132 | ## |
133 | ## The "-head" option implies "-nostrip", but does *not* imply, |
134 | ## for example "-nofollow". |
135 | ## |
136 | ## |
137 | ## -pf, -postfile |
138 | ## The item after the '?' is taken as a local filename, and the contents |
139 | ## are POST'ed as with -post |
140 | ## |
141 | ## -nab, -f, -file |
142 | ## Rather than spit the URL(s) to standard output, unconditionally |
143 | ## dump to a file (or files) whose name is that as used in the URL, |
144 | ## sans path. I like '-nab', but supply '-file' as well since that's |
145 | ## what was originally suggested. Also see '-update' below for the |
146 | ## only-if-changed version. |
147 | ## |
148 | ## -nnab |
149 | ## Like -nab, but in addtion to dumping to a file, dump to stdout as well. |
150 | ## Sort of like the 'tee' command. |
151 | ## |
152 | ## -update, -refresh |
153 | ## Do the same thing as -nab, etc., but does not bother pulling the |
154 | ## URL if it older than the localfile. Only applies to HTTP. |
155 | ## Uses the HTTP "If-Modified-Since" field. If the URL was not modified |
156 | ## (and hence not changed), the return value is '2'. |
157 | ## |
158 | ## -IfNewerThan FILE |
159 | ## -int FILE |
160 | ## Only pulls URLs if they are newer than the date the local FILE was |
161 | ## last written. |
162 | ## |
163 | ## -q, -quiet |
164 | ## Suppresses all non-essential informational messages. |
165 | ## |
166 | ## -nf, -nofollow |
167 | ## Normally, a "this URL has moved" HTTP response is automatically |
168 | ## followed. Not done with -nofollow. |
169 | ## |
170 | ## -nr, -noretry |
171 | ## Normally, an HTTP proxy response of "can't find host" is retried |
172 | ## up to three times, to give the remote hostname lookup time to |
173 | ## come back with an answer. This suppresses the retries. This is the |
174 | ## same as '-retry 0'. |
175 | ## |
176 | ## -r#, -retry#, -r #, -retry # |
177 | ## Sets the number of times to retry. Default 3. |
178 | ## |
179 | ## -ns, -nostrip |
180 | ## For HTTP items (including other items going through an HTTP proxy), |
181 | ## the HTTP response header is printed rather than stripped as default. |
182 | ## |
183 | ## -np, -noproxy |
184 | ## A proxy is not used, even if defined for the protocol. |
185 | ## |
186 | ## -h, -help |
187 | ## Show a usage message and exit. |
188 | ## |
189 | ## -d, -debug |
190 | ## Show some debugging messages. |
191 | ## |
192 | ## -updateme |
193 | ## The special and rather cool flag "-updateme" will see if webget has |
194 | ## been updated since you got your version, and prepare a local |
195 | ## version of the new version for you to use. Keep updated! (although |
196 | ## you can always ask to be put on the ping list to be notified when |
197 | ## there's a new version -- see the author's perl web page). |
198 | ## |
199 | ## -timeout TIMESPAN |
200 | ## -to TIMESPAN |
201 | ## Time out if a connection can not be made within the specified time |
202 | ## period. TIMESPAN is normally in seconds, although a 'm' or 'h' may |
203 | ## be appended to indicate minutes and hours. "-to 1.5m" would timeout |
204 | ## after 90 seconds. |
205 | ## |
206 | ## (At least for now), a timeout causes immediate program death (with |
207 | ## exit value 3). For some reason, the alarm doesn't always cause a |
208 | ## waiting read or connect to abort, so I just die immediately.. /-: |
209 | ## |
210 | ## I might consider adding an "entire fetch" timeout, if someone |
211 | ## wants it. |
212 | ## |
213 | ## PASSWORDS AND SUCH |
214 | ## |
215 | ## You can use webget to do FTP fetches from non-Anonymous systems and |
216 | ## accounts. Just put the required username and password into the URL, |
217 | ## as with |
218 | ## webget 'ftp:/user:password@ftp.somesite.com/pub/pix/babe.gif |
219 | ## ^^^^^^^^^^^^^ |
220 | ## Note the user:password is separated from the hostname by a '@'. |
221 | ## |
222 | ## You can use the same kind of thing with HTTP, and if so it will provide |
223 | ## what's know as Basic Authorization. This is >weak< authorization. It |
224 | ## also provides >zero< security -- I wouldn't be sending any credit-card |
225 | ## numbers this way (unless you send them 'round my way :-). It seems to |
226 | ## be used most by providers of free stuff where they want to make some |
227 | ## attempt to limit access to "known users". |
228 | ## |
229 | ## PROXY STUFF |
230 | ## |
231 | ## If you need to go through a gateway to get out to the whole internet, |
232 | ## you can use a proxy if one's been set up on the gateway. This is done |
233 | ## by setting the "http_proxy" environmental variable to point to the |
234 | ## proxy server. Other variables are used for other target protocols.... |
235 | ## "gopher_proxy", "ftp_proxy", "wais_proxy", etc. |
236 | ## |
237 | ## For example, I have the following in my ".login" file (for use with csh): |
238 | ## |
239 | ## setenv http_proxy http://local.gateway.machine:8080/ |
240 | ## |
241 | ## This is to indicate that any http URL should go to local.gateway.machine |
242 | ## (port 8080) via HTTP. Additionally, I have |
243 | ## |
244 | ## setenv gopher_proxy "$http_proxy" |
245 | ## setenv wais_proxy "$http_proxy" |
246 | ## setenv ftp_proxy "$http_proxy" |
247 | ## |
248 | ## This means that any gopher, wais, or ftp URL should also go to the |
249 | ## same place, also via HTTP. This allows webget to get, for example, |
250 | ## GOPHER URLs even though it doesn't support GOPHER itself. It uses HTTP |
251 | ## to talk to the proxy, which then uses GOPHER to talk to the destination. |
252 | ## |
253 | ## Finally, if there are sites inside your gateway that you would like to |
254 | ## connect to, you can list them in the "no_proxy" variable. This will allow |
255 | ## you to connect to them directly and skip going through the proxy: |
256 | ## |
257 | ## setenv no_proxy "www.this,www.that,www.other" |
258 | ## |
259 | ## I (jfriedl@omron.co.jp) have little personal experience with proxies |
260 | ## except what I deal with here at Omron, so if this is not representative |
261 | ## of your situation, please let me know. |
262 | ## |
263 | ## RETURN VALUE |
264 | ## The value returned to the system by webget is rather screwed up because |
265 | ## I didn't think about dealing with it until things were already |
266 | ## complicated. Since there can be more than one URL on the command line, |
267 | ## it's hard to decide what to return when one times out, another is fetched, |
268 | ## another doesn't need to be fetched, and a fourth isn't found. |
269 | ## |
270 | ## So, here's the current status: |
271 | ## |
272 | ## Upon any timeout (via the -timeout arg), webget immediately |
273 | ## returns 3. End of story. Otherwise.... |
274 | ## |
275 | ## If any URL was fetched with a date limit (i.e. via |
276 | ## '-update/-refresh/-IfNewerThan' and was found to not have changed, |
277 | ## 2 is returned. Otherwise.... |
278 | ## |
279 | ## If any URL was successfully fetched, 0 is returned. Otherwise... |
280 | ## |
281 | ## If there were any errors, 1 is returned. Otherwise... |
282 | ## |
283 | ## Must have been an info-only or do-nothing instance. 0 is returned. |
284 | ## |
285 | ## Phew. Hopefully useful to someone. |
286 | ##< |
287 | ## |
288 | |
289 | ## Where latest version should be. |
290 | $WEB_normal = 'http://www.wg.omron.co.jp/~jfriedl/perl/webget'; |
291 | $WEB_inlined = 'http://www.wg.omron.co.jp/~jfriedl/perl/inlined/webget'; |
292 | |
293 | |
294 | require 'network.pl'; ## inline if possible (directive to a tool of mine) |
295 | require 'www.pl'; ## inline if possible (directive to a tool of mine) |
296 | $inlined=0; ## this might be changed by a the inline thing. |
297 | |
298 | ## |
299 | ## Exit values. All screwed up. |
300 | ## |
301 | $EXIT_ok = 0; |
302 | $EXIT_error = 1; |
303 | $EXIT_notmodified = 2; |
304 | $EXIT_timeout = 3; |
305 | |
306 | ## |
307 | ## |
308 | |
309 | warn qq/WARNING:\n$0: need a newer version of "network.pl"\n/ if |
310 | !defined($network'version) || $network'version < "950311.5"; |
311 | warn qq/WARNING:\n$0: need a newer version of "www.pl"\n/ if |
312 | !defined($www'version) || $www'version < "951114.8"; |
313 | |
314 | $WEB = $inlined ? $WEB_inlined : $WEB_normal; |
315 | |
316 | $debug = 0; |
317 | $strip = 1; ## default is to strip |
318 | $quiet = 0; ## also normally off. |
319 | $follow = 1; ## normally, we follow "Found (302)" links |
320 | $retry = 3; ## normally, retry proxy hostname lookups up to 3 times. |
321 | $nab = 0; ## If true, grab to a local file of the same name. |
322 | $refresh = 0; ## If true, use 'If-Modified-Since' with -nab get. |
323 | $postfile = 0; ## If true, filename is given after the '?' |
324 | $defaultdelta2print = 2048; |
325 | $TimeoutSpan = 0; ## seconds after which we should time out. |
326 | |
327 | while (@ARGV && $ARGV[0] =~ m/^-/) |
328 | { |
329 | $arg = shift(@ARGV); |
330 | |
331 | $nab = 1, next if $arg =~ m/^-f(ile)?$/; |
332 | $nab = 1, next if $arg =~ m/^-nab$/; |
333 | $nab = 2, next if $arg =~ m/^-nnab$/; |
334 | $post = 1, next if $arg =~ m/^-p(ost)?$/i; |
335 | $post = $postfile = 1, next if $arg =~ m/^-p(ost)?f(ile)?$/i; |
336 | $quiet=1, next if $arg =~ m/^-q(uiet)?$/; |
337 | $follow = 0, next if $arg =~ m/^-no?f(ollow)?$/; |
338 | $strip = 0, next if $arg =~ m/^-no?s(trip)?$/; |
339 | $debug=1, next if $arg =~ m/^-d(ebug)?$/; |
340 | $noproxy=1, next if $arg =~ m/^-no?p(roxy)?$/; |
341 | $retry=0, next if $arg =~ m/^-no?r(etry)?$/; |
342 | $retry=$2, next if $arg =~ m/^-r(etry)?(\d+)$/; |
343 | &updateme if $arg eq '-updateme'; |
344 | $strip = 0, $head = 1, next if $arg =~ m/^-head(er)?/; |
345 | $nab = $refresh = 1, next if $arg =~ m/^-(refresh|update)/; |
346 | |
347 | &usage($EXIT_ok) if $arg =~ m/^-h(elp)?$/; |
348 | &show_version, exit($EXIT_ok) if $arg eq '-version' || $arg eq '-V'; |
349 | |
350 | if ($arg =~ m/^-t(ime)?o(ut)?$/i) { |
351 | local($num) = shift(@ARGV); |
352 | &usage($EXIT_error, "expecting timespan argument to $arg\n") unless |
353 | $num =~ m/^\d+(\d*)?[hms]?$/; |
354 | &timeout_arg($num); |
355 | next; |
356 | } |
357 | |
358 | if ($arg =~ m/^-if?n(ewer)?t(han)?$/i) { |
359 | $reference_file = shift(@ARGV); |
360 | &usage($EXIT_error, "expecting filename arg to $arg") |
361 | if !defined $reference_file; |
362 | if (!-f $reference_file) { |
363 | warn qq/$0: ${arg}'s "$reference_file" not found.\n/; |
364 | exit($EXIT_error); |
365 | } |
366 | next; |
367 | } |
368 | |
369 | if ($arg eq '-r' || $arg eq '-retry') { |
370 | local($num) = shift(@ARGV); |
371 | &usage($EXIT_error, "expecting numerical arg to $arg\n") unless |
372 | defined($num) && $num =~ m/^\d+$/; |
373 | $retry = $num; |
374 | next; |
375 | } |
376 | &usage($EXIT_error, qq/$0: unknown option "$arg"\n/); |
377 | } |
378 | |
379 | if ($head && $post) { |
380 | warn "$0: combining -head and -post makes no sense, ignoring -post.\n"; |
381 | $post = 0; |
382 | undef $postfile; |
383 | } |
384 | |
385 | if ($refresh && defined($reference_file)) { |
386 | warn "$0: combining -update and -IfNewerThan make no sense, ignoring -IfNewerThan.\n"; |
387 | undef $reference_file; |
388 | } |
389 | |
390 | if (@ARGV == 0) { |
391 | warn "$0: nothing to do. Use -help for info.\n"; |
392 | exit($EXIT_ok); |
393 | } |
394 | |
395 | |
396 | ## |
397 | ## Now run through the remaining arguments (mostly URLs) and do a quick |
398 | ## check to see if they look well-formed. We won't *do* anything -- just |
399 | ## want to catch quick errors before really starting the work. |
400 | ## |
401 | @tmp = @ARGV; |
402 | $errors = 0; |
403 | while (@tmp) { |
404 | $arg = shift(@tmp); |
405 | if ($arg =~ m/^-t(ime)?o(ut)?$/) { |
406 | local($num) = shift(@tmp); |
407 | if ($num !~ m/^\d+(\d*)?[hms]?$/) { |
408 | &warn("expecting timespan argument to $arg\n"); |
409 | $errors++; |
410 | } |
411 | } else { |
412 | local($protocol) = &www'grok_URL($arg, $noproxy); |
413 | |
414 | if (!defined $protocol) { |
415 | warn qq/can't grok "$arg"/; |
416 | $errors++; |
417 | } elsif (!$quiet && ($protocol eq 'ftp')) { |
418 | warn qq/warning: -head ignored for ftp URLs\n/ if $head; |
419 | warn qq/warning: -refresh ignored for ftp URLs\n/if $refresh; |
420 | warn qq/warning: -IfNewerThan ignored for ftp URLs\n/if defined($reference_file); |
421 | |
422 | } |
423 | } |
424 | } |
425 | |
426 | exit($EXIT_error) if $errors; |
427 | |
428 | |
429 | $SuccessfulCount = 0; |
430 | $NotModifiedCount = 0; |
431 | |
432 | ## |
433 | ## Now do the real thing. |
434 | ## |
435 | while (@ARGV) { |
436 | $arg = shift(@ARGV); |
437 | if ($arg =~ m/^-t(ime)?o(ut)?$/) { |
438 | &timeout_arg(shift(@ARGV)); |
439 | } else { |
440 | &fetch_url($arg); |
441 | } |
442 | } |
443 | |
444 | if ($NotModifiedCount) { |
445 | exit($EXIT_notmodified); |
446 | } elsif ($SuccessfulCount) { |
447 | exit($EXIT_ok); |
448 | } else { |
449 | exit($EXIT_error); |
450 | } |
451 | |
452 | ########################################################################### |
453 | ########################################################################### |
454 | |
455 | sub timeout_arg |
456 | { |
457 | ($TimeoutSpan) = @_; |
458 | $TimeoutSpan =~ s/s//; |
459 | $TimeoutSpan *= 60 if $TimeoutSpan =~ m/m/; |
460 | $TimeoutSpan *= 3600 if $TimeoutSpan =~ m/h/; |
461 | |
462 | } |
463 | |
464 | ## |
465 | ## As a byproduct, returns the basename of $0. |
466 | ## |
467 | sub show_version |
468 | { |
469 | local($base) = $0; |
470 | $base =~ s,.*/,,; |
471 | print STDERR "This is $base version $version\n"; |
472 | $base; |
473 | } |
474 | |
475 | ## |
476 | ## &usage(exitval, message); |
477 | ## |
478 | ## Prints a usage message to STDERR. |
479 | ## If MESSAGE is defined, prints that first. |
480 | ## If exitval is defined, exits with that value. Otherwise, returns. |
481 | ## |
482 | sub usage |
483 | { |
484 | local($exit, $message) = @_; |
485 | |
486 | print STDERR $message if defined $message; |
487 | local($base) = &show_version; |
488 | print STDERR <<INLINE_LITERAL_TEXT; |
489 | usage: $0 [options] URL ... |
490 | Fetches and displays the named URL(s). Supports http and ftp. |
491 | (if no protocol is given, a leading "http://" is normally used). |
492 | |
493 | Options are from among: |
494 | -V, -version Print version information; exit. |
495 | -p, -post If URL looks like a form reply, does POST instead of GET. |
496 | -pf, -postfile Like -post, but takes everything after ? to be a filename. |
497 | -q, -quiet All non-essential informational messages are suppressed. |
498 | -nf, -nofollow Don't follow "this document has moved" replies. |
499 | -nr, -noretry Doesn't retry a failed hostname lookup (same as -retry 0) |
500 | -r #, -retry # Sets failed-hostname-lookup-retry to # (default $retry) |
501 | -np, -noproxy Uses no proxy, even if one defined for the protocol. |
502 | -ns, -nostrip The HTTP header, normally elided, is printed. |
503 | -head gets item header only (implies -ns) |
504 | -nab, -file Dumps output to file whose name taken from URL, minus path |
505 | -nnab Like -nab, but *also* dumps to stdout. |
506 | -update HTTP only. Like -nab, but only if the page has been modified. |
507 | -h, -help Prints this message. |
508 | -IfNewerThan F HTTP only. Only brings page if it is newer than named file. |
509 | -timeout T Fail if a connection can't be made in the specified time. |
510 | |
511 | -updateme Pull the latest version of $base from |
512 | $WEB |
513 | and reports if it is newer than your current version. |
514 | |
515 | Comments to $comments. |
516 | INLINE_LITERAL_TEXT |
517 | |
518 | exit($exit) if defined $exit; |
519 | } |
520 | |
521 | ## |
522 | ## Pull the latest version of this program to a local file. |
523 | ## Clip the first couple lines from this executing file so that we |
524 | ## preserve the local invocation style. |
525 | ## |
526 | sub updateme |
527 | { |
528 | ## |
529 | ## Open a temp file to hold the new version, |
530 | ## redirecting STDOUT to it. |
531 | ## |
532 | open(STDOUT, '>'.($tempFile="/tmp/webget.new")) || |
533 | open(STDOUT, '>'.($tempFile="/usr/tmp/webget.new")) || |
534 | open(STDOUT, '>'.($tempFile="/webget.new")) || |
535 | open(STDOUT, '>'.($tempFile="webget.new")) || |
536 | die "$0: can't open a temp file.\n"; |
537 | |
538 | ## |
539 | ## See if we can figure out how we were called. |
540 | ## The seek will rewind not to the start of the data, but to the |
541 | ## start of the whole program script. |
542 | ## |
543 | ## Keep the first line if it begins with #!, and the next two if they |
544 | ## look like the trick mentioned in the perl man page for getting |
545 | ## around the lack of #!-support. |
546 | ## |
547 | if (seek(DATA, 0, 0)) { ## |
548 | $_ = <DATA>; if (m/^#!/) { print STDOUT; |
549 | $_ = <DATA>; if (m/^\s*eval/) { print STDOUT; |
550 | $_ = <DATA>; if (m/^\s*if/) { print STDOUT; } |
551 | } |
552 | } |
553 | print STDOUT "\n#-\n"; |
554 | } |
555 | |
556 | ## Go get the latest one... |
557 | local(@options); |
558 | push(@options, 'head') if $head; |
559 | push(@options, 'nofollow') unless $follow; |
560 | push(@options, ('retry') x $retry) if $retry; |
561 | push(@options, 'quiet') if $quiet; |
562 | push(@options, 'debug') if $debug; |
563 | local($status, $memo, %info) = &www'open_http_url(*IN, $WEB, @options); |
564 | die "fetching $WEB:\n $memo\n" unless $status eq 'ok'; |
565 | |
566 | $size = $info{'content-length'}; |
567 | while (<IN>) |
568 | { |
569 | $size -= length; |
570 | print STDOUT; |
571 | if (!defined $fetched_version && m/version\s*=\s*"([^"]+)"/) { |
572 | $fetched_version = $1; |
573 | &general_read(*IN, $size); |
574 | last; |
575 | } |
576 | } |
577 | |
578 | $fetched_version = "<unknown>" unless defined $fetched_version; |
579 | |
580 | ## |
581 | ## Try to update the mode of the temp file with the mode of this file. |
582 | ## Don't worry if it fails. |
583 | ## |
584 | chmod($mode, $tempFile) if $mode = (stat($0))[2]; |
585 | |
586 | $as_well = ''; |
587 | if ($fetched_version eq $version) |
588 | { |
589 | print STDERR "You already have the most-recent version ($version).\n", |
590 | qq/FWIW, the newly fetched one has been left in "$tempFile".\n/; |
591 | } |
592 | elsif ($fetched_version <= $version) |
593 | { |
594 | print STDERR |
595 | "Mmm, your current version seems newer (?!):\n", |
596 | qq/ your version: "$version"\n/, |
597 | qq/ new version: "$fetched_version"\n/, |
598 | qq/FWIW, fetched one left in "$tempFile".\n/; |
599 | } |
600 | else |
601 | { |
602 | print STDERR |
603 | "Indeed, your current version was old:\n", |
604 | qq/ your version: "$version"\n/, |
605 | qq/ new version: "$fetched_version"\n/, |
606 | qq/The file "$tempFile" is ready to replace the old one.\n/; |
607 | print STDERR qq/Just do:\n % mv $tempFile $0\n/ if -f $0; |
608 | $as_well = ' as well'; |
609 | } |
610 | print STDERR "Note that the libraries it uses may (or may not) need updating$as_well.\n" |
611 | unless $inlined; |
612 | exit($EXIT_ok); |
613 | } |
614 | |
615 | ## |
616 | ## Given a list of URLs, fetch'em. |
617 | ## Parses the URL and calls the routine for the appropriate protocol |
618 | ## |
619 | sub fetch_url |
620 | { |
621 | local(@todo) = @_; |
622 | local(%circref, %hold_circref); |
623 | |
624 | URL_LOOP: while (@todo) |
625 | { |
626 | $URL = shift(@todo); |
627 | %hold_circref = %circref; undef %circref; |
628 | |
629 | local($protocol, @args) = &www'grok_URL($URL, $noproxy); |
630 | |
631 | if (!defined $protocol) { |
632 | &www'message(1, qq/can't grok "$URL"/); |
633 | next URL_LOOP; |
634 | } |
635 | |
636 | ## call protocol-specific handler |
637 | $func = "fetch_via_" . $protocol; |
638 | $error = &$func(@args, $TimeoutSpan); |
639 | if (defined $error) { |
640 | &www'message(1, "$URL: $error"); |
641 | } else { |
642 | $SuccessfulCount++; |
643 | } |
644 | } |
645 | } |
646 | |
647 | sub filedate |
648 | { |
649 | local($filename) = @_; |
650 | local($filetime) = (stat($filename))[9]; |
651 | return 0 if !defined $filetime; |
652 | local($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($filetime); |
653 | return 0 if !defined $wday; |
654 | sprintf(qq/"%s, %02d-%s-%02d %02d:%02d:%02d GMT"/, |
655 | ("Sunday", "Monday", "Tuesdsy", "Wednesday", |
656 | "Thursday", "Friday", "Saturday")[$wday], |
657 | $mday, |
658 | ("Jan", "Feb", "Mar", "Apr", "May", "Jun", |
659 | "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[$mon], |
660 | $year, |
661 | $hour, |
662 | $min, |
663 | $sec); |
664 | } |
665 | |
666 | sub local_filename |
667 | { |
668 | local($filename) = @_; |
669 | $filename =~ s,/+$,,; ## remove any trailing slashes |
670 | $filename =~ s,.*/,,; ## remove any leading path |
671 | if ($filename eq '') { |
672 | ## empty -- pick a random name |
673 | $filename = "file0000"; |
674 | ## look for a free random name. |
675 | $filename++ while -f $filename; |
676 | } |
677 | $filename; |
678 | } |
679 | |
680 | sub set_output_file |
681 | { |
682 | local($filename) = @_; |
683 | if (!open(OUT, ">$filename")) { |
684 | &www'message(1, "$0: can't open [$filename] for output"); |
685 | } else { |
686 | open(SAVEOUT, ">>&STDOUT") || die "$!";; |
687 | open(STDOUT, ">>&OUT"); |
688 | } |
689 | } |
690 | |
691 | sub close_output_file |
692 | { |
693 | local($filename) = @_; |
694 | unless ($quiet) |
695 | { |
696 | local($note) = qq/"$filename" written/; |
697 | if (defined $error) { |
698 | $note .= " (possibly corrupt due to error above)"; |
699 | } |
700 | &www'message(1, "$note."); |
701 | } |
702 | close(STDOUT); |
703 | open(STDOUT, ">&SAVEOUT"); |
704 | } |
705 | |
706 | sub http_alarm |
707 | { |
708 | &www'message(1, "ERROR: $AlarmNote."); |
709 | exit($EXIT_timeout); ## the alarm doesn't seem to cause a waiting syscall to break? |
710 | # $HaveAlarm = 1; |
711 | } |
712 | |
713 | ## |
714 | ## Given the host, port, and path, and (for info only) real target, |
715 | ## fetch via HTTP. |
716 | ## |
717 | ## If there is a user and/or password, use that for Basic Authorization. |
718 | ## |
719 | ## If $timeout is nonzero, time out after that many seconds. |
720 | ## |
721 | sub fetch_via_http |
722 | { |
723 | local($host, $port, $path, $target, $user, $password, $timeout) = @_; |
724 | local(@options); |
725 | local($local_filename); |
726 | |
727 | ## |
728 | ## If we're posting, but -postfile was given, we need to interpret |
729 | ## the item in $path after '?' as a filename, and replace it with |
730 | ## the contents of the file. |
731 | ## |
732 | if ($postfile && $path =~ s/\?([\d\D]*)//) { |
733 | local($filename) = $1; |
734 | return("can't open [$filename] to POST") if !open(IN, "<$filename"); |
735 | local($/) = ''; ## want to suck up the whole file. |
736 | $path .= '?' . <IN>; |
737 | close(IN); |
738 | } |
739 | |
740 | $local_filename = &local_filename($path) |
741 | if $refresh || $nab || defined($reference_file); |
742 | $refresh = &filedate($local_filename) if $refresh; |
743 | $refresh = &filedate($reference_file) if defined($reference_file); |
744 | |
745 | push(@options, 'head') if $head; |
746 | push(@options, 'post') if $post; |
747 | push(@options, 'nofollow') unless $follow; |
748 | push(@options, ('retry') x 3); |
749 | push(@options, 'quiet') if $quiet; |
750 | push(@options, 'debug') if $debug; |
751 | push(@options, "ifmodifiedsince=$refresh") if $refresh; |
752 | |
753 | if (defined $password || defined $user) { |
754 | local($auth) = join(':', ($user || ''), ($password || '')); |
755 | push(@options, "authorization=$auth"); |
756 | } |
757 | |
758 | local($old_alarm); |
759 | if ($timeout) { |
760 | $old_alarm = $SIG{'ALRM'} || 'DEFAULT'; |
761 | $SIG{'ALRM'} = "main'http_alarm"; |
762 | # $HaveAlarm = 0; |
763 | $AlarmNote = "host $host"; |
764 | $AlarmNote .= ":$port" if $port != $www'default_port{'http'}; |
765 | $AlarmNote .= " timed out after $timeout second"; |
766 | $AlarmNote .= 's' if $timeout > 1; |
767 | alarm($timeout); |
768 | } |
769 | local($result, $memo, %info) = |
770 | &www'open_http_connection(*HTTP, $host,$port,$path,$target,@options); |
771 | |
772 | if ($timeout) { |
773 | alarm(0); |
774 | $SIG{'ALRM'} = $old_alarm; |
775 | } |
776 | |
777 | # if ($HaveAlarm) { |
778 | # close(HTTP); |
779 | # $error = "timeout after $timeout second"; |
780 | # $error .= "s" if $timeout > 1; |
781 | # return $error; |
782 | # } |
783 | |
784 | if ($follow && ($result eq 'follow')) { |
785 | %circref = %hold_circref; |
786 | $circref{$memo} = 1; |
787 | unshift(@todo, $memo); |
788 | return undef; |
789 | } |
790 | |
791 | |
792 | return $memo if $result eq 'error'; |
793 | if (!$quiet && $result eq 'status' && ! -t STDOUT) { |
794 | #&www'message(1, "Warning: $memo"); |
795 | $error = "Warning: $memo"; |
796 | } |
797 | |
798 | if ($info{'CODE'} == 304) { ## 304 is magic for "Not Modified" |
799 | close(HTTP); |
800 | &www'message(1, "$URL: Not Modified") unless $quiet; |
801 | $NotModifiedCount++; |
802 | return undef; ## no error |
803 | } |
804 | |
805 | |
806 | &set_output_file($local_filename) if $nab; |
807 | |
808 | unless($strip) { |
809 | print $info{'STATUS'}, "\n", $info{'HEADER'}, "\n"; |
810 | |
811 | print SAVEOUT $info{'STATUS'}, "\n", $info{'HEADER'}, "\n" if $nab==2; |
812 | } |
813 | |
814 | if (defined $info{'BODY'}) { |
815 | print $info{'BODY'}; |
816 | print SAVEOUT $info{'BODY'} if $nab==2; |
817 | } |
818 | |
819 | if (!$head) { |
820 | &general_read(*HTTP, $info{'content-length'}); |
821 | } |
822 | close(HTTP); |
823 | &close_output_file($local_filename) if $nab; |
824 | |
825 | $error; ## will be 'undef' if no error; |
826 | } |
827 | |
828 | sub fetch_via_ftp |
829 | { |
830 | local($host, $port, $path, $target, $user, $password, $timeout) = @_; |
831 | local($local_filename) = &local_filename($path); |
832 | local($ftp_debug) = $debug; |
833 | local(@password) = ($password); |
834 | $path =~ s,^/,,; ## remove a leading / from the path. |
835 | $path = '.' if $path eq ''; ## make sure we have something |
836 | |
837 | if (!defined $user) { |
838 | $user = 'anonymous'; |
839 | $password = $ENV{'USER'} || 'WWWuser'; |
840 | @password = ($password.'@'. &network'addr_to_ascii(&network'my_addr), |
841 | $password.'@'); |
842 | } elsif (!defined $password) { |
843 | @password = (""); |
844 | } |
845 | |
846 | local($_last_ftp_reply, $_passive_host, $_passive_port); |
847 | local($size); |
848 | |
849 | sub _ftp_get_reply |
850 | { |
851 | local($text) = scalar(<FTP_CONTROL>); |
852 | die "lost connection to $host\n" if !defined $text; |
853 | local($_, $tmp); |
854 | print STDERR "READ: $text" if $ftp_debug; |
855 | die "internal error: expected reply code in response from ". |
856 | "ftp server [$text]" unless $text =~ s/^(\d+)([- ])//; |
857 | local($code) = $1; |
858 | if ($2 eq '-') { |
859 | while (<FTP_CONTROL>) { |
860 | ($tmp = $_) =~ s/^\d+[- ]//; |
861 | $text .= $tmp; |
862 | last if m/^$code /; |
863 | } |
864 | } |
865 | $text =~ s/^\d+ ?/<foo>/g; |
866 | ($code, $text); |
867 | } |
868 | |
869 | sub _ftp_expect |
870 | { |
871 | local($code, $text) = &_ftp_get_reply; |
872 | $_last_ftp_reply = $text; |
873 | foreach $expect (@_) { |
874 | return ($code, $text) if $code == $expect; |
875 | } |
876 | die "internal error: expected return code ". |
877 | join('|',@_).", got [$text]"; |
878 | } |
879 | |
880 | sub _ftp_send |
881 | { |
882 | print STDERR "SEND: ", @_ if $ftp_debug; |
883 | print FTP_CONTROL @_; |
884 | } |
885 | |
886 | sub _ftp_do_passive |
887 | { |
888 | local(@commands) = @_; |
889 | |
890 | &_ftp_send("PASV\r\n"); |
891 | local($code) = &_ftp_expect(227, 125); |
892 | |
893 | if ($code == 227) |
894 | { |
895 | die "internal error: can't grok passive reply [$_last_ftp_reply]" |
896 | unless $_last_ftp_reply =~ m/\(([\d,]+)\)/; |
897 | local($a,$b,$c,$d, $p1, $p2) = split(/,/, $1); |
898 | ($_passive_host, $_passive_port) = |
899 | ("$a.$b.$c.$d", $p1*256 + $p2); |
900 | } |
901 | |
902 | foreach(@commands) { |
903 | &_ftp_send($_); |
904 | } |
905 | |
906 | local($error)= |
907 | &network'connect_to(*PASSIVE, $_passive_host, $_passive_port); |
908 | die "internal error: passive ftp connect [$error]" if $error; |
909 | } |
910 | |
911 | ## make the connection to the host |
912 | &www'message($debug, "connecting to $host...") unless $quiet; |
913 | |
914 | local($old_alarm); |
915 | if ($timeout) { |
916 | $old_alarm = $SIG{'ALRM'} || 'DEFAULT'; |
917 | $SIG{'ALRM'} = "main'http_alarm"; ## can use this for now |
918 | # $HaveAlarm = 0; |
919 | $AlarmNote = "host $host"; |
920 | $AlarmNote .= ":$port" if $port != $www'default_port{'ftp'}; |
921 | $AlarmNote .= " timed out after $timeout second"; |
922 | $AlarmNote .= 's' if $timeout > 1; |
923 | alarm($timeout); |
924 | } |
925 | |
926 | local($error) = &network'connect_to(*FTP_CONTROL, $host, $port); |
927 | |
928 | if ($timeout) { |
929 | alarm(0); |
930 | $SIG{'ALRM'} = $old_alarm; |
931 | } |
932 | |
933 | return $error if $error; |
934 | |
935 | local ($code, $text) = &_ftp_get_reply(*FTP_CONTROL); |
936 | close(FTP_CONTROL), return "internal ftp error: [$text]" unless $code==220; |
937 | |
938 | ## log in |
939 | &www'message($debug, "logging in as $user...") unless $quiet; |
940 | foreach $password (@password) |
941 | { |
942 | &_ftp_send("USER $user\r\n"); |
943 | ($code, $text) = &_ftp_expect(230,331,530); |
944 | close(FTP_CONTROL), return $text if ($code == 530); |
945 | last if $code == 230; ## hey, already logged in, cool. |
946 | |
947 | &_ftp_send("PASS $password\r\n"); |
948 | ($code, $text) = &_ftp_expect(220,230,530,550,332); |
949 | last if $code != 550; |
950 | last if $text =~ m/can't change directory/; |
951 | } |
952 | |
953 | if ($code == 550) |
954 | { |
955 | $text =~ s/\n+$//; |
956 | &www'message(1, "Can't log in $host: $text") unless $quiet; |
957 | exit($EXIT_error); |
958 | } |
959 | |
960 | if ($code == 332) |
961 | { |
962 | &_ftp_send("ACCT noaccount\r\n"); |
963 | ($code, $text) = &_ftp_expect(230, 202, 530, 500,501,503, 421) |
964 | } |
965 | close(FTP_CONTROL), return $text if $code >= 300; |
966 | |
967 | &_ftp_send("TYPE I\r\n"); |
968 | &_ftp_expect(200); |
969 | |
970 | unless ($quiet) { |
971 | local($name) = $path; |
972 | $name =~ s,.*/([^/]),$1,; |
973 | &www'message($debug, "requesting $name..."); |
974 | } |
975 | ## get file |
976 | &_ftp_do_passive("RETR $path\r\n"); |
977 | ($code,$text) = &_ftp_expect(125, 150, 550, 530); |
978 | close(FTP_CONTROL), return $text if $code == 530; |
979 | |
980 | if ($code == 550) |
981 | { |
982 | close(PASSIVE); |
983 | if ($text =~ /directory/i) { |
984 | ## probably from "no such file or directory", so just return now. |
985 | close(FTP_CONTROL); |
986 | return $text; |
987 | } |
988 | |
989 | ## do like Mosaic and try getting a directory listing. |
990 | &_ftp_send("CWD $path\r\n"); |
991 | ($code) = &_ftp_expect(250,550); |
992 | if ($code == 550) { |
993 | close(FTP_CONTROL); |
994 | return $text; |
995 | } |
996 | &_ftp_do_passive("LIST\r\n"); |
997 | &_ftp_expect(125, 150); |
998 | } |
999 | |
1000 | $size = $1 if $text =~ m/(\d+)\s+bytes/; |
1001 | binmode(PASSIVE); ## just in case. |
1002 | &www'message($debug, "waiting for data...") unless $quiet; |
1003 | &set_output_file($local_filename) if $nab; |
1004 | &general_read(*PASSIVE, $size); |
1005 | &close_output_file($local_filename) if $nab; |
1006 | |
1007 | close(PASSIVE); |
1008 | close(FTP_CONTROL); |
1009 | undef; |
1010 | } |
1011 | |
1012 | sub general_read |
1013 | { |
1014 | local(*INPUT, $size) = @_; |
1015 | local($lastcount, $bytes) = (0,0); |
1016 | local($need_to_clear) = 0; |
1017 | local($start_time) = time; |
1018 | local($last_time, $time) = $start_time; |
1019 | ## Figure out how often to print the "bytes read" message |
1020 | local($delta2print) = |
1021 | (defined $size) ? int($size/50) : $defaultdelta2print; |
1022 | |
1023 | &www'message(0, "read 0 bytes") unless $quiet; |
1024 | |
1025 | ## so $! below is set only if a real error happens from now |
1026 | eval 'local($^W) = 0; undef $!'; |
1027 | |
1028 | |
1029 | while (defined($_ = <INPUT>)) |
1030 | { |
1031 | ## shove it out. |
1032 | &www'clear_message if $need_to_clear; |
1033 | print; |
1034 | print SAVEOUT if $nab==2; |
1035 | |
1036 | ## if we know the content-size, keep track of what we're reading. |
1037 | $bytes += length; |
1038 | |
1039 | last if eof || (defined $size && $bytes >= $size); |
1040 | |
1041 | if (!$quiet && $bytes > ($lastcount + $delta2print)) |
1042 | { |
1043 | if ($time = time, $last_time == $time) { |
1044 | $delta2print *= 1.5; |
1045 | } else { |
1046 | $last_time = $time; |
1047 | $lastcount = $bytes; |
1048 | local($time_delta) = $time - $start_time; |
1049 | local($text); |
1050 | |
1051 | $delta2print /= $time_delta; |
1052 | if (defined $size) { |
1053 | $text = sprintf("read $bytes bytes (%.0f%%)", |
1054 | $bytes*100/$size); |
1055 | } else { |
1056 | $text = "read $bytes bytes"; |
1057 | } |
1058 | |
1059 | if ($time_delta > 5 || ($time_delta && $bytes > 10240)) |
1060 | { |
1061 | local($rate) = int($bytes / $time_delta); |
1062 | if ($rate < 5000) { |
1063 | $text .= " ($rate bytes/sec)"; |
1064 | } elsif ($rate < 1024 * 10) { |
1065 | $text .= sprintf(" (%.1f k/sec)", $rate/1024); |
1066 | } else { |
1067 | $text .= sprintf(" (%.0f k/sec)", $rate/1024); |
1068 | } |
1069 | } |
1070 | &www'message(0, "$text..."); |
1071 | $need_to_clear = -t STDOUT; |
1072 | } |
1073 | } |
1074 | } |
1075 | |
1076 | if (!$quiet) |
1077 | { |
1078 | if ($size && ($size != $bytes)) { |
1079 | &www'message("WARNING: Expected $size bytes, read $bytes bytes.\n"); |
1080 | } |
1081 | # if ($!) { |
1082 | # print STDERR "\$! is [$!]\n"; |
1083 | # } |
1084 | # if ($@) { |
1085 | # print STDERR "\$\@ is [$@]\n"; |
1086 | # } |
1087 | } |
1088 | &www'clear_message($text) unless $quiet; |
1089 | } |
1090 | |
1091 | sub dummy { |
1092 | 1 || &dummy || &fetch_via_ftp || &fetch_via_http || &http_alarm; |
1093 | 1 || close(OUT); |
1094 | 1 || close(SAVEOUT); |
1095 | } |
1096 | |
1097 | __END__ |
1098 | __END__ |
1099 | :endofperl |