Commit | Line | Data |
d444a431 |
1 | #!/usr/local/bin/perl -w |
2 | |
3 | #- |
4 | #!/usr/local/bin/perl -w |
5 | $version = "951121.18"; |
6 | $comments = 'jfriedl@omron.co.jp'; |
7 | |
8 | ## |
9 | ## This is "webget" |
10 | ## |
11 | ## Jeffrey Friedl (jfriedl@omron.co.jp), July 1994. |
12 | ## Copyright 19.... ah hell, just take it. |
13 | ## Should work with either perl4 or perl5 |
14 | ## |
15 | ## BLURB: |
16 | ## Given a URL on the command line (HTTP and FTP supported at the moment), |
17 | ## webget fetches the named object (HTML text, images, audio, whatever the |
18 | ## object happens to be). Will automatically use a proxy if one is defined |
19 | ## in the environment, follow "this URL has moved" responses, and retry |
20 | ## "can't find host" responses from a proxy in case host lookup was slow). |
21 | ## Supports users & passwords (FTP), Basic Authorization (HTTP), update-if- |
22 | ## modified (HTTP), and much more. Works with perl4 or perl5. |
23 | |
24 | ## |
25 | ## More-detailed instructions in the comment block below the history list. |
26 | ## |
27 | |
28 | ## |
29 | ## To-do: |
30 | ## Add gopher support. |
31 | ## Fix up how error messages are passed among this and the libraries. |
32 | ## |
33 | |
34 | ## 951219.19 |
35 | ## Lost ftp connections now die with a bit more grace. |
36 | ## |
37 | ## 951121.18 |
38 | ## Add -nnab. |
39 | ## Brought the "usage" string in line with reality. |
40 | ## |
41 | ## 951114.17 |
42 | ## Added -head. |
43 | ## Added -update/-refresh/-IfNewerThan. If any URL was not pulled |
44 | ## because it was not out of date, an exit value of 2 is returned. |
45 | ## |
46 | ## 951031.16 |
47 | ## Added -timeout. Cleaned up (a bit) the exit value. Now exits |
48 | ## with 1 if all URLs had some error (timeout exits immediately with |
49 | ## code 3, though. This is subject to change). Exits with 0 if any |
50 | ## URL was brought over safely. |
51 | ## |
52 | ## 951017.15 |
53 | ## Neat -pf, -postfile idea from Lorrie Cranor |
54 | ## (http://www.ccrc.wustl.edu/~lorracks/) |
55 | ## |
56 | ## 950912.14 |
57 | ## Sigh, fixed a typo. |
58 | ## |
59 | ## 950911.13 |
60 | ## Added Basic Authorization support for http. See "PASSWORDS AND STUFF" |
61 | ## in the documentation. |
62 | ## |
63 | ## 950911.12 |
64 | ## Implemented a most-excellent suggestion by Anthony D'Atri |
65 | ## (aad@nwnet.net), to be able to automatically grab to a local file of |
66 | ## the same name as the URL. See the '-nab' flag. |
67 | ## |
68 | ## 950706.11 |
69 | ## Quelled small -w warning (thanks: Lars Rasmussen <gnort@daimi.aau.dk>) |
70 | ## |
71 | ## 950630.10 |
72 | ## Steve Campbell to the rescue again. FTP now works when supplied |
73 | ## with a userid & password (eg ftp://user:pass@foo.bar.com/index.txt). |
74 | ## |
75 | ## 950623.9 |
76 | ## Incorporated changes from Steve Campbell (steven_campbell@uk.ibm.com) |
77 | ## so that the ftp will work when no password is required of a user. |
78 | ## |
79 | ## 950530.8 |
80 | ## Minor changes: |
81 | ## Eliminate read-size warning message when size unknown. |
82 | ## Pseudo-debug/warning messages at the end of debug_read now go to |
83 | ## stderr. Some better error handling when trying to contact systems |
84 | ## that aren't really set up for ftp. Fixed a bug concerning FTP access |
85 | ## to a root directory. Added proxy documentation at head of file. |
86 | ## |
87 | ## 950426.6,7 |
88 | ## Complete Overhaul: |
89 | ## Renamed from httpget. Added ftp support (very sketchy at the moment). |
90 | ## Redid to work with new 'www.pl' library; chucked 'Www.pl' library. |
91 | ## More or less new and/or improved in many ways, but probably introduced |
92 | ## a few bugs along the way. |
93 | ## |
94 | ## 941227.5 |
95 | ## Added follow stuff (with -nofollow, etc.) |
96 | ## Added -updateme. Cool! |
97 | ## Some general tidying up. |
98 | ## |
99 | ## 941107.4 |
100 | ## Allowed for ^M ending a header line... PCs give those kind of headers. |
101 | ## |
102 | ## 940820.3 |
103 | ## First sorta'clean net release. |
104 | ## |
105 | ## |
106 | |
107 | ## |
108 | ##> |
109 | ## |
110 | ## Fetch http and/or ftp URL(s) given on the command line and spit to |
111 | ## STDOUT. |
112 | ## |
113 | ## Options include: |
114 | ## -V, -version |
115 | ## Print version information; exit. |
116 | ## |
117 | ## -p, -post |
118 | ## If the URL looks like a reply to a form (i.e. has a '?' in it), |
119 | ## the request is POST'ed instead of GET'ed. |
120 | ## |
121 | ## -head |
122 | ## Gets the header only (for HTTP). This might include such useful |
123 | ## things as 'Last-modified' and 'Content-length' fields |
124 | ## (a lack of a 'Last-modified' might be a good indication that it's |
125 | ## a CGI). |
126 | ## |
127 | ## The "-head" option implies "-nostrip", but does *not* imply, |
128 | ## for example "-nofollow". |
129 | ## |
130 | ## |
131 | ## -pf, -postfile |
132 | ## The item after the '?' is taken as a local filename, and the contents |
133 | ## are POST'ed as with -post |
134 | ## |
135 | ## -nab, -f, -file |
136 | ## Rather than spit the URL(s) to standard output, unconditionally |
137 | ## dump to a file (or files) whose name is that as used in the URL, |
138 | ## sans path. I like '-nab', but supply '-file' as well since that's |
139 | ## what was originally suggested. Also see '-update' below for the |
140 | ## only-if-changed version. |
141 | ## |
142 | ## -nnab |
143 | ## Like -nab, but in addtion to dumping to a file, dump to stdout as well. |
144 | ## Sort of like the 'tee' command. |
145 | ## |
146 | ## -update, -refresh |
147 | ## Do the same thing as -nab, etc., but does not bother pulling the |
148 | ## URL if it older than the localfile. Only applies to HTTP. |
149 | ## Uses the HTTP "If-Modified-Since" field. If the URL was not modified |
150 | ## (and hence not changed), the return value is '2'. |
151 | ## |
152 | ## -IfNewerThan FILE |
153 | ## -int FILE |
154 | ## Only pulls URLs if they are newer than the date the local FILE was |
155 | ## last written. |
156 | ## |
157 | ## -q, -quiet |
158 | ## Suppresses all non-essential informational messages. |
159 | ## |
160 | ## -nf, -nofollow |
161 | ## Normally, a "this URL has moved" HTTP response is automatically |
162 | ## followed. Not done with -nofollow. |
163 | ## |
164 | ## -nr, -noretry |
165 | ## Normally, an HTTP proxy response of "can't find host" is retried |
166 | ## up to three times, to give the remote hostname lookup time to |
167 | ## come back with an answer. This suppresses the retries. This is the |
168 | ## same as '-retry 0'. |
169 | ## |
170 | ## -r#, -retry#, -r #, -retry # |
171 | ## Sets the number of times to retry. Default 3. |
172 | ## |
173 | ## -ns, -nostrip |
174 | ## For HTTP items (including other items going through an HTTP proxy), |
175 | ## the HTTP response header is printed rather than stripped as default. |
176 | ## |
177 | ## -np, -noproxy |
178 | ## A proxy is not used, even if defined for the protocol. |
179 | ## |
180 | ## -h, -help |
181 | ## Show a usage message and exit. |
182 | ## |
183 | ## -d, -debug |
184 | ## Show some debugging messages. |
185 | ## |
186 | ## -updateme |
187 | ## The special and rather cool flag "-updateme" will see if webget has |
188 | ## been updated since you got your version, and prepare a local |
189 | ## version of the new version for you to use. Keep updated! (although |
190 | ## you can always ask to be put on the ping list to be notified when |
191 | ## there's a new version -- see the author's perl web page). |
192 | ## |
193 | ## -timeout TIMESPAN |
194 | ## -to TIMESPAN |
195 | ## Time out if a connection can not be made within the specified time |
196 | ## period. TIMESPAN is normally in seconds, although a 'm' or 'h' may |
197 | ## be appended to indicate minutes and hours. "-to 1.5m" would timeout |
198 | ## after 90 seconds. |
199 | ## |
200 | ## (At least for now), a timeout causes immediate program death (with |
201 | ## exit value 3). For some reason, the alarm doesn't always cause a |
202 | ## waiting read or connect to abort, so I just die immediately.. /-: |
203 | ## |
204 | ## I might consider adding an "entire fetch" timeout, if someone |
205 | ## wants it. |
206 | ## |
207 | ## PASSWORDS AND SUCH |
208 | ## |
209 | ## You can use webget to do FTP fetches from non-Anonymous systems and |
210 | ## accounts. Just put the required username and password into the URL, |
211 | ## as with |
212 | ## webget 'ftp:/user:password@ftp.somesite.com/pub/pix/babe.gif |
213 | ## ^^^^^^^^^^^^^ |
214 | ## Note the user:password is separated from the hostname by a '@'. |
215 | ## |
216 | ## You can use the same kind of thing with HTTP, and if so it will provide |
217 | ## what's know as Basic Authorization. This is >weak< authorization. It |
218 | ## also provides >zero< security -- I wouldn't be sending any credit-card |
219 | ## numbers this way (unless you send them 'round my way :-). It seems to |
220 | ## be used most by providers of free stuff where they want to make some |
221 | ## attempt to limit access to "known users". |
222 | ## |
223 | ## PROXY STUFF |
224 | ## |
225 | ## If you need to go through a gateway to get out to the whole internet, |
226 | ## you can use a proxy if one's been set up on the gateway. This is done |
227 | ## by setting the "http_proxy" environmental variable to point to the |
228 | ## proxy server. Other variables are used for other target protocols.... |
229 | ## "gopher_proxy", "ftp_proxy", "wais_proxy", etc. |
230 | ## |
231 | ## For example, I have the following in my ".login" file (for use with csh): |
232 | ## |
233 | ## setenv http_proxy http://local.gateway.machine:8080/ |
234 | ## |
235 | ## This is to indicate that any http URL should go to local.gateway.machine |
236 | ## (port 8080) via HTTP. Additionally, I have |
237 | ## |
238 | ## setenv gopher_proxy "$http_proxy" |
239 | ## setenv wais_proxy "$http_proxy" |
240 | ## setenv ftp_proxy "$http_proxy" |
241 | ## |
242 | ## This means that any gopher, wais, or ftp URL should also go to the |
243 | ## same place, also via HTTP. This allows webget to get, for example, |
244 | ## GOPHER URLs even though it doesn't support GOPHER itself. It uses HTTP |
245 | ## to talk to the proxy, which then uses GOPHER to talk to the destination. |
246 | ## |
247 | ## Finally, if there are sites inside your gateway that you would like to |
248 | ## connect to, you can list them in the "no_proxy" variable. This will allow |
249 | ## you to connect to them directly and skip going through the proxy: |
250 | ## |
251 | ## setenv no_proxy "www.this,www.that,www.other" |
252 | ## |
253 | ## I (jfriedl@omron.co.jp) have little personal experience with proxies |
254 | ## except what I deal with here at Omron, so if this is not representative |
255 | ## of your situation, please let me know. |
256 | ## |
257 | ## RETURN VALUE |
258 | ## The value returned to the system by webget is rather screwed up because |
259 | ## I didn't think about dealing with it until things were already |
260 | ## complicated. Since there can be more than one URL on the command line, |
261 | ## it's hard to decide what to return when one times out, another is fetched, |
262 | ## another doesn't need to be fetched, and a fourth isn't found. |
263 | ## |
264 | ## So, here's the current status: |
265 | ## |
266 | ## Upon any timeout (via the -timeout arg), webget immediately |
267 | ## returns 3. End of story. Otherwise.... |
268 | ## |
269 | ## If any URL was fetched with a date limit (i.e. via |
270 | ## '-update/-refresh/-IfNewerThan' and was found to not have changed, |
271 | ## 2 is returned. Otherwise.... |
272 | ## |
273 | ## If any URL was successfully fetched, 0 is returned. Otherwise... |
274 | ## |
275 | ## If there were any errors, 1 is returned. Otherwise... |
276 | ## |
277 | ## Must have been an info-only or do-nothing instance. 0 is returned. |
278 | ## |
279 | ## Phew. Hopefully useful to someone. |
280 | ##< |
281 | ## |
282 | |
283 | ## Where latest version should be. |
284 | $WEB_normal = 'http://www.wg.omron.co.jp/~jfriedl/perl/webget'; |
285 | $WEB_inlined = 'http://www.wg.omron.co.jp/~jfriedl/perl/inlined/webget'; |
286 | |
287 | |
288 | require 'network.pl'; ## inline if possible (directive to a tool of mine) |
289 | require 'www.pl'; ## inline if possible (directive to a tool of mine) |
290 | $inlined=0; ## this might be changed by a the inline thing. |
291 | |
292 | ## |
293 | ## Exit values. All screwed up. |
294 | ## |
295 | $EXIT_ok = 0; |
296 | $EXIT_error = 1; |
297 | $EXIT_notmodified = 2; |
298 | $EXIT_timeout = 3; |
299 | |
300 | ## |
301 | ## |
302 | |
303 | warn qq/WARNING:\n$0: need a newer version of "network.pl"\n/ if |
304 | !defined($network'version) || $network'version < "950311.5"; |
305 | warn qq/WARNING:\n$0: need a newer version of "www.pl"\n/ if |
306 | !defined($www'version) || $www'version < "951114.8"; |
307 | |
308 | $WEB = $inlined ? $WEB_inlined : $WEB_normal; |
309 | |
310 | $debug = 0; |
311 | $strip = 1; ## default is to strip |
312 | $quiet = 0; ## also normally off. |
313 | $follow = 1; ## normally, we follow "Found (302)" links |
314 | $retry = 3; ## normally, retry proxy hostname lookups up to 3 times. |
315 | $nab = 0; ## If true, grab to a local file of the same name. |
316 | $refresh = 0; ## If true, use 'If-Modified-Since' with -nab get. |
317 | $postfile = 0; ## If true, filename is given after the '?' |
318 | $defaultdelta2print = 2048; |
319 | $TimeoutSpan = 0; ## seconds after which we should time out. |
320 | |
321 | while (@ARGV && $ARGV[0] =~ m/^-/) |
322 | { |
323 | $arg = shift(@ARGV); |
324 | |
325 | $nab = 1, next if $arg =~ m/^-f(ile)?$/; |
326 | $nab = 1, next if $arg =~ m/^-nab$/; |
327 | $nab = 2, next if $arg =~ m/^-nnab$/; |
328 | $post = 1, next if $arg =~ m/^-p(ost)?$/i; |
329 | $post = $postfile = 1, next if $arg =~ m/^-p(ost)?f(ile)?$/i; |
330 | $quiet=1, next if $arg =~ m/^-q(uiet)?$/; |
331 | $follow = 0, next if $arg =~ m/^-no?f(ollow)?$/; |
332 | $strip = 0, next if $arg =~ m/^-no?s(trip)?$/; |
333 | $debug=1, next if $arg =~ m/^-d(ebug)?$/; |
334 | $noproxy=1, next if $arg =~ m/^-no?p(roxy)?$/; |
335 | $retry=0, next if $arg =~ m/^-no?r(etry)?$/; |
336 | $retry=$2, next if $arg =~ m/^-r(etry)?(\d+)$/; |
337 | &updateme if $arg eq '-updateme'; |
338 | $strip = 0, $head = 1, next if $arg =~ m/^-head(er)?/; |
339 | $nab = $refresh = 1, next if $arg =~ m/^-(refresh|update)/; |
340 | |
341 | &usage($EXIT_ok) if $arg =~ m/^-h(elp)?$/; |
342 | &show_version, exit($EXIT_ok) if $arg eq '-version' || $arg eq '-V'; |
343 | |
344 | if ($arg =~ m/^-t(ime)?o(ut)?$/i) { |
345 | local($num) = shift(@ARGV); |
346 | &usage($EXIT_error, "expecting timespan argument to $arg\n") unless |
347 | $num =~ m/^\d+(\d*)?[hms]?$/; |
348 | &timeout_arg($num); |
349 | next; |
350 | } |
351 | |
352 | if ($arg =~ m/^-if?n(ewer)?t(han)?$/i) { |
353 | $reference_file = shift(@ARGV); |
354 | &usage($EXIT_error, "expecting filename arg to $arg") |
355 | if !defined $reference_file; |
356 | if (!-f $reference_file) { |
357 | warn qq/$0: ${arg}'s "$reference_file" not found.\n/; |
358 | exit($EXIT_error); |
359 | } |
360 | next; |
361 | } |
362 | |
363 | if ($arg eq '-r' || $arg eq '-retry') { |
364 | local($num) = shift(@ARGV); |
365 | &usage($EXIT_error, "expecting numerical arg to $arg\n") unless |
366 | defined($num) && $num =~ m/^\d+$/; |
367 | $retry = $num; |
368 | next; |
369 | } |
370 | &usage($EXIT_error, qq/$0: unknown option "$arg"\n/); |
371 | } |
372 | |
373 | if ($head && $post) { |
374 | warn "$0: combining -head and -post makes no sense, ignoring -post.\n"; |
375 | $post = 0; |
376 | undef $postfile; |
377 | } |
378 | |
379 | if ($refresh && defined($reference_file)) { |
380 | warn "$0: combining -update and -IfNewerThan make no sense, ignoring -IfNewerThan.\n"; |
381 | undef $reference_file; |
382 | } |
383 | |
384 | if (@ARGV == 0) { |
385 | warn "$0: nothing to do. Use -help for info.\n"; |
386 | exit($EXIT_ok); |
387 | } |
388 | |
389 | |
390 | ## |
391 | ## Now run through the remaining arguments (mostly URLs) and do a quick |
392 | ## check to see if they look well-formed. We won't *do* anything -- just |
393 | ## want to catch quick errors before really starting the work. |
394 | ## |
395 | @tmp = @ARGV; |
396 | $errors = 0; |
397 | while (@tmp) { |
398 | $arg = shift(@tmp); |
399 | if ($arg =~ m/^-t(ime)?o(ut)?$/) { |
400 | local($num) = shift(@tmp); |
401 | if ($num !~ m/^\d+(\d*)?[hms]?$/) { |
402 | &warn("expecting timespan argument to $arg\n"); |
403 | $errors++; |
404 | } |
405 | } else { |
406 | local($protocol) = &www'grok_URL($arg, $noproxy); |
407 | |
408 | if (!defined $protocol) { |
409 | warn qq/can't grok "$arg"/; |
410 | $errors++; |
411 | } elsif (!$quiet && ($protocol eq 'ftp')) { |
412 | warn qq/warning: -head ignored for ftp URLs\n/ if $head; |
413 | warn qq/warning: -refresh ignored for ftp URLs\n/if $refresh; |
414 | warn qq/warning: -IfNewerThan ignored for ftp URLs\n/if defined($reference_file); |
415 | |
416 | } |
417 | } |
418 | } |
419 | |
420 | exit($EXIT_error) if $errors; |
421 | |
422 | |
423 | $SuccessfulCount = 0; |
424 | $NotModifiedCount = 0; |
425 | |
426 | ## |
427 | ## Now do the real thing. |
428 | ## |
429 | while (@ARGV) { |
430 | $arg = shift(@ARGV); |
431 | if ($arg =~ m/^-t(ime)?o(ut)?$/) { |
432 | &timeout_arg(shift(@ARGV)); |
433 | } else { |
434 | &fetch_url($arg); |
435 | } |
436 | } |
437 | |
438 | if ($NotModifiedCount) { |
439 | exit($EXIT_notmodified); |
440 | } elsif ($SuccessfulCount) { |
441 | exit($EXIT_ok); |
442 | } else { |
443 | exit($EXIT_error); |
444 | } |
445 | |
446 | ########################################################################### |
447 | ########################################################################### |
448 | |
449 | sub timeout_arg |
450 | { |
451 | ($TimeoutSpan) = @_; |
452 | $TimeoutSpan =~ s/s//; |
453 | $TimeoutSpan *= 60 if $TimeoutSpan =~ m/m/; |
454 | $TimeoutSpan *= 3600 if $TimeoutSpan =~ m/h/; |
455 | |
456 | } |
457 | |
458 | ## |
459 | ## As a byproduct, returns the basename of $0. |
460 | ## |
461 | sub show_version |
462 | { |
463 | local($base) = $0; |
464 | $base =~ s,.*/,,; |
465 | print STDERR "This is $base version $version\n"; |
466 | $base; |
467 | } |
468 | |
469 | ## |
470 | ## &usage(exitval, message); |
471 | ## |
472 | ## Prints a usage message to STDERR. |
473 | ## If MESSAGE is defined, prints that first. |
474 | ## If exitval is defined, exits with that value. Otherwise, returns. |
475 | ## |
476 | sub usage |
477 | { |
478 | local($exit, $message) = @_; |
479 | |
480 | print STDERR $message if defined $message; |
481 | local($base) = &show_version; |
482 | print STDERR <<INLINE_LITERAL_TEXT; |
483 | usage: $0 [options] URL ... |
484 | Fetches and displays the named URL(s). Supports http and ftp. |
485 | (if no protocol is given, a leading "http://" is normally used). |
486 | |
487 | Options are from among: |
488 | -V, -version Print version information; exit. |
489 | -p, -post If URL looks like a form reply, does POST instead of GET. |
490 | -pf, -postfile Like -post, but takes everything after ? to be a filename. |
491 | -q, -quiet All non-essential informational messages are suppressed. |
492 | -nf, -nofollow Don't follow "this document has moved" replies. |
493 | -nr, -noretry Doesn't retry a failed hostname lookup (same as -retry 0) |
494 | -r #, -retry # Sets failed-hostname-lookup-retry to # (default $retry) |
495 | -np, -noproxy Uses no proxy, even if one defined for the protocol. |
496 | -ns, -nostrip The HTTP header, normally elided, is printed. |
497 | -head gets item header only (implies -ns) |
498 | -nab, -file Dumps output to file whose name taken from URL, minus path |
499 | -nnab Like -nab, but *also* dumps to stdout. |
500 | -update HTTP only. Like -nab, but only if the page has been modified. |
501 | -h, -help Prints this message. |
502 | -IfNewerThan F HTTP only. Only brings page if it is newer than named file. |
503 | -timeout T Fail if a connection can't be made in the specified time. |
504 | |
505 | -updateme Pull the latest version of $base from |
506 | $WEB |
507 | and reports if it is newer than your current version. |
508 | |
509 | Comments to $comments. |
510 | INLINE_LITERAL_TEXT |
511 | |
512 | exit($exit) if defined $exit; |
513 | } |
514 | |
515 | ## |
516 | ## Pull the latest version of this program to a local file. |
517 | ## Clip the first couple lines from this executing file so that we |
518 | ## preserve the local invocation style. |
519 | ## |
520 | sub updateme |
521 | { |
522 | ## |
523 | ## Open a temp file to hold the new version, |
524 | ## redirecting STDOUT to it. |
525 | ## |
526 | open(STDOUT, '>'.($tempFile="/tmp/webget.new")) || |
527 | open(STDOUT, '>'.($tempFile="/usr/tmp/webget.new")) || |
528 | open(STDOUT, '>'.($tempFile="/webget.new")) || |
529 | open(STDOUT, '>'.($tempFile="webget.new")) || |
530 | die "$0: can't open a temp file.\n"; |
531 | |
532 | ## |
533 | ## See if we can figure out how we were called. |
534 | ## The seek will rewind not to the start of the data, but to the |
535 | ## start of the whole program script. |
536 | ## |
537 | ## Keep the first line if it begins with #!, and the next two if they |
538 | ## look like the trick mentioned in the perl man page for getting |
539 | ## around the lack of #!-support. |
540 | ## |
541 | if (seek(DATA, 0, 0)) { ## |
542 | $_ = <DATA>; if (m/^#!/) { print STDOUT; |
543 | $_ = <DATA>; if (m/^\s*eval/) { print STDOUT; |
544 | $_ = <DATA>; if (m/^\s*if/) { print STDOUT; } |
545 | } |
546 | } |
547 | print STDOUT "\n#-\n"; |
548 | } |
549 | |
550 | ## Go get the latest one... |
551 | local(@options); |
552 | push(@options, 'head') if $head; |
553 | push(@options, 'nofollow') unless $follow; |
554 | push(@options, ('retry') x $retry) if $retry; |
555 | push(@options, 'quiet') if $quiet; |
556 | push(@options, 'debug') if $debug; |
557 | local($status, $memo, %info) = &www'open_http_url(*IN, $WEB, @options); |
558 | die "fetching $WEB:\n $memo\n" unless $status eq 'ok'; |
559 | |
560 | $size = $info{'content-length'}; |
561 | while (<IN>) |
562 | { |
563 | $size -= length; |
564 | print STDOUT; |
565 | if (!defined $fetched_version && m/version\s*=\s*"([^"]+)"/) { |
566 | $fetched_version = $1; |
567 | &general_read(*IN, $size); |
568 | last; |
569 | } |
570 | } |
571 | |
572 | $fetched_version = "<unknown>" unless defined $fetched_version; |
573 | |
574 | ## |
575 | ## Try to update the mode of the temp file with the mode of this file. |
576 | ## Don't worry if it fails. |
577 | ## |
578 | chmod($mode, $tempFile) if $mode = (stat($0))[2]; |
579 | |
580 | $as_well = ''; |
581 | if ($fetched_version eq $version) |
582 | { |
583 | print STDERR "You already have the most-recent version ($version).\n", |
584 | qq/FWIW, the newly fetched one has been left in "$tempFile".\n/; |
585 | } |
586 | elsif ($fetched_version <= $version) |
587 | { |
588 | print STDERR |
589 | "Mmm, your current version seems newer (?!):\n", |
590 | qq/ your version: "$version"\n/, |
591 | qq/ new version: "$fetched_version"\n/, |
592 | qq/FWIW, fetched one left in "$tempFile".\n/; |
593 | } |
594 | else |
595 | { |
596 | print STDERR |
597 | "Indeed, your current version was old:\n", |
598 | qq/ your version: "$version"\n/, |
599 | qq/ new version: "$fetched_version"\n/, |
600 | qq/The file "$tempFile" is ready to replace the old one.\n/; |
601 | print STDERR qq/Just do:\n % mv $tempFile $0\n/ if -f $0; |
602 | $as_well = ' as well'; |
603 | } |
604 | print STDERR "Note that the libraries it uses may (or may not) need updating$as_well.\n" |
605 | unless $inlined; |
606 | exit($EXIT_ok); |
607 | } |
608 | |
609 | ## |
610 | ## Given a list of URLs, fetch'em. |
611 | ## Parses the URL and calls the routine for the appropriate protocol |
612 | ## |
613 | sub fetch_url |
614 | { |
615 | local(@todo) = @_; |
616 | local(%circref, %hold_circref); |
617 | |
618 | URL_LOOP: while (@todo) |
619 | { |
620 | $URL = shift(@todo); |
621 | %hold_circref = %circref; undef %circref; |
622 | |
623 | local($protocol, @args) = &www'grok_URL($URL, $noproxy); |
624 | |
625 | if (!defined $protocol) { |
626 | &www'message(1, qq/can't grok "$URL"/); |
627 | next URL_LOOP; |
628 | } |
629 | |
630 | ## call protocol-specific handler |
631 | $func = "fetch_via_" . $protocol; |
632 | $error = &$func(@args, $TimeoutSpan); |
633 | if (defined $error) { |
634 | &www'message(1, "$URL: $error"); |
635 | } else { |
636 | $SuccessfulCount++; |
637 | } |
638 | } |
639 | } |
640 | |
641 | sub filedate |
642 | { |
643 | local($filename) = @_; |
644 | local($filetime) = (stat($filename))[9]; |
645 | return 0 if !defined $filetime; |
646 | local($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($filetime); |
647 | return 0 if !defined $wday; |
648 | sprintf(qq/"%s, %02d-%s-%02d %02d:%02d:%02d GMT"/, |
649 | ("Sunday", "Monday", "Tuesdsy", "Wednesday", |
650 | "Thursday", "Friday", "Saturday")[$wday], |
651 | $mday, |
652 | ("Jan", "Feb", "Mar", "Apr", "May", "Jun", |
653 | "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[$mon], |
654 | $year, |
655 | $hour, |
656 | $min, |
657 | $sec); |
658 | } |
659 | |
660 | sub local_filename |
661 | { |
662 | local($filename) = @_; |
663 | $filename =~ s,/+$,,; ## remove any trailing slashes |
664 | $filename =~ s,.*/,,; ## remove any leading path |
665 | if ($filename eq '') { |
666 | ## empty -- pick a random name |
667 | $filename = "file0000"; |
668 | ## look for a free random name. |
669 | $filename++ while -f $filename; |
670 | } |
671 | $filename; |
672 | } |
673 | |
674 | sub set_output_file |
675 | { |
676 | local($filename) = @_; |
677 | if (!open(OUT, ">$filename")) { |
678 | &www'message(1, "$0: can't open [$filename] for output"); |
679 | } else { |
680 | open(SAVEOUT, ">>&STDOUT") || die "$!";; |
681 | open(STDOUT, ">>&OUT"); |
682 | } |
683 | } |
684 | |
685 | sub close_output_file |
686 | { |
687 | local($filename) = @_; |
688 | unless ($quiet) |
689 | { |
690 | local($note) = qq/"$filename" written/; |
691 | if (defined $error) { |
692 | $note .= " (possibly corrupt due to error above)"; |
693 | } |
694 | &www'message(1, "$note."); |
695 | } |
696 | close(STDOUT); |
697 | open(STDOUT, ">&SAVEOUT"); |
698 | } |
699 | |
700 | sub http_alarm |
701 | { |
702 | &www'message(1, "ERROR: $AlarmNote."); |
703 | exit($EXIT_timeout); ## the alarm doesn't seem to cause a waiting syscall to break? |
704 | # $HaveAlarm = 1; |
705 | } |
706 | |
707 | ## |
708 | ## Given the host, port, and path, and (for info only) real target, |
709 | ## fetch via HTTP. |
710 | ## |
711 | ## If there is a user and/or password, use that for Basic Authorization. |
712 | ## |
713 | ## If $timeout is nonzero, time out after that many seconds. |
714 | ## |
715 | sub fetch_via_http |
716 | { |
717 | local($host, $port, $path, $target, $user, $password, $timeout) = @_; |
718 | local(@options); |
719 | local($local_filename); |
720 | |
721 | ## |
722 | ## If we're posting, but -postfile was given, we need to interpret |
723 | ## the item in $path after '?' as a filename, and replace it with |
724 | ## the contents of the file. |
725 | ## |
726 | if ($postfile && $path =~ s/\?([\d\D]*)//) { |
727 | local($filename) = $1; |
728 | return("can't open [$filename] to POST") if !open(IN, "<$filename"); |
729 | local($/) = ''; ## want to suck up the whole file. |
730 | $path .= '?' . <IN>; |
731 | close(IN); |
732 | } |
733 | |
734 | $local_filename = &local_filename($path) |
735 | if $refresh || $nab || defined($reference_file); |
736 | $refresh = &filedate($local_filename) if $refresh; |
737 | $refresh = &filedate($reference_file) if defined($reference_file); |
738 | |
739 | push(@options, 'head') if $head; |
740 | push(@options, 'post') if $post; |
741 | push(@options, 'nofollow') unless $follow; |
742 | push(@options, ('retry') x 3); |
743 | push(@options, 'quiet') if $quiet; |
744 | push(@options, 'debug') if $debug; |
745 | push(@options, "ifmodifiedsince=$refresh") if $refresh; |
746 | |
747 | if (defined $password || defined $user) { |
748 | local($auth) = join(':', ($user || ''), ($password || '')); |
749 | push(@options, "authorization=$auth"); |
750 | } |
751 | |
752 | local($old_alarm); |
753 | if ($timeout) { |
754 | $old_alarm = $SIG{'ALRM'} || 'DEFAULT'; |
755 | $SIG{'ALRM'} = "main'http_alarm"; |
756 | # $HaveAlarm = 0; |
757 | $AlarmNote = "host $host"; |
758 | $AlarmNote .= ":$port" if $port != $www'default_port{'http'}; |
759 | $AlarmNote .= " timed out after $timeout second"; |
760 | $AlarmNote .= 's' if $timeout > 1; |
761 | alarm($timeout); |
762 | } |
763 | local($result, $memo, %info) = |
764 | &www'open_http_connection(*HTTP, $host,$port,$path,$target,@options); |
765 | |
766 | if ($timeout) { |
767 | alarm(0); |
768 | $SIG{'ALRM'} = $old_alarm; |
769 | } |
770 | |
771 | # if ($HaveAlarm) { |
772 | # close(HTTP); |
773 | # $error = "timeout after $timeout second"; |
774 | # $error .= "s" if $timeout > 1; |
775 | # return $error; |
776 | # } |
777 | |
778 | if ($follow && ($result eq 'follow')) { |
779 | %circref = %hold_circref; |
780 | $circref{$memo} = 1; |
781 | unshift(@todo, $memo); |
782 | return undef; |
783 | } |
784 | |
785 | |
786 | return $memo if $result eq 'error'; |
787 | if (!$quiet && $result eq 'status' && ! -t STDOUT) { |
788 | #&www'message(1, "Warning: $memo"); |
789 | $error = "Warning: $memo"; |
790 | } |
791 | |
792 | if ($info{'CODE'} == 304) { ## 304 is magic for "Not Modified" |
793 | close(HTTP); |
794 | &www'message(1, "$URL: Not Modified") unless $quiet; |
795 | $NotModifiedCount++; |
796 | return undef; ## no error |
797 | } |
798 | |
799 | |
800 | &set_output_file($local_filename) if $nab; |
801 | |
802 | unless($strip) { |
803 | print $info{'STATUS'}, "\n", $info{'HEADER'}, "\n"; |
804 | |
805 | print SAVEOUT $info{'STATUS'}, "\n", $info{'HEADER'}, "\n" if $nab==2; |
806 | } |
807 | |
808 | if (defined $info{'BODY'}) { |
809 | print $info{'BODY'}; |
810 | print SAVEOUT $info{'BODY'} if $nab==2; |
811 | } |
812 | |
813 | if (!$head) { |
814 | &general_read(*HTTP, $info{'content-length'}); |
815 | } |
816 | close(HTTP); |
817 | &close_output_file($local_filename) if $nab; |
818 | |
819 | $error; ## will be 'undef' if no error; |
820 | } |
821 | |
822 | sub fetch_via_ftp |
823 | { |
824 | local($host, $port, $path, $target, $user, $password, $timeout) = @_; |
825 | local($local_filename) = &local_filename($path); |
826 | local($ftp_debug) = $debug; |
827 | local(@password) = ($password); |
828 | $path =~ s,^/,,; ## remove a leading / from the path. |
829 | $path = '.' if $path eq ''; ## make sure we have something |
830 | |
831 | if (!defined $user) { |
832 | $user = 'anonymous'; |
833 | $password = $ENV{'USER'} || 'WWWuser'; |
834 | @password = ($password.'@'. &network'addr_to_ascii(&network'my_addr), |
835 | $password.'@'); |
836 | } elsif (!defined $password) { |
837 | @password = (""); |
838 | } |
839 | |
840 | local($_last_ftp_reply, $_passive_host, $_passive_port); |
841 | local($size); |
842 | |
843 | sub _ftp_get_reply |
844 | { |
845 | local($text) = scalar(<FTP_CONTROL>); |
846 | die "lost connection to $host\n" if !defined $text; |
847 | local($_, $tmp); |
848 | print STDERR "READ: $text" if $ftp_debug; |
849 | die "internal error: expected reply code in response from ". |
850 | "ftp server [$text]" unless $text =~ s/^(\d+)([- ])//; |
851 | local($code) = $1; |
852 | if ($2 eq '-') { |
853 | while (<FTP_CONTROL>) { |
854 | ($tmp = $_) =~ s/^\d+[- ]//; |
855 | $text .= $tmp; |
856 | last if m/^$code /; |
857 | } |
858 | } |
859 | $text =~ s/^\d+ ?/<foo>/g; |
860 | ($code, $text); |
861 | } |
862 | |
863 | sub _ftp_expect |
864 | { |
865 | local($code, $text) = &_ftp_get_reply; |
866 | $_last_ftp_reply = $text; |
867 | foreach $expect (@_) { |
868 | return ($code, $text) if $code == $expect; |
869 | } |
870 | die "internal error: expected return code ". |
871 | join('|',@_).", got [$text]"; |
872 | } |
873 | |
874 | sub _ftp_send |
875 | { |
876 | print STDERR "SEND: ", @_ if $ftp_debug; |
877 | print FTP_CONTROL @_; |
878 | } |
879 | |
880 | sub _ftp_do_passive |
881 | { |
882 | local(@commands) = @_; |
883 | |
884 | &_ftp_send("PASV\r\n"); |
885 | local($code) = &_ftp_expect(227, 125); |
886 | |
887 | if ($code == 227) |
888 | { |
889 | die "internal error: can't grok passive reply [$_last_ftp_reply]" |
890 | unless $_last_ftp_reply =~ m/\(([\d,]+)\)/; |
891 | local($a,$b,$c,$d, $p1, $p2) = split(/,/, $1); |
892 | ($_passive_host, $_passive_port) = |
893 | ("$a.$b.$c.$d", $p1*256 + $p2); |
894 | } |
895 | |
896 | foreach(@commands) { |
897 | &_ftp_send($_); |
898 | } |
899 | |
900 | local($error)= |
901 | &network'connect_to(*PASSIVE, $_passive_host, $_passive_port); |
902 | die "internal error: passive ftp connect [$error]" if $error; |
903 | } |
904 | |
905 | ## make the connection to the host |
906 | &www'message($debug, "connecting to $host...") unless $quiet; |
907 | |
908 | local($old_alarm); |
909 | if ($timeout) { |
910 | $old_alarm = $SIG{'ALRM'} || 'DEFAULT'; |
911 | $SIG{'ALRM'} = "main'http_alarm"; ## can use this for now |
912 | # $HaveAlarm = 0; |
913 | $AlarmNote = "host $host"; |
914 | $AlarmNote .= ":$port" if $port != $www'default_port{'ftp'}; |
915 | $AlarmNote .= " timed out after $timeout second"; |
916 | $AlarmNote .= 's' if $timeout > 1; |
917 | alarm($timeout); |
918 | } |
919 | |
920 | local($error) = &network'connect_to(*FTP_CONTROL, $host, $port); |
921 | |
922 | if ($timeout) { |
923 | alarm(0); |
924 | $SIG{'ALRM'} = $old_alarm; |
925 | } |
926 | |
927 | return $error if $error; |
928 | |
929 | local ($code, $text) = &_ftp_get_reply(*FTP_CONTROL); |
930 | close(FTP_CONTROL), return "internal ftp error: [$text]" unless $code==220; |
931 | |
932 | ## log in |
933 | &www'message($debug, "logging in as $user...") unless $quiet; |
934 | foreach $password (@password) |
935 | { |
936 | &_ftp_send("USER $user\r\n"); |
937 | ($code, $text) = &_ftp_expect(230,331,530); |
938 | close(FTP_CONTROL), return $text if ($code == 530); |
939 | last if $code == 230; ## hey, already logged in, cool. |
940 | |
941 | &_ftp_send("PASS $password\r\n"); |
942 | ($code, $text) = &_ftp_expect(220,230,530,550,332); |
943 | last if $code != 550; |
944 | last if $text =~ m/can't change directory/; |
945 | } |
946 | |
947 | if ($code == 550) |
948 | { |
949 | $text =~ s/\n+$//; |
950 | &www'message(1, "Can't log in $host: $text") unless $quiet; |
951 | exit($EXIT_error); |
952 | } |
953 | |
954 | if ($code == 332) |
955 | { |
956 | &_ftp_send("ACCT noaccount\r\n"); |
957 | ($code, $text) = &_ftp_expect(230, 202, 530, 500,501,503, 421) |
958 | } |
959 | close(FTP_CONTROL), return $text if $code >= 300; |
960 | |
961 | &_ftp_send("TYPE I\r\n"); |
962 | &_ftp_expect(200); |
963 | |
964 | unless ($quiet) { |
965 | local($name) = $path; |
966 | $name =~ s,.*/([^/]),$1,; |
967 | &www'message($debug, "requesting $name..."); |
968 | } |
969 | ## get file |
970 | &_ftp_do_passive("RETR $path\r\n"); |
971 | ($code,$text) = &_ftp_expect(125, 150, 550, 530); |
972 | close(FTP_CONTROL), return $text if $code == 530; |
973 | |
974 | if ($code == 550) |
975 | { |
976 | close(PASSIVE); |
977 | if ($text =~ /directory/i) { |
978 | ## probably from "no such file or directory", so just return now. |
979 | close(FTP_CONTROL); |
980 | return $text; |
981 | } |
982 | |
983 | ## do like Mosaic and try getting a directory listing. |
984 | &_ftp_send("CWD $path\r\n"); |
985 | ($code) = &_ftp_expect(250,550); |
986 | if ($code == 550) { |
987 | close(FTP_CONTROL); |
988 | return $text; |
989 | } |
990 | &_ftp_do_passive("LIST\r\n"); |
991 | &_ftp_expect(125, 150); |
992 | } |
993 | |
994 | $size = $1 if $text =~ m/(\d+)\s+bytes/; |
995 | binmode(PASSIVE); ## just in case. |
996 | &www'message($debug, "waiting for data...") unless $quiet; |
997 | &set_output_file($local_filename) if $nab; |
998 | &general_read(*PASSIVE, $size); |
999 | &close_output_file($local_filename) if $nab; |
1000 | |
1001 | close(PASSIVE); |
1002 | close(FTP_CONTROL); |
1003 | undef; |
1004 | } |
1005 | |
1006 | sub general_read |
1007 | { |
1008 | local(*INPUT, $size) = @_; |
1009 | local($lastcount, $bytes) = (0,0); |
1010 | local($need_to_clear) = 0; |
1011 | local($start_time) = time; |
1012 | local($last_time, $time) = $start_time; |
1013 | ## Figure out how often to print the "bytes read" message |
1014 | local($delta2print) = |
1015 | (defined $size) ? int($size/50) : $defaultdelta2print; |
1016 | |
1017 | &www'message(0, "read 0 bytes") unless $quiet; |
1018 | |
1019 | ## so $! below is set only if a real error happens from now |
1020 | eval 'local($^W) = 0; undef $!'; |
1021 | |
1022 | |
1023 | while (defined($_ = <INPUT>)) |
1024 | { |
1025 | ## shove it out. |
1026 | &www'clear_message if $need_to_clear; |
1027 | print; |
1028 | print SAVEOUT if $nab==2; |
1029 | |
1030 | ## if we know the content-size, keep track of what we're reading. |
1031 | $bytes += length; |
1032 | |
1033 | last if eof || (defined $size && $bytes >= $size); |
1034 | |
1035 | if (!$quiet && $bytes > ($lastcount + $delta2print)) |
1036 | { |
1037 | if ($time = time, $last_time == $time) { |
1038 | $delta2print *= 1.5; |
1039 | } else { |
1040 | $last_time = $time; |
1041 | $lastcount = $bytes; |
1042 | local($time_delta) = $time - $start_time; |
1043 | local($text); |
1044 | |
1045 | $delta2print /= $time_delta; |
1046 | if (defined $size) { |
1047 | $text = sprintf("read $bytes bytes (%.0f%%)", |
1048 | $bytes*100/$size); |
1049 | } else { |
1050 | $text = "read $bytes bytes"; |
1051 | } |
1052 | |
1053 | if ($time_delta > 5 || ($time_delta && $bytes > 10240)) |
1054 | { |
1055 | local($rate) = int($bytes / $time_delta); |
1056 | if ($rate < 5000) { |
1057 | $text .= " ($rate bytes/sec)"; |
1058 | } elsif ($rate < 1024 * 10) { |
1059 | $text .= sprintf(" (%.1f k/sec)", $rate/1024); |
1060 | } else { |
1061 | $text .= sprintf(" (%.0f k/sec)", $rate/1024); |
1062 | } |
1063 | } |
1064 | &www'message(0, "$text..."); |
1065 | $need_to_clear = -t STDOUT; |
1066 | } |
1067 | } |
1068 | } |
1069 | |
1070 | if (!$quiet) |
1071 | { |
1072 | if ($size && ($size != $bytes)) { |
1073 | &www'message("WARNING: Expected $size bytes, read $bytes bytes.\n"); |
1074 | } |
1075 | # if ($!) { |
1076 | # print STDERR "\$! is [$!]\n"; |
1077 | # } |
1078 | # if ($@) { |
1079 | # print STDERR "\$\@ is [$@]\n"; |
1080 | # } |
1081 | } |
1082 | &www'clear_message($text) unless $quiet; |
1083 | } |
1084 | |
1085 | sub dummy { |
1086 | 1 || &dummy || &fetch_via_ftp || &fetch_via_http || &http_alarm; |
1087 | 1 || close(OUT); |
1088 | 1 || close(SAVEOUT); |
1089 | } |
1090 | |
1091 | __END__ |