Commit | Line | Data |
0a753a76 |
1 | ## |
2 | ## Jeffrey Friedl (jfriedl@omron.co.jp) |
3 | ## Copyri.... ah hell, just take it. |
4 | ## |
5 | ## This is "www.pl". |
6 | ## Include (require) to use, execute ("perl www.pl") to print a man page. |
7 | ## Requires my 'network.pl' library. |
8 | package www; |
9 | $version = "951219.9"; |
10 | |
11 | ## |
12 | ## 951219.9 |
13 | ## -- oops, stopped sending garbage Authorization line when no |
14 | ## authorization was requested. |
15 | ## |
16 | ## 951114.8 |
17 | ## -- added support for HEAD, If-Modified-Since |
18 | ## |
19 | ## 951017.7 |
20 | ## -- Change to allow a POST'ed HTTP text to have newlines in it. |
21 | ## Added 'NewURL to the open_http_connection %info. Idea courtesy |
22 | ## of Bryan Schmersal (http://www.transarc.com/~bryans/Home.html). |
23 | ## |
24 | ## |
25 | ## 950921.6 |
26 | ## -- added more robust HTTP error reporting |
27 | ## (due to steven_campbell@uk.ibm.com) |
28 | ## |
29 | ## 950911.5 |
30 | ## -- added Authorization support |
31 | ## |
32 | |
33 | ## |
34 | ## HTTP return status codes. |
35 | ## |
36 | %http_return_code = |
37 | (200,"OK", |
38 | 201,"Created", |
39 | 202,"Accepted", |
40 | 203,"Partial Information", |
41 | 204,"No Response", |
42 | 301,"Moved", |
43 | 302,"Found", |
44 | 303,"Method", |
45 | 304,"Not modified", |
46 | 400,"Bad request", |
47 | 401,"Unauthorized", |
48 | 402,"Payment required", |
49 | 403,"Forbidden", |
50 | 404,"Not found", |
51 | 500,"Internal error", |
52 | 501,"Not implemented", |
53 | 502,"Service temporarily overloaded", |
54 | 503,"Gateway timeout"); |
55 | |
56 | ## |
57 | ## If executed directly as a program, print as a man page. |
58 | ## |
59 | if (length($0) >= 6 && substr($0, -6) eq 'www.pl') |
60 | { |
61 | seek(DATA, 0, 0) || die "$0: can't reset internal pointer.\n"; |
62 | print "www.pl version $version\n", '=' x 60, "\n"; |
63 | while (<DATA>) { |
64 | next unless /^##>/../^##</; ## select lines to print |
65 | s/^##[<> ]?//; ## clean up |
66 | print; |
67 | } |
68 | exit(0); |
69 | } |
70 | |
71 | ## |
72 | ## History: |
73 | ## version 950425.4 |
74 | ## added require for "network.pl" |
75 | ## |
76 | ## version 950425.3 |
77 | ## re-did from "Www.pl" which was a POS. |
78 | ## |
79 | ## |
80 | ## BLURB: |
81 | ## A group of routines for dealing with URLs, HTTP sessions, proxies, etc. |
82 | ## Requires my 'network.pl' package. The library file can be executed |
83 | ## directly to produce a man page. |
84 | |
85 | ##> |
86 | ## A motley group of routines for dealing with URLs, HTTP sessions, proxies, |
87 | ## etc. Requires my 'network.pl' package. |
88 | ## |
89 | ## Latest version, as well as other stuff (including network.pl) available |
90 | ## at http://www.wg.omron.co.jp/~jfriedl/perl/ |
91 | ## |
92 | ## Simpleton complete program to dump a URL given on the command-line: |
93 | ## |
94 | ## require 'network.pl'; ## required for www.pl |
95 | ## require 'www.pl'; ## main routines |
96 | ## $URL = shift; ## get URL |
97 | ## ($status, $memo) = &www'open_http_url(*IN, $URL); ## connect |
98 | ## die "$memo\n" if $status ne 'ok'; ## report any error |
99 | ## print while <IN>; ## dump contents |
100 | ## |
101 | ## There are various options available for open_http_url. |
102 | ## For example, adding 'quiet' to the call, i.e. vvvvvvv-----added |
103 | ## ($status, $memo) = &www'open_http_url(*IN, $URL, 'quiet'); |
104 | ## suppresses the normal informational messages such as "waiting for data...". |
105 | ## |
106 | ## The options, as well as the various other public routines in the package, |
107 | ## are discussed below. |
108 | ## |
109 | ##< |
110 | |
111 | ## |
112 | ## Default port for the protocols whose URL we'll at least try to recognize. |
113 | ## |
114 | %default_port = ('http', 80, |
115 | 'ftp', 21, |
116 | 'gopher', 70, |
117 | 'telnet', 23, |
118 | 'wais', 210, |
119 | ); |
120 | |
121 | ## |
122 | ## A "URL" to "ftp.blah.com" without a protocol specified is probably |
123 | ## best reached via ftp. If the hostname begins with a protocol name, it's |
124 | ## easy. But something like "www." maps to "http", so that mapping is below: |
125 | ## |
126 | %name2protocol = ( |
127 | 'www', 'http', |
128 | 'wwwcgi','http', |
129 | ); |
130 | |
131 | $last_message_length = 0; |
132 | $useragent = "www.pl/$version"; |
133 | |
134 | ## |
135 | ##> |
136 | ############################################################################## |
137 | ## routine: open_http_url |
138 | ## |
139 | ## Used as |
140 | ## ($status, $memo, %info) = &www'open_http_url(*FILEHANDLE, $URL, options..) |
141 | ## |
142 | ## Given an unused filehandle, a URL, and a list of options, opens a socket |
143 | ## to the URL and returns with the filehandle ready to read the data of the |
144 | ## URL. The HTTP header, as well as other information, is returned in %info. |
145 | ## |
146 | ## OPTIONS are from among: |
147 | ## |
148 | ## "post" |
149 | ## If PATH appears to be a query (i.e. has a ? in it), contact |
150 | ## via a POST rather than a GET. |
151 | ## |
152 | ## "nofollow" |
153 | ## Normally, if the initial contact indicates that the URL has moved |
154 | ## to a different location, the new location is automatically contacted. |
155 | ## "nofollow" inhibits this. |
156 | ## |
157 | ## "noproxy" |
158 | ## Normally, a proxy will be used if 'http_proxy' is defined in the |
159 | ## environment. This option inhibits the use of a proxy. |
160 | ## |
161 | ## "retry" |
162 | ## If a host's address can't be found, it may well be because the |
163 | ## nslookup just didn't return in time and that retrying the lookup |
164 | ## after a few seconds will succeed. If this option is given, will |
165 | ## wait five seconds and try again. May be given multiple times to |
166 | ## retry multiple times. |
167 | ## |
168 | ## "quiet" |
169 | ## Informational messages will be suppressed. |
170 | ## |
171 | ## "debug" |
172 | ## Additional messages will be printed. |
173 | ## |
174 | ## "head" |
175 | ## Requests only the file header to be sent |
176 | ## |
177 | ## |
178 | ## |
179 | ## |
180 | ## The return array is ($STATUS, $MEMO, %INFO). |
181 | ## |
182 | ## STATUS is 'ok', 'error', 'status', or 'follow' |
183 | ## |
184 | ## If 'error', the MEMO will indicate why (URL was not http, can't |
185 | ## connect, etc.). INFO is probably empty, but may have some data. |
186 | ## See below. |
187 | ## |
188 | ## If 'status', the connnection was made but the reply was not a normal |
189 | ## "OK" successful reply (i.e. "Not found", etc.). MEMO is a note. |
190 | ## INFO is filled as noted below. Filehandle is ready to read (unless |
191 | ## $info{'BODY'} is filled -- see below), but probably most useful |
192 | ## to treat this as an 'error' response. |
193 | ## |
194 | ## If 'follow', MEMO is the new URL (for when 'nofollow' was used to |
195 | ## turn off automatic following) and INFO is filled as described |
196 | ## below. Unless you wish to give special treatment to these types of |
197 | ## responses, you can just treat 'follow' responses like 'ok' |
198 | ## responses. |
199 | ## |
200 | ## If 'ok', the connection went well and the filehandle is ready to |
201 | ## read. |
202 | ## |
203 | ## INFO contains data as described at the read_http_header() function (in |
204 | ## short, the HTTP response header) and additional informational fields. |
205 | ## In addition, the following fields are filled in which describe the raw |
206 | ## connection made or attempted: |
207 | ## |
208 | ## PROTOCOL, HOST, PORT, PATH |
209 | ## |
210 | ## Note that if a proxy is being used, these will describe the proxy. |
211 | ## The field TARGET will describe the host or host:port ultimately being |
212 | ## contacted. When no proxy is being used, this will be the same info as |
213 | ## in the raw connection fields above. However, if a proxy is being used, |
214 | ## it will refer to the final target. |
215 | ## |
216 | ## In some cases, the additional entry $info{'BODY'} exists as well. If |
217 | ## the result-code indicates an error, the body of the message may be |
218 | ## parsed for internal reasons (i.e. to support 'repeat'), and if so, it |
219 | ## will be saved in $info{'BODY}. |
220 | ## |
221 | ## If the URL has moved, $info{'NewURL'} will exist and contain the new |
222 | ## URL. This will be true even if the 'nofollow' option is specified. |
223 | ## |
224 | ##< |
225 | ## |
226 | sub open_http_url |
227 | { |
228 | local(*HTTP, $URL, @options) = @_; |
229 | return &open_http_connection(*HTTP, $URL, undef, undef, undef, @options); |
230 | } |
231 | |
232 | |
233 | ## |
234 | ##> |
235 | ############################################################################## |
236 | ## routine: read_http_header |
237 | ## |
238 | ## Given a filehandle to a just-opened HTTP socket connection (such as one |
239 | ## created via &network'connect_to which has had the HTTP request sent), |
240 | ## reads the HTTP header and and returns the parsed info. |
241 | ## |
242 | ## ($replycode, %info) = &read_http_header(*FILEHANDLE); |
243 | ## |
244 | ## $replycode will be the HTTP reply code as described below, or |
245 | ## zero on header-read error. |
246 | ## |
247 | ## %info contains two types of fields: |
248 | ## |
249 | ## Upper-case fields are informational from the function. |
250 | ## Lower-case fields are the header field/value pairs. |
251 | ## |
252 | ## Upper-case fields: |
253 | ## |
254 | ## $info{'STATUS'} will be the first line read (HTTP status line) |
255 | ## |
256 | ## $info{'CODE'} will be the numeric HTTP reply code from that line. |
257 | ## This is also returned as $replycode. |
258 | ## |
259 | ## $info{'TYPE'} is the text from the status line that follows CODE. |
260 | ## |
261 | ## $info{'HEADER'} will be the raw text of the header (sans status line), |
262 | ## newlines and all. |
263 | ## |
264 | ## $info{'UNKNOWN'}, if defined, will be any header lines not in the |
265 | ## field/value format used to fill the lower-case fields of %info. |
266 | ## |
267 | ## Lower-case fields are reply-dependent, but in general are described |
268 | ## in http://info.cern.ch/hypertext/WWW/Protocols/HTTP/Object_Headers.html |
269 | ## |
270 | ## A header line such as |
271 | ## Content-type: Text/Plain |
272 | ## will appear as $info{'content-type'} = 'Text/Plain'; |
273 | ## |
274 | ## (*) Note that while the field names are are lower-cased, the field |
275 | ## values are left as-is. |
276 | ## |
277 | ## |
278 | ## When $replycode is zero, there are two possibilities: |
279 | ## $info{'TYPE'} is 'empty' |
280 | ## No response was received from the filehandle before it was closed. |
281 | ## No other %info fields present. |
282 | ## $info{'TYPE'} is 'unknown' |
283 | ## First line of the response doesn't seem to be proper HTTP. |
284 | ## $info{'STATUS'} holds that line. No other %info fields present. |
285 | ## |
286 | ## The $replycode, when not zero, is as described at |
287 | ## http://info.cern.ch/hypertext/WWW/Protocols/HTTP/HTRESP.html |
288 | ## |
289 | ## Some of the codes: |
290 | ## |
291 | ## success 2xx |
292 | ## ok 200 |
293 | ## created 201 |
294 | ## accepted 202 |
295 | ## partial information 203 |
296 | ## no response 204 |
297 | ## redirection 3xx |
298 | ## moved 301 |
299 | ## found 302 |
300 | ## method 303 |
301 | ## not modified 304 |
302 | ## error 4xx, 5xx |
303 | ## bad request 400 |
304 | ## unauthorized 401 |
305 | ## paymentrequired 402 |
306 | ## forbidden 403 |
307 | ## not found 404 |
308 | ## internal error 500 |
309 | ## not implemented 501 |
310 | ## service temporarily overloaded 502 |
311 | ## gateway timeout 503 |
312 | ## |
313 | ##< |
314 | ## |
315 | sub read_http_header |
316 | { |
317 | local(*HTTP) = @_; |
318 | local(%info, $_); |
319 | |
320 | ## |
321 | ## The first line of the response will be the status (OK, error, etc.) |
322 | ## |
323 | unless (defined($info{'STATUS'} = <HTTP>)) { |
324 | $info{'TYPE'} = "empty"; |
325 | return (0, %info); |
326 | } |
327 | chop $info{'STATUS'}; |
328 | |
329 | ## |
330 | ## Check the status line. If it doesn't match and we don't know the |
331 | ## format, we'll just let it pass and hope for the best. |
332 | ## |
333 | unless ($info{'STATUS'} =~ m/^HTTP\S+\s+(\d\d\d)\s+(.*\S)/i) { |
334 | $info{'TYPE'} = 'unknown'; |
335 | return (0, %info); |
336 | } |
337 | |
338 | $info{'CODE'} = $1; |
339 | $info{'TYPE'} = $2; |
340 | $info{'HEADER'} = ''; |
341 | |
342 | ## read the rest of the header. |
343 | while (<HTTP>) { |
344 | last if m/^\s*$/; |
345 | $info{'HEADER'} .= $_; ## save whole text of header. |
346 | |
347 | if (m/^([^\n:]+):[ \t]*(.*\S)/) { |
348 | local($field, $value) = ("\L$1", $2); |
349 | if (defined $info{$field}) { |
350 | $info{$field} .= "\n" . $value; |
351 | } else { |
352 | $info{$field} = $value; |
353 | } |
354 | } elsif (defined $info{'UNKNOWN'}) { |
355 | $info{'UNKNOWN'} .= $_; |
356 | } else { |
357 | $info{'UNKNOWN'} = $_; |
358 | } |
359 | } |
360 | |
361 | return ($info{'CODE'}, %info); |
362 | } |
363 | |
364 | ## |
365 | ##> |
366 | ## |
367 | ############################################################################## |
368 | ## routine: grok_URL(URL, noproxy, defaultprotocol) |
369 | ## |
370 | ## Given a URL, returns access information. Deals with |
371 | ## http, wais, gopher, ftp, and telnet |
372 | ## URLs. |
373 | ## |
374 | ## Information returned is |
375 | ## (PROTOCOL, HOST, PORT, PATH, TARGET, USER, PASSWORD) |
376 | ## |
377 | ## If noproxy is not given (or false) and there is a proxy defined |
378 | ## for the given protocol (via the "*_proxy" environmental variable), |
379 | ## the returned access information will be for the proxy and will |
380 | ## reference the given URL. In this case, 'TARGET' will be the |
381 | ## HOST:PORT of the original URL (PORT elided if it's the default port). |
382 | ## |
383 | ## Access information returned: |
384 | ## PROTOCOL: "http", "ftp", etc. (guaranteed to be lowercase). |
385 | ## HOST: hostname or address as given. |
386 | ## PORT: port to access |
387 | ## PATH: path of resource on HOST:PORT. |
388 | ## TARGET: (see above) |
389 | ## USER and PASSWORD: for 'ftp' and 'telnet' URLs, if supplied by the |
390 | ## URL these will be defined, undefined otherwise. |
391 | ## |
392 | ## If no protocol is defined via the URL, the defaultprotocol will be used |
393 | ## if given. Otherwise, the URL's address will be checked for a leading |
394 | ## protocol name (as with a leading "www.") and if found will be used. |
395 | ## Otherwise, the protocol defaults to http. |
396 | ## |
397 | ## Fills in the appropriate default port for the protocol if need be. |
398 | ## |
399 | ## A proxy is defined by a per-protocol environmental variable such |
400 | ## as http_proxy. For example, you might have |
401 | ## setenv http_proxy http://firewall:8080/ |
402 | ## setenv ftp_proxy $http_proxy |
403 | ## to set it up. |
404 | ## |
405 | ## A URL seems to be officially described at |
406 | ## http://www.w3.org/hypertext/WWW/Addressing/URL/5_BNF.html |
407 | ## although that document is a joke of errors. |
408 | ## |
409 | ##< |
410 | ## |
411 | sub grok_URL |
412 | { |
413 | local($_, $noproxy, $defaultprotocol) = @_; |
414 | $noproxy = defined($noproxy) && $noproxy; |
415 | |
416 | ## Items to be filled in and returned. |
417 | local($protocol, $address, $port, $path, $target, $user, $password); |
418 | |
419 | return undef unless m%^(([a-zA-Z]+)://|/*)([^/]+)(/.*)?$%; |
420 | |
421 | ## |
422 | ## Due to a bug in some versions of perl5, $2 might not be empty |
423 | ## even if $1 is. Therefore, we must check $1 for a : to see if the |
424 | ## protocol stuff matched or not. If not, the protocol is undefined. |
425 | ## |
426 | ($protocol, $address, $path) = ((index($1,":") >= 0 ? $2 : undef), $3, $4); |
427 | |
428 | if (!defined $protocol) |
429 | { |
430 | ## |
431 | ## Choose a default protocol if none given. If address begins with |
432 | ## a protocol name (one that we know via %name2protocol or |
433 | ## %default_port), choose it. Otherwise, choose http. |
434 | ## |
435 | if (defined $defaultprotocol) { |
436 | $protocol = $defaultprotocol; |
437 | } |
438 | else |
439 | { |
440 | $address =~ m/^[a-zA-Z]+/; |
441 | if (defined($name2protocol{"\L$&"})) { |
442 | $protocol = $name2protocol{"\L$&"}; |
443 | } else { |
444 | $protocol = defined($default_port{"\L$&"}) ? $& : 'http'; |
445 | } |
446 | } |
447 | } |
448 | $protocol =~ tr/A-Z/a-z/; ## ensure lower-case. |
449 | |
450 | ## |
451 | ## Http support here probably not kosher, but fits in nice for basic |
452 | ## authorization. |
453 | ## |
454 | if ($protocol eq 'ftp' || $protocol eq 'telnet' || $protocol eq 'http') |
455 | { |
456 | ## Glean a username and password from address, if there. |
457 | ## There if address starts with USER[:PASSWORD]@ |
458 | if ($address =~ s/^(([^\@:]+)(:([^@]+))?\@)//) { |
459 | ($user, $password) = ($2, $4); |
460 | } |
461 | } |
462 | |
463 | ## |
464 | ## address left is (HOSTNAME|HOSTNUM)[:PORTNUM] |
465 | ## |
466 | if ($address =~ s/:(\d+)$//) { |
467 | $port = $1; |
468 | } else { |
469 | $port = $default_port{$protocol}; |
470 | } |
471 | |
472 | ## default path is '/'; |
473 | $path = '/' if !defined $path; |
474 | |
475 | ## |
476 | ## If there's a proxy and we're to proxy this request, do so. |
477 | ## |
478 | local($proxy) = $ENV{$protocol."_proxy"}; |
479 | if (!$noproxy && defined($proxy) && !&no_proxy($protocol,$address)) |
480 | { |
481 | local($dummy); |
482 | local($old_pass, $old_user); |
483 | |
484 | ## |
485 | ## Since we're going through a proxy, we want to send the |
486 | ## proxy the entire URL that we want. However, when we're |
487 | ## doing Authenticated HTTP, we need to take out the user:password |
488 | ## that webget has encoded in the URL (this is a bit sleazy on |
489 | ## the part of webget, but the alternative is to have flags, and |
490 | ## having them part of the URL like with FTP, etc., seems a bit |
491 | ## cleaner to me in the context of how webget is used). |
492 | ## |
493 | ## So, if we're doing this slezy thing, we need to construct |
494 | ## the new URL from the compnents we have now (leaving out password |
495 | ## and user), decode the proxy URL, then return the info for |
496 | ## that host, a "filename" of the entire URL we really want, and |
497 | ## the user/password from the original URL. |
498 | ## |
499 | ## For all other things, we can just take the original URL, |
500 | ## ensure it has a protocol on it, and pass it as the "filename" |
501 | ## we want to the proxy host. The difference between reconstructing |
502 | ## the URL (as for HTTP Authentication) and just ensuring the |
503 | ## protocol is there is, except for the user/password stuff, |
504 | ## nothing. In theory, at least. |
505 | ## |
506 | if ($protocol eq 'http' && (defined($password) || defined($user))) |
507 | { |
508 | $path = "http://$address$path"; |
509 | $old_pass = $password; |
510 | $old_user = $user; |
511 | } else { |
512 | ## Re-get original URL and ensure protocol// actually there. |
513 | ## This will become our new path. |
514 | ($path = $_) =~ s,^($protocol:)?/*,$protocol://,i; |
515 | } |
516 | |
517 | ## note what the target will be |
518 | $target = ($port==$default_port{$protocol})?$address:"$address:$port"; |
519 | |
520 | ## get proxy info, discarding |
521 | ($protocol, $address, $port, $dummy, $dummy, $user, $password) |
522 | = &grok_URL($proxy, 1); |
523 | $password = $old_pass if defined $old_pass; |
524 | $user = $old_user if defined $old_user; |
525 | } |
526 | ($protocol, $address, $port, $path, $target, $user, $password); |
527 | } |
528 | |
529 | |
530 | |
531 | ## |
532 | ## &no_proxy($protocol, $host) |
533 | ## |
534 | ## Returns true if the specified host is identified in the no_proxy |
535 | ## environmental variable, or identify the proxy server itself. |
536 | ## |
537 | sub no_proxy |
538 | { |
539 | local($protocol, $targethost) = @_; |
540 | local(@dests, $dest, $host, @hosts, $aliases); |
541 | local($proxy) = $ENV{$protocol."_proxy"}; |
542 | return 0 if !defined $proxy; |
543 | $targethost =~ tr/A-Z/a-z/; ## ensure all lowercase; |
544 | |
545 | @dests = ($proxy); |
546 | push(@dests,split(/\s*,\s*/,$ENV{'no_proxy'})) if defined $ENV{'no_proxy'}; |
547 | |
548 | foreach $dest (@dests) |
549 | { |
550 | ## just get the hostname |
551 | $host = (&grok_URL($dest, 1), 'http')[1]; |
552 | |
553 | if (!defined $host) { |
554 | warn "can't grok [$dest] from no_proxy env.var.\n"; |
555 | next; |
556 | } |
557 | @hosts = ($host); ## throw in original name just to make sure |
558 | ($host, $aliases) = (gethostbyname($host))[0, 1]; |
559 | |
560 | if (defined $aliases) { |
561 | push(@hosts, ($host, split(/\s+/, $aliases))); |
562 | } else { |
563 | push(@hosts, $host); |
564 | } |
565 | foreach $host (@hosts) { |
566 | next if !defined $host; |
567 | return 1 if "\L$host" eq $targethost; |
568 | } |
569 | } |
570 | return 0; |
571 | } |
572 | |
573 | sub ensure_proper_network_library |
574 | { |
575 | require 'network.pl' if !defined $network'version; |
576 | warn "WARNING:\n". __FILE__ . |
577 | qq/ needs a newer version of "network.pl"\n/ if |
578 | !defined($network'version) || $network'version < "950311.5"; |
579 | } |
580 | |
581 | |
582 | |
583 | ## |
584 | ##> |
585 | ############################################################################## |
586 | ## open_http_connection(*FILEHANDLE, HOST, PORT, PATH, TARGET, OPTIONS...) |
587 | ## |
588 | ## Opens an HTTP connection to HOST:PORT and requests PATH. |
589 | ## TARGET is used only for informational messages to the user. |
590 | ## |
591 | ## If PORT and PATH are undefined, HOST is taken as an http URL and TARGET |
592 | ## is filled in as needed. |
593 | ## |
594 | ## Otherwise, it's the same as open_http_url (including return value, etc.). |
595 | ##< |
596 | ## |
597 | sub open_http_connection |
598 | { |
599 | local(*HTTP, $host, $port, $path, $target, @options) = @_; |
600 | local($post_text, @error, %seen); |
601 | local(%info); |
602 | |
603 | &ensure_proper_network_library; |
604 | |
605 | ## options allowed: |
606 | local($post, $retry, $authorization, $nofollow, $noproxy, |
607 | $head, $debug, $ifmodifiedsince, $quiet, ) = (0) x 10; |
608 | ## parse options: |
609 | foreach $opt (@options) |
610 | { |
611 | next unless defined($opt) && $opt ne ''; |
612 | local($var, $val); |
613 | if ($opt =~ m/^(\w+)=(.*)/) { |
614 | ($var, $val) = ($1, $2); |
615 | } else { |
616 | $var = $opt; |
617 | $val = 1; |
618 | } |
619 | $var =~ tr/A-Z/a-z/; ## ensure variable is lowercase. |
620 | local(@error); |
621 | |
622 | eval "if (defined \$$var) { \$$var = \$val; } else { \@error = |
623 | ('error', 'bad open_http_connection option [$opt]'); }"; |
624 | return ('error', "open_http_connection eval: $@") if $@; |
625 | return @error if defined @error; |
626 | } |
627 | $quiet = 0 if $debug; ## debug overrides quiet |
628 | |
629 | local($protocol, $error, $code, $URL, %info, $tmp, $aite); |
630 | |
631 | ## |
632 | ## if both PORT and PATH are undefined, treat HOST as a URL. |
633 | ## |
634 | unless (defined($port) && defined($path)) |
635 | { |
636 | ($protocol,$host,$port,$path,$target)=&grok_URL($host,$noproxy,'http'); |
637 | if ($protocol ne "http") { |
638 | return ('error',"open_http_connection doesn't grok [$protocol]"); |
639 | } |
640 | unless (defined($host)) { |
641 | return ('error', "can't grok [$URL]"); |
642 | } |
643 | } |
644 | |
645 | return ('error', "no port in URL [$URL]") unless defined $port; |
646 | return ('error', "no path in URL [$URL]") unless defined $path; |
647 | |
648 | RETRY: while(1) |
649 | { |
650 | ## we'll want $URL around for error messages and such. |
651 | if ($port == $default_port{'http'}) { |
652 | $URL = "http://$host"; |
653 | } else { |
654 | $URL = "http://$host:$default_port{'http'}"; |
655 | } |
656 | $URL .= ord($path) eq ord('/') ? $path : "/$path"; |
657 | |
658 | $aite = defined($target) ? "$target via $host" : $host; |
659 | |
660 | &message($debug, "connecting to $aite ...") unless $quiet; |
661 | |
662 | ## |
663 | ## note some info that might be of use to the caller. |
664 | ## |
665 | local(%preinfo) = ( |
666 | 'PROTOCOL', 'http', |
667 | 'HOST', $host, |
668 | 'PORT', $port, |
669 | 'PATH', $path, |
670 | ); |
671 | if (defined $target) { |
672 | $preinfo{'TARGET'} = $target; |
673 | } elsif ($default_port{'http'} == $port) { |
674 | $preinfo{'TARGET'} = $host; |
675 | } else { |
676 | $preinfo{'TARGET'} = "$host:$port"; |
677 | } |
678 | |
679 | ## connect to the site |
680 | $error = &network'connect_to(*HTTP, $host, $port); |
681 | if (defined $error) { |
682 | return('error', "can't connect to $aite: $error", %preinfo); |
683 | } |
684 | |
685 | ## If we're asked to POST and it looks like a POST, note post text. |
686 | if ($post && $path =~ m/\?/) { |
687 | $post_text = $'; ## everything after the '?' |
688 | $path = $`; ## everything before the '?' |
689 | } |
690 | |
691 | ## send the POST or GET request |
692 | $tmp = $head ? 'HEAD' : (defined $post_text ? 'POST' : 'GET'); |
693 | |
694 | &message($debug, "sending request to $aite ...") if !$quiet; |
695 | print HTTP $tmp, " $path HTTP/1.0\n"; |
696 | |
697 | ## send the If-Modified-Since field if needed. |
698 | if ($ifmodifiedsince) { |
699 | print HTTP "If-Modified-Since: $ifmodifiedsince\n"; |
700 | } |
701 | |
702 | ## oh, let's sputter a few platitudes..... |
703 | print HTTP "Accept: */*\n"; |
704 | print HTTP "User-Agent: $useragent\n" if defined $useragent; |
705 | |
706 | ## If doing Authorization, do so now. |
707 | if ($authorization) { |
708 | print HTTP "Authorization: Basic ", |
709 | &htuu_encode($authorization), "\n"; |
710 | } |
711 | |
712 | ## If it's a post, send it. |
713 | if (defined $post_text) |
714 | { |
715 | print HTTP "Content-type: application/x-www-form-urlencoded\n"; |
716 | print HTTP "Content-length: ", length $post_text, "\n\n"; |
717 | print HTTP $post_text, "\n"; |
718 | } |
719 | print HTTP "\n"; |
720 | &message($debug, "waiting for data from $aite ...") unless $quiet; |
721 | |
722 | ## we can now read the response (header, then body) via HTTP. |
723 | binmode(HTTP); ## just in case. |
724 | |
725 | ($code, %info) = &read_http_header(*HTTP); |
726 | &message(1, "header returns code $code ($info{'TYPE'})") if $debug; |
727 | |
728 | ## fill in info from %preinfo |
729 | local($val, $key); |
730 | while (($val, $key) = each %preinfo) { |
731 | $info{$val} = $key; |
732 | } |
733 | |
734 | if ($code == 0) |
735 | { |
736 | return('error',"empty response for $URL") |
737 | if $info{'TYPE'} eq 'empty'; |
738 | return('error', "non-HTTP response for $URL", %info) |
739 | if $info{'TYPE'} eq 'unknown'; |
740 | return('error', "unknown zero-code for $URL", %info); |
741 | } |
742 | |
743 | if ($code == 302) ## 302 is magic for "Found" |
744 | { |
745 | if (!defined $info{'location'}) { |
746 | return('error', "No location info for Found URL $URL", %info); |
747 | } |
748 | local($newURL) = $info{'location'}; |
749 | |
750 | ## Remove :80 from hostname, if there. Looks ugly. |
751 | $newURL =~ s,^(http:/+[^/:]+):80/,$1/,i; |
752 | $info{"NewURL"} = $newURL; |
753 | |
754 | ## if we're not following links or if it's not to HTTP, return. |
755 | return('follow', $newURL, %info) if |
756 | $nofollow || $newURL!~m/^http:/i; |
757 | |
758 | ## note that we've seen this current URL. |
759 | $seen{$host, $port, $path} = 1; |
760 | |
761 | &message(1, qq/[note: now moved to "$newURL"]/) unless $quiet; |
762 | |
763 | |
764 | ## get the new one and return an error if it's been seen. |
765 | ($protocol, $host, $port, $path, $target) = |
766 | &www'grok_URL($newURL, $noproxy); |
767 | &message(1, "[$protocol][$host][$port][$path]") if $debug; |
768 | |
769 | if (defined $seen{$host, $port, $path}) |
770 | { |
771 | return('error', "circular reference among:\n ". |
772 | join("\n ", sort grep(/^http/i, keys %seen)), %seen); |
773 | } |
774 | next RETRY; |
775 | } |
776 | elsif ($code == 500) ## 500 is magic for "internal error" |
777 | { |
778 | ## |
779 | ## A proxy will often return this with text saying "can't find |
780 | ## host" when in reality it's just because the nslookup returned |
781 | ## null at the time. Such a thing should be retied again after a |
782 | ## few seconds. |
783 | ## |
784 | if ($retry) |
785 | { |
786 | local($_) = $info{'BODY'} = join('', <HTTP>); |
787 | if (/Can't locate remote host:\s*(\S+)/i) { |
788 | local($times) = ($retry == 1) ? |
789 | "once more" : "up to $retry more times"; |
790 | &message(0, "can't locate $1, will try $times ...") |
791 | unless $quiet; |
792 | sleep(5); |
793 | $retry--; |
794 | next RETRY; |
795 | } |
796 | } |
797 | } |
798 | |
799 | if ($code != 200) ## 200 is magic for "OK"; |
800 | { |
801 | ## I'll deal with these as I see them..... |
802 | &clear_message; |
803 | if ($info{'TYPE'} eq '') |
804 | { |
805 | if (defined $http_return_code{$code}) { |
806 | $info{'TYPE'} = $http_return_code{$code}; |
807 | } else { |
808 | $info{'TYPE'} = "(unknown status code $code)"; |
809 | } |
810 | } |
811 | return ('status', $info{'TYPE'}, %info); |
812 | } |
813 | |
814 | &clear_message; |
815 | return ('ok', 'ok', %info); |
816 | } |
817 | } |
818 | |
819 | |
820 | ## |
821 | ## Hyper Text UUencode. Somewhat different from regular uuencode. |
822 | ## |
823 | ## Logic taken from Mosaic for X code by Mark Riordan and Ari Luotonen. |
824 | ## |
825 | sub htuu_encode |
826 | { |
827 | local(@in) = unpack("C*", $_[0]); |
828 | local(@out); |
829 | |
830 | push(@in, 0, 0); ## in case we need to round off an odd byte or two |
831 | while (@in >= 3) { |
832 | ## |
833 | ## From the next three input bytes, |
834 | ## construct four encoded output bytes. |
835 | ## |
836 | push(@out, $in[0] >> 2); |
837 | push(@out, (($in[0] << 4) & 060) | (($in[1] >> 4) & 017)); |
838 | push(@out, (($in[1] << 2) & 074) | (($in[2] >> 6) & 003)); |
839 | push(@out, $in[2] & 077); |
840 | splice(@in, 0, 3); ## remove these three |
841 | } |
842 | |
843 | ## |
844 | ## @out elements are now indices to the string below. Convert to |
845 | ## the appropriate actual text. |
846 | ## |
847 | foreach $new (@out) { |
848 | $new = substr( |
849 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/", |
850 | $new, 1); |
851 | } |
852 | |
853 | if (@in == 2) { |
854 | ## the two left over are the two extra nulls, so we encoded the proper |
855 | ## amount as-is. |
856 | } elsif (@in == 1) { |
857 | ## We encoded one extra null too many. Undo it. |
858 | $out[$#out] = '='; |
859 | } else { |
860 | ## We must have encoded two nulls... Undo both. |
861 | $out[$#out ] = '='; |
862 | $out[$#out -1] = '='; |
863 | } |
864 | |
865 | join('', @out); |
866 | } |
867 | |
868 | ## |
869 | ## This message stuff really shouldn't be here, but in some seperate library. |
870 | ## Sorry. |
871 | ## |
872 | ## Called as &message(SAVE, TEXT ....), it shoves the text to the screen. |
873 | ## If SAVE is true, bumps the text out as a printed line. Otherwise, |
874 | ## will shove out without a newline so that the next message overwrites it, |
875 | ## or it is clearded via &clear_message(). |
876 | ## |
877 | sub message |
878 | { |
879 | local($nl) = shift; |
880 | die "oops $nl." unless $nl =~ m/^\d+$/; |
881 | local($text) = join('', @_); |
882 | local($NL) = $nl ? "\n" : "\r"; |
883 | $thislength = length($text); |
884 | if ($thislength >= $last_message_length) { |
885 | print STDERR $text, $NL; |
886 | } else { |
887 | print STDERR $text, ' 'x ($last_message_length-$thislength), $NL; |
888 | } |
889 | $last_message_length = $nl ? 0 : $thislength; |
890 | } |
891 | |
892 | sub clear_message |
893 | { |
894 | if ($last_message_length) { |
895 | print STDERR ' ' x $last_message_length, "\r"; |
896 | $last_message_length = 0; |
897 | } |
898 | } |
899 | |
900 | 1; |
901 | __END__ |