Commit | Line | Data |
3fea05b9 |
1 | package URI::Heuristic; |
2 | |
3 | =head1 NAME |
4 | |
5 | URI::Heuristic - Expand URI using heuristics |
6 | |
7 | =head1 SYNOPSIS |
8 | |
9 | use URI::Heuristic qw(uf_uristr); |
10 | $u = uf_uristr("perl"); # http://www.perl.com |
11 | $u = uf_uristr("www.sol.no/sol"); # http://www.sol.no/sol |
12 | $u = uf_uristr("aas"); # http://www.aas.no |
13 | $u = uf_uristr("ftp.funet.fi"); # ftp://ftp.funet.fi |
14 | $u = uf_uristr("/etc/passwd"); # file:/etc/passwd |
15 | |
16 | =head1 DESCRIPTION |
17 | |
18 | This module provides functions that expand strings into real absolute |
19 | URIs using some built-in heuristics. Strings that already represent |
20 | absolute URIs (i.e. that start with a C<scheme:> part) are never modified |
21 | and are returned unchanged. The main use of these functions is to |
22 | allow abbreviated URIs similar to what many web browsers allow for URIs |
23 | typed in by the user. |
24 | |
25 | The following functions are provided: |
26 | |
27 | =over 4 |
28 | |
29 | =item uf_uristr($str) |
30 | |
31 | Tries to make the argument string |
32 | into a proper absolute URI string. The "uf_" prefix stands for "User |
33 | Friendly". Under MacOS, it assumes that any string with a common URL |
34 | scheme (http, ftp, etc.) is a URL rather than a local path. So don't name |
35 | your volumes after common URL schemes and expect uf_uristr() to construct |
36 | valid file: URL's on those volumes for you, because it won't. |
37 | |
38 | =item uf_uri($str) |
39 | |
40 | Works the same way as uf_uristr() but |
41 | returns a C<URI> object. |
42 | |
43 | =back |
44 | |
45 | =head1 ENVIRONMENT |
46 | |
47 | If the hostname portion of a URI does not contain any dots, then |
48 | certain qualified guesses are made. These guesses are governed by |
49 | the following two environment variables: |
50 | |
51 | =over 10 |
52 | |
53 | =item COUNTRY |
54 | |
55 | The two-letter country code (ISO 3166) for your location. If |
56 | the domain name of your host ends with two letters, then it is taken |
57 | to be the default country. See also L<Locale::Country>. |
58 | |
59 | =item URL_GUESS_PATTERN |
60 | |
61 | Contains a space-separated list of URL patterns to try. The string |
62 | "ACME" is for some reason used as a placeholder for the host name in |
63 | the URL provided. Example: |
64 | |
65 | URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com" |
66 | export URL_GUESS_PATTERN |
67 | |
68 | Specifying URL_GUESS_PATTERN disables any guessing rules based on |
69 | country. An empty URL_GUESS_PATTERN disables any guessing that |
70 | involves host name lookups. |
71 | |
72 | =back |
73 | |
74 | =head1 COPYRIGHT |
75 | |
76 | Copyright 1997-1998, Gisle Aas |
77 | |
78 | This library is free software; you can redistribute it and/or |
79 | modify it under the same terms as Perl itself. |
80 | |
81 | =cut |
82 | |
83 | use strict; |
84 | |
85 | use vars qw(@EXPORT_OK $VERSION $MY_COUNTRY %LOCAL_GUESSING $DEBUG); |
86 | |
87 | require Exporter; |
88 | *import = \&Exporter::import; |
89 | @EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr); |
90 | $VERSION = "4.18"; |
91 | |
92 | sub MY_COUNTRY() { |
93 | for ($MY_COUNTRY) { |
94 | return $_ if defined; |
95 | |
96 | # First try the environment. |
97 | $_ = $ENV{COUNTRY}; |
98 | return $_ if defined; |
99 | |
100 | # Could use LANG, LC_ALL, etc at this point, but probably too |
101 | # much of a wild guess. (Catalan != Canada, etc.) |
102 | # |
103 | |
104 | # Last bit of domain name. This may access the network. |
105 | require Net::Domain; |
106 | my $fqdn = Net::Domain::hostfqdn(); |
107 | $_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/; |
108 | return $_ if defined; |
109 | |
110 | # Give up. Defined but false. |
111 | return ($_ = 0); |
112 | } |
113 | } |
114 | |
115 | %LOCAL_GUESSING = |
116 | ( |
117 | 'us' => [qw(www.ACME.gov www.ACME.mil)], |
118 | 'uk' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)], |
119 | 'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)], |
120 | 'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)], |
121 | # send corrections and new entries to <gisle@aas.no> |
122 | ); |
123 | |
124 | |
125 | sub uf_uristr ($) |
126 | { |
127 | local($_) = @_; |
128 | print STDERR "uf_uristr: resolving $_\n" if $DEBUG; |
129 | return unless defined; |
130 | |
131 | s/^\s+//; |
132 | s/\s+$//; |
133 | |
134 | if (/^(www|web|home)\./) { |
135 | $_ = "http://$_"; |
136 | |
137 | } elsif (/^(ftp|gopher|news|wais|http|https)\./) { |
138 | $_ = "$1://$_"; |
139 | |
140 | } elsif ($^O ne "MacOS" && |
141 | (m,^/, || # absolute file name |
142 | m,^\.\.?/, || # relative file name |
143 | m,^[a-zA-Z]:[/\\],) # dosish file name |
144 | ) |
145 | { |
146 | $_ = "file:$_"; |
147 | |
148 | } elsif ($^O eq "MacOS" && m/:/) { |
149 | # potential MacOS file name |
150 | unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) { |
151 | require URI::file; |
152 | my $a = URI::file->new($_)->as_string; |
153 | $_ = ($a =~ m/^file:/) ? $a : "file:$a"; |
154 | } |
155 | } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) { |
156 | $_ = "mailto:$_"; |
157 | |
158 | } elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) { # no scheme specified |
159 | if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) { |
160 | my $host = $1; |
161 | |
162 | if ($host !~ /\./ && $host ne "localhost") { |
163 | my @guess; |
164 | if (exists $ENV{URL_GUESS_PATTERN}) { |
165 | @guess = map { s/\bACME\b/$host/; $_ } |
166 | split(' ', $ENV{URL_GUESS_PATTERN}); |
167 | } else { |
168 | if (MY_COUNTRY()) { |
169 | my $special = $LOCAL_GUESSING{MY_COUNTRY()}; |
170 | if ($special) { |
171 | my @special = @$special; |
172 | push(@guess, map { s/\bACME\b/$host/; $_ } |
173 | @special); |
174 | } else { |
175 | push(@guess, "www.$host." . MY_COUNTRY()); |
176 | } |
177 | } |
178 | push(@guess, map "www.$host.$_", |
179 | "com", "org", "net", "edu", "int"); |
180 | } |
181 | |
182 | |
183 | my $guess; |
184 | for $guess (@guess) { |
185 | print STDERR "uf_uristr: gethostbyname('$guess.')..." |
186 | if $DEBUG; |
187 | if (gethostbyname("$guess.")) { |
188 | print STDERR "yes\n" if $DEBUG; |
189 | $host = $guess; |
190 | last; |
191 | } |
192 | print STDERR "no\n" if $DEBUG; |
193 | } |
194 | } |
195 | $_ = "http://$host$_"; |
196 | |
197 | } else { |
198 | # pure junk, just return it unchanged... |
199 | |
200 | } |
201 | } |
202 | print STDERR "uf_uristr: ==> $_\n" if $DEBUG; |
203 | |
204 | $_; |
205 | } |
206 | |
207 | sub uf_uri ($) |
208 | { |
209 | require URI; |
210 | URI->new(uf_uristr($_[0])); |
211 | } |
212 | |
213 | # legacy |
214 | *uf_urlstr = \*uf_uristr; |
215 | |
216 | sub uf_url ($) |
217 | { |
218 | require URI::URL; |
219 | URI::URL->new(uf_uristr($_[0])); |
220 | } |
221 | |
222 | 1; |