Commit | Line | Data |
3fea05b9 |
1 | package URI::Escape; |
2 | use strict; |
3 | |
4 | =head1 NAME |
5 | |
6 | URI::Escape - Escape and unescape unsafe characters |
7 | |
8 | =head1 SYNOPSIS |
9 | |
10 | use URI::Escape; |
11 | $safe = uri_escape("10% is enough\n"); |
12 | $verysafe = uri_escape("foo", "\0-\377"); |
13 | $str = uri_unescape($safe); |
14 | |
15 | =head1 DESCRIPTION |
16 | |
17 | This module provides functions to escape and unescape URI strings as |
18 | defined by RFC 2396 (and updated by RFC 2732). |
19 | A URI consists of a restricted set of characters, |
20 | denoted as C<uric> in RFC 2396. The restricted set of characters |
21 | consists of digits, letters, and a few graphic symbols chosen from |
22 | those common to most of the character encodings and input facilities |
23 | available to Internet users: |
24 | |
25 | "A" .. "Z", "a" .. "z", "0" .. "9", |
26 | ";", "/", "?", ":", "@", "&", "=", "+", "$", ",", "[", "]", # reserved |
27 | "-", "_", ".", "!", "~", "*", "'", "(", ")" |
28 | |
29 | In addition, any byte (octet) can be represented in a URI by an escape |
30 | sequence: a triplet consisting of the character "%" followed by two |
31 | hexadecimal digits. A byte can also be represented directly by a |
32 | character, using the US-ASCII character for that octet (iff the |
33 | character is part of C<uric>). |
34 | |
35 | Some of the C<uric> characters are I<reserved> for use as delimiters |
36 | or as part of certain URI components. These must be escaped if they are |
37 | to be treated as ordinary data. Read RFC 2396 for further details. |
38 | |
39 | The functions provided (and exported by default) from this module are: |
40 | |
41 | =over 4 |
42 | |
43 | =item uri_escape( $string ) |
44 | |
45 | =item uri_escape( $string, $unsafe ) |
46 | |
47 | Replaces each unsafe character in the $string with the corresponding |
48 | escape sequence and returns the result. The $string argument should |
49 | be a string of bytes. The uri_escape() function will croak if given a |
50 | characters with code above 255. Use uri_escape_utf8() if you know you |
51 | have such chars or/and want chars in the 128 .. 255 range treated as |
52 | UTF-8. |
53 | |
54 | The uri_escape() function takes an optional second argument that |
55 | overrides the set of characters that are to be escaped. The set is |
56 | specified as a string that can be used in a regular expression |
57 | character class (between [ ]). E.g.: |
58 | |
59 | "\x00-\x1f\x7f-\xff" # all control and hi-bit characters |
60 | "a-z" # all lower case characters |
61 | "^A-Za-z" # everything not a letter |
62 | |
63 | The default set of characters to be escaped is all those which are |
64 | I<not> part of the C<uric> character class shown above as well as the |
65 | reserved characters. I.e. the default is: |
66 | |
67 | "^A-Za-z0-9\-_.!~*'()" |
68 | |
69 | =item uri_escape_utf8( $string ) |
70 | |
71 | =item uri_escape_utf8( $string, $unsafe ) |
72 | |
73 | Works like uri_escape(), but will encode chars as UTF-8 before |
74 | escaping them. This makes this function able do deal with characters |
75 | with code above 255 in $string. Note that chars in the 128 .. 255 |
76 | range will be escaped differently by this function compared to what |
77 | uri_escape() would. For chars in the 0 .. 127 range there is no |
78 | difference. |
79 | |
80 | The call: |
81 | |
82 | $uri = uri_escape_utf8($string); |
83 | |
84 | will be the same as: |
85 | |
86 | use Encode qw(encode); |
87 | $uri = uri_escape(encode("UTF-8", $string)); |
88 | |
89 | but will even work for perl-5.6 for chars in the 128 .. 255 range. |
90 | |
91 | Note: Javascript has a function called escape() that produces the |
92 | sequence "%uXXXX" for chars in the 256 .. 65535 range. This function |
93 | has really nothing to do with URI escaping but some folks got confused |
94 | since it "does the right thing" in the 0 .. 255 range. Because of |
95 | this you sometimes see "URIs" with these kind of escapes. The |
96 | JavaScript encodeURIComponent() function is similar to uri_escape_utf8(). |
97 | |
98 | =item uri_unescape($string,...) |
99 | |
100 | Returns a string with each %XX sequence replaced with the actual byte |
101 | (octet). |
102 | |
103 | This does the same as: |
104 | |
105 | $string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
106 | |
107 | but does not modify the string in-place as this RE would. Using the |
108 | uri_unescape() function instead of the RE might make the code look |
109 | cleaner and is a few characters less to type. |
110 | |
111 | In a simple benchmark test I did, |
112 | calling the function (instead of the inline RE above) if a few chars |
113 | were unescaped was something like 40% slower, and something like 700% slower if none were. If |
114 | you are going to unescape a lot of times it might be a good idea to |
115 | inline the RE. |
116 | |
117 | If the uri_unescape() function is passed multiple strings, then each |
118 | one is returned unescaped. |
119 | |
120 | =back |
121 | |
122 | The module can also export the C<%escapes> hash, which contains the |
123 | mapping from all 256 bytes to the corresponding escape codes. Lookup |
124 | in this hash is faster than evaluating C<sprintf("%%%02X", ord($byte))> |
125 | each time. |
126 | |
127 | =head1 SEE ALSO |
128 | |
129 | L<URI> |
130 | |
131 | |
132 | =head1 COPYRIGHT |
133 | |
134 | Copyright 1995-2004 Gisle Aas. |
135 | |
136 | This program is free software; you can redistribute it and/or modify |
137 | it under the same terms as Perl itself. |
138 | |
139 | =cut |
140 | |
141 | use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); |
142 | use vars qw(%escapes); |
143 | |
144 | require Exporter; |
145 | @ISA = qw(Exporter); |
146 | @EXPORT = qw(uri_escape uri_unescape uri_escape_utf8); |
147 | @EXPORT_OK = qw(%escapes); |
148 | $VERSION = "3.29"; |
149 | |
150 | use Carp (); |
151 | |
152 | # Build a char->hex map |
153 | for (0..255) { |
154 | $escapes{chr($_)} = sprintf("%%%02X", $_); |
155 | } |
156 | |
157 | my %subst; # compiled patternes |
158 | |
159 | sub uri_escape |
160 | { |
161 | my($text, $patn) = @_; |
162 | return undef unless defined $text; |
163 | if (defined $patn){ |
164 | unless (exists $subst{$patn}) { |
165 | # Because we can't compile the regex we fake it with a cached sub |
166 | (my $tmp = $patn) =~ s,/,\\/,g; |
167 | eval "\$subst{\$patn} = sub {\$_[0] =~ s/([$tmp])/\$escapes{\$1} || _fail_hi(\$1)/ge; }"; |
168 | Carp::croak("uri_escape: $@") if $@; |
169 | } |
170 | &{$subst{$patn}}($text); |
171 | } else { |
172 | # Default unsafe characters. RFC 2732 ^(uric - reserved) |
173 | $text =~ s/([^A-Za-z0-9\-_.!~*'()])/$escapes{$1} || _fail_hi($1)/ge; |
174 | } |
175 | $text; |
176 | } |
177 | |
178 | sub _fail_hi { |
179 | my $chr = shift; |
180 | Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr)); |
181 | } |
182 | |
183 | sub uri_escape_utf8 |
184 | { |
185 | my $text = shift; |
186 | if ($] < 5.008) { |
187 | $text =~ s/([^\0-\x7F])/do {my $o = ord($1); sprintf("%c%c", 0xc0 | ($o >> 6), 0x80 | ($o & 0x3f)) }/ge; |
188 | } |
189 | else { |
190 | utf8::encode($text); |
191 | } |
192 | |
193 | return uri_escape($text, @_); |
194 | } |
195 | |
196 | sub uri_unescape |
197 | { |
198 | # Note from RFC1630: "Sequences which start with a percent sign |
199 | # but are not followed by two hexadecimal characters are reserved |
200 | # for future extension" |
201 | my $str = shift; |
202 | if (@_ && wantarray) { |
203 | # not executed for the common case of a single argument |
204 | my @str = ($str, @_); # need to copy |
205 | foreach (@str) { |
206 | s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; |
207 | } |
208 | return @str; |
209 | } |
210 | $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str; |
211 | $str; |
212 | } |
213 | |
214 | sub escape_char { |
215 | return join '', @URI::Escape::escapes{$_[0] =~ /(\C)/g}; |
216 | } |
217 | |
218 | 1; |