Commit | Line | Data |
3ef515df |
1 | package encoding; |
0f7c507f |
2 | our $VERSION = do { my @r = (q$Revision: 1.38 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
3ef515df |
3 | |
4 | use Encode; |
046f36bf |
5 | use strict; |
3ef515df |
6 | |
7 | BEGIN { |
8 | if (ord("A") == 193) { |
9 | require Carp; |
10c5ecbb |
10 | Carp::croak("encoding pragma does not support EBCDIC platforms"); |
3ef515df |
11 | } |
12 | } |
13 | |
0ab8f81e |
14 | our $HAS_PERLIO = 0; |
15 | eval { require PerlIO::encoding }; |
16 | unless ($@){ |
17 | $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02); |
18 | } |
b2704119 |
19 | |
3ef515df |
20 | sub import { |
21 | my $class = shift; |
22 | my $name = shift; |
23 | my %arg = @_; |
24 | $name ||= $ENV{PERL_ENCODING}; |
25 | |
26 | my $enc = find_encoding($name); |
27 | unless (defined $enc) { |
28 | require Carp; |
10c5ecbb |
29 | Carp::croak("Unknown encoding '$name'"); |
3ef515df |
30 | } |
0f7c507f |
31 | unless ($arg{Filter}) { |
32 | ${^ENCODING} = $enc # this is all you need, actually. |
33 | unless $name =~ /^(?:utf-?(?:8|16|32)|ucs-?(?:2|4))(?:[bl]e)?$/i; |
85982a32 |
34 | $HAS_PERLIO or return 1; |
aae85ceb |
35 | for my $h (qw(STDIN STDOUT)){ |
36 | if ($arg{$h}){ |
b2704119 |
37 | unless (defined find_encoding($arg{$h})) { |
aae85ceb |
38 | require Carp; |
10c5ecbb |
39 | Carp::croak("Unknown encoding for $h, '$arg{$h}'"); |
aae85ceb |
40 | } |
0ab8f81e |
41 | eval { binmode($h, ":encoding($arg{$h})") }; |
aae85ceb |
42 | }else{ |
43 | unless (exists $arg{$h}){ |
0ab8f81e |
44 | eval { |
45 | no warnings 'uninitialized'; |
46 | binmode($h, ":encoding($name)"); |
47 | }; |
aae85ceb |
48 | } |
49 | } |
50 | if ($@){ |
3ef515df |
51 | require Carp; |
aae85ceb |
52 | Carp::croak($@); |
3ef515df |
53 | } |
3ef515df |
54 | } |
aae85ceb |
55 | }else{ |
56 | defined(${^ENCODING}) and undef ${^ENCODING}; |
57 | eval { |
58 | require Filter::Util::Call ; |
59 | Filter::Util::Call->import ; |
b2704119 |
60 | binmode(STDIN); |
61 | binmode(STDOUT); |
aae85ceb |
62 | filter_add(sub{ |
63 | my $status; |
64 | if (($status = filter_read()) > 0){ |
65 | $_ = $enc->decode($_, 1); |
66 | # warn $_; |
67 | } |
68 | $status ; |
69 | }); |
70 | }; |
71 | # warn "Filter installed"; |
3ef515df |
72 | } |
73 | return 1; # I doubt if we need it, though |
74 | } |
75 | |
76 | sub unimport{ |
77 | no warnings; |
78 | undef ${^ENCODING}; |
621b0f8d |
79 | if ($HAS_PERLIO){ |
80 | binmode(STDIN, ":raw"); |
81 | binmode(STDOUT, ":raw"); |
82 | }else{ |
b2704119 |
83 | binmode(STDIN); |
84 | binmode(STDOUT); |
621b0f8d |
85 | } |
aae85ceb |
86 | if ($INC{"Filter/Util/Call.pm"}){ |
87 | eval { filter_del() }; |
88 | } |
3ef515df |
89 | } |
90 | |
91 | 1; |
92 | __END__ |
85982a32 |
93 | |
3ef515df |
94 | =pod |
95 | |
96 | =head1 NAME |
97 | |
0ab8f81e |
98 | encoding - allows you to write your script in non-ascii or non-utf8 |
3ef515df |
99 | |
100 | =head1 SYNOPSIS |
101 | |
962111ca |
102 | use encoding "greek"; # Perl like Greek to you? |
3ef515df |
103 | use encoding "euc-jp"; # Jperl! |
104 | |
962111ca |
105 | # or you can even do this if your shell supports your native encoding |
3ef515df |
106 | |
962111ca |
107 | perl -Mencoding=latin2 -e '...' # Feeling centrally European? |
0ab8f81e |
108 | perl -Mencoding=euc-kr -e '...' # Or Korean? |
3ef515df |
109 | |
3ef515df |
110 | # more control |
111 | |
962111ca |
112 | # A simple euc-cn => utf-8 converter |
6d1c0808 |
113 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; |
3ef515df |
114 | |
115 | # "no encoding;" supported (but not scoped!) |
116 | no encoding; |
117 | |
aae85ceb |
118 | # an alternate way, Filter |
119 | use encoding "euc-jp", Filter=>1; |
120 | use utf8; |
121 | # now you can use kanji identifiers -- in euc-jp! |
122 | |
3ef515df |
123 | =head1 ABSTRACT |
124 | |
962111ca |
125 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode |
126 | support. You could apply C<substr()> and regexes even to complex CJK |
127 | characters -- so long as the script was written in UTF-8. But back |
0ab8f81e |
128 | then, text editors that supported UTF-8 were still rare and many users |
129 | instead chose to write scripts in legacy encodings, giving up a whole |
130 | new feature of Perl 5.6. |
3ef515df |
131 | |
0ab8f81e |
132 | Rewind to the future: starting from perl 5.8.0 with the B<encoding> |
962111ca |
133 | pragma, you can write your script in any encoding you like (so long |
134 | as the C<Encode> module supports it) and still enjoy Unicode support. |
0ab8f81e |
135 | You can write code in EUC-JP as follows: |
3ef515df |
136 | |
137 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
138 | #<-char-><-char-> # 4 octets |
139 | s/\bCamel\b/$Rakuda/; |
140 | |
141 | And with C<use encoding "euc-jp"> in effect, it is the same thing as |
962111ca |
142 | the code in UTF-8: |
3ef515df |
143 | |
32b9ed1f |
144 | my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters |
3ef515df |
145 | s/\bCamel\b/$Rakuda/; |
146 | |
962111ca |
147 | The B<encoding> pragma also modifies the filehandle disciplines of |
4b291ae6 |
148 | STDIN and STDOUT to the specified encoding. Therefore, |
3ef515df |
149 | |
150 | use encoding "euc-jp"; |
151 | my $message = "Camel is the symbol of perl.\n"; |
152 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
153 | $message =~ s/\bCamel\b/$Rakuda/; |
154 | print $message; |
155 | |
962111ca |
156 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", |
157 | not "\x{99F1}\x{99DD} is the symbol of perl.\n". |
3ef515df |
158 | |
0ab8f81e |
159 | You can override this by giving extra arguments; see below. |
3ef515df |
160 | |
161 | =head1 USAGE |
162 | |
163 | =over 4 |
164 | |
165 | =item use encoding [I<ENCNAME>] ; |
166 | |
0ab8f81e |
167 | Sets the script encoding to I<ENCNAME>. Filehandle disciplines of |
168 | STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR |
169 | will not be changed. |
3ef515df |
170 | |
171 | If no encoding is specified, the environment variable L<PERL_ENCODING> |
962111ca |
172 | is consulted. If no encoding can be found, the error C<Unknown encoding |
173 | 'I<ENCNAME>'> will be thrown. |
3ef515df |
174 | |
175 | Note that non-STD file handles remain unaffected. Use C<use open> or |
176 | C<binmode> to change disciplines of those. |
177 | |
aae85ceb |
178 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; |
3ef515df |
179 | |
0ab8f81e |
180 | You can also individually set encodings of STDIN and STDOUT via the |
32b9ed1f |
181 | C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the |
182 | first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding |
aae85ceb |
183 | completely off. |
3ef515df |
184 | |
185 | =item no encoding; |
186 | |
0ab8f81e |
187 | Unsets the script encoding. The disciplines of STDIN, STDOUT are |
962111ca |
188 | reset to ":raw" (the default unprocessed raw stream of bytes). |
3ef515df |
189 | |
190 | =back |
191 | |
192 | =head1 CAVEATS |
193 | |
194 | =head2 NOT SCOPED |
195 | |
196 | The pragma is a per script, not a per block lexical. Only the last |
621b0f8d |
197 | C<use encoding> or C<no encoding> matters, and it affects |
198 | B<the whole script>. However, the <no encoding> pragma is supported and |
199 | B<use encoding> can appear as many times as you want in a given script. |
200 | The multiple use of this pragma is discouraged. |
201 | |
202 | Because of this nature, the use of this pragma inside the module is |
203 | strongly discouraged (because the influence of this pragma lasts not |
204 | only for the module but the script that uses). But if you have to, |
205 | make sure you say C<no encoding> at the end of the module so you |
206 | contain the influence of the pragma within the module. |
3ef515df |
207 | |
208 | =head2 DO NOT MIX MULTIPLE ENCODINGS |
209 | |
210 | Notice that only literals (string or regular expression) having only |
211 | legacy code points are affected: if you mix data like this |
212 | |
213 | \xDF\x{100} |
214 | |
215 | the data is assumed to be in (Latin 1 and) Unicode, not in your native |
216 | encoding. In other words, this will match in "greek": |
217 | |
218 | "\xDF" =~ /\x{3af}/ |
219 | |
220 | but this will not |
221 | |
222 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ |
223 | |
962111ca |
224 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on |
225 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL |
226 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You |
227 | should not be mixing your legacy data and Unicode in the same string. |
3ef515df |
228 | |
229 | This pragma also affects encoding of the 0x80..0xFF code point range: |
230 | normally characters in that range are left as eight-bit bytes (unless |
231 | they are combined with characters with code points 0x100 or larger, |
232 | in which case all characters need to become UTF-8 encoded), but if |
233 | the C<encoding> pragma is present, even the 0x80..0xFF range always |
234 | gets UTF-8 encoded. |
235 | |
236 | After all, the best thing about this pragma is that you don't have to |
0ab8f81e |
237 | resort to \x{....} just to spell your name in a native encoding. |
238 | So feel free to put your strings in your encoding in quotes and |
239 | regexes. |
3ef515df |
240 | |
4b291ae6 |
241 | =head2 tr/// with ranges remain unaffected |
242 | |
243 | The B<encoding> pragma works by decoding string literals in |
244 | C<q//,qq//,qr//,qw///, qx//> and so forth. As of perl 5.8.0, this |
245 | does not apply to C<tr///>. Therefore, |
246 | |
247 | use encoding 'euc-jp'; |
248 | #.... |
249 | $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; |
250 | # -------- -------- -------- -------- |
251 | |
252 | Does not work as |
253 | |
254 | $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; |
255 | |
256 | =over |
257 | |
258 | =item Legend of characters above |
259 | |
260 | utf8 euc-jp charnames::viacode() |
261 | ----------------------------------------- |
262 | \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A |
263 | \x{3093} \xA4\xF3 HIRAGANA LETTER N |
264 | \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A |
265 | \x{30f3} \xA5\xF3 KATAKANA LETTER N |
266 | |
267 | =back |
268 | |
269 | =head3 workaround to tr///; |
270 | |
271 | You can, however, achieve the same as simply as follows; |
272 | |
273 | use encoding 'euc-jp'; |
274 | # .... |
275 | eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; |
276 | |
277 | Note the C<tr//> expression is surronded by C<qq{}>. The idea behind |
278 | is the same as classic idiom that makes C<tr///> 'interpolate'. |
279 | |
280 | tr/$from/$to/; # wrong! |
281 | eval qq{ tr/$from/$to/ }; # workaround. |
282 | |
283 | Nevertheless, in case of B<encoding> pragma even C<q//> is affected so |
284 | C<tr///> not being decoded was obviously against the will of Perl5 |
285 | Porters. In future version of perl, this counter-intuitive behaviour |
286 | of C<tr///> will be fixed so C<eval qq{}> trick will be unneccesary. |
287 | |
962111ca |
288 | =head1 Non-ASCII Identifiers and Filter option |
aae85ceb |
289 | |
962111ca |
290 | The magic of C<use encoding> is not applied to the names of |
32b9ed1f |
291 | identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human |
962111ca |
292 | is a single Han ideograph) work, you still need to write your script |
293 | in UTF-8 or use a source filter. |
aae85ceb |
294 | |
0ab8f81e |
295 | In other words, the same restriction as with Jperl applies. |
aae85ceb |
296 | |
0ab8f81e |
297 | If you dare to experiment, however, you can try the Filter option. |
aae85ceb |
298 | |
299 | =over 4 |
300 | |
301 | =item use encoding I<ENCNAME> Filter=E<gt>1; |
302 | |
0ab8f81e |
303 | This turns the encoding pragma into a source filter. While the default |
aae85ceb |
304 | approach just decodes interpolated literals (in qq() and qr()), this |
0ab8f81e |
305 | will apply a source filter to the entire source code. In this case, |
306 | STDIN and STDOUT remain untouched. |
aae85ceb |
307 | |
308 | =back |
309 | |
962111ca |
310 | What does this mean? Your source code behaves as if it is written in |
0ab8f81e |
311 | UTF-8. So even if your editor only supports Shift_JIS, for example, |
312 | you can still try examples in Chapter 15 of C<Programming Perl, 3rd |
313 | Ed.>. For instance, you can use UTF-8 identifiers. |
aae85ceb |
314 | |
315 | This option is significantly slower and (as of this writing) non-ASCII |
316 | identifiers are not very stable WITHOUT this option and with the |
317 | source code written in UTF-8. |
318 | |
962111ca |
319 | To make your script in legacy encoding work with minimum effort, |
320 | do not use Filter=E<gt>1. |
aae85ceb |
321 | |
3ef515df |
322 | =head1 EXAMPLE - Greekperl |
323 | |
324 | use encoding "iso 8859-7"; |
325 | |
0ab8f81e |
326 | # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. |
3ef515df |
327 | |
328 | $a = "\xDF"; |
329 | $b = "\x{100}"; |
330 | |
331 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf |
332 | |
333 | $c = $a . $b; |
334 | |
335 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". |
336 | |
337 | # chr() is affected, and ... |
338 | |
339 | print "mega\n" if ord(chr(0xdf)) == 0x3af; |
340 | |
341 | # ... ord() is affected by the encoding pragma ... |
342 | |
343 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; |
344 | |
345 | # ... as are eq and cmp ... |
346 | |
347 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); |
348 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; |
349 | |
350 | # ... but pack/unpack C are not affected, in case you still |
0ab8f81e |
351 | # want to go back to your native encoding |
3ef515df |
352 | |
353 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; |
354 | |
355 | =head1 KNOWN PROBLEMS |
356 | |
0ab8f81e |
357 | For native multibyte encodings (either fixed or variable length), |
3ef515df |
358 | the current implementation of the regular expressions may introduce |
0ab8f81e |
359 | recoding errors for regular expression literals longer than 127 bytes. |
3ef515df |
360 | |
361 | The encoding pragma is not supported on EBCDIC platforms. |
0ab8f81e |
362 | (Porters who are willing and able to remove this limitation are |
363 | welcome.) |
3ef515df |
364 | |
365 | =head1 SEE ALSO |
366 | |
aae85ceb |
367 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, |
368 | |
369 | Ch. 15 of C<Programming Perl (3rd Edition)> |
370 | by Larry Wall, Tom Christiansen, Jon Orwant; |
371 | O'Reilly & Associates; ISBN 0-596-00027-8 |
3ef515df |
372 | |
373 | =cut |