Commit | Line | Data |
9735c3fc |
1 | # $Id: encoding.pm,v 1.44 2003/03/09 20:07:37 dankogai Exp $ |
3ef515df |
2 | package encoding; |
0f29a567 |
3 | our $VERSION = do { my @r = (q$Revision: 1.44 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
3ef515df |
4 | |
5 | use Encode; |
046f36bf |
6 | use strict; |
151b5d36 |
7 | our $DEBUG = 0; |
3ef515df |
8 | |
9 | BEGIN { |
10 | if (ord("A") == 193) { |
11 | require Carp; |
10c5ecbb |
12 | Carp::croak("encoding pragma does not support EBCDIC platforms"); |
3ef515df |
13 | } |
14 | } |
15 | |
0ab8f81e |
16 | our $HAS_PERLIO = 0; |
17 | eval { require PerlIO::encoding }; |
18 | unless ($@){ |
19 | $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02); |
20 | } |
b2704119 |
21 | |
151b5d36 |
22 | sub _exception{ |
23 | my $name = shift; |
24 | $] > 5.008 and return 0; # 5.8.1 then no |
25 | my %utfs = map {$_=>1} |
26 | qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE |
27 | UTF-32 UTF-32BE UTF-32LE); |
28 | $utfs{$name} or return 0; # UTFs or no |
29 | require Config; Config->import(); our %Config; |
2b6a28d4 |
30 | return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no |
151b5d36 |
31 | } |
fa6f41cf |
32 | |
3ef515df |
33 | sub import { |
34 | my $class = shift; |
35 | my $name = shift; |
36 | my %arg = @_; |
37 | $name ||= $ENV{PERL_ENCODING}; |
3ef515df |
38 | my $enc = find_encoding($name); |
39 | unless (defined $enc) { |
40 | require Carp; |
10c5ecbb |
41 | Carp::croak("Unknown encoding '$name'"); |
3ef515df |
42 | } |
fa6f41cf |
43 | $name = $enc->name; # canonize |
0f7c507f |
44 | unless ($arg{Filter}) { |
151b5d36 |
45 | $DEBUG and warn "_exception($name) = ", _exception($name); |
46 | _exception($name) or ${^ENCODING} = $enc; |
85982a32 |
47 | $HAS_PERLIO or return 1; |
aae85ceb |
48 | }else{ |
49 | defined(${^ENCODING}) and undef ${^ENCODING}; |
151b5d36 |
50 | # implicitly 'use utf8' |
51 | require utf8; # to fetch $utf8::hint_bits; |
52 | $^H |= $utf8::hint_bits; |
aae85ceb |
53 | eval { |
54 | require Filter::Util::Call ; |
55 | Filter::Util::Call->import ; |
aae85ceb |
56 | filter_add(sub{ |
151b5d36 |
57 | my $status = filter_read(); |
58 | if ($status > 0){ |
59 | # $DEBUG and warn $_; |
aae85ceb |
60 | $_ = $enc->decode($_, 1); |
151b5d36 |
61 | $DEBUG and warn $_; |
aae85ceb |
62 | } |
63 | $status ; |
64 | }); |
65 | }; |
6be7c101 |
66 | } $DEBUG and warn "Filter installed"; |
05ef2f67 |
67 | defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; |
151b5d36 |
68 | for my $h (qw(STDIN STDOUT)){ |
69 | if ($arg{$h}){ |
70 | unless (defined find_encoding($arg{$h})) { |
71 | require Carp; |
72 | Carp::croak("Unknown encoding for $h, '$arg{$h}'"); |
73 | } |
6be7c101 |
74 | eval { binmode($h, ":raw :encoding($arg{$h})") }; |
151b5d36 |
75 | }else{ |
76 | unless (exists $arg{$h}){ |
77 | eval { |
78 | no warnings 'uninitialized'; |
6be7c101 |
79 | binmode($h, ":raw :encoding($name)"); |
151b5d36 |
80 | }; |
81 | } |
82 | } |
83 | if ($@){ |
84 | require Carp; |
85 | Carp::croak($@); |
86 | } |
3ef515df |
87 | } |
88 | return 1; # I doubt if we need it, though |
89 | } |
90 | |
91 | sub unimport{ |
92 | no warnings; |
93 | undef ${^ENCODING}; |
621b0f8d |
94 | if ($HAS_PERLIO){ |
95 | binmode(STDIN, ":raw"); |
96 | binmode(STDOUT, ":raw"); |
97 | }else{ |
6be7c101 |
98 | binmode(STDIN); |
99 | binmode(STDOUT); |
621b0f8d |
100 | } |
aae85ceb |
101 | if ($INC{"Filter/Util/Call.pm"}){ |
102 | eval { filter_del() }; |
103 | } |
3ef515df |
104 | } |
105 | |
106 | 1; |
107 | __END__ |
85982a32 |
108 | |
3ef515df |
109 | =pod |
110 | |
111 | =head1 NAME |
112 | |
0ab8f81e |
113 | encoding - allows you to write your script in non-ascii or non-utf8 |
3ef515df |
114 | |
115 | =head1 SYNOPSIS |
116 | |
962111ca |
117 | use encoding "greek"; # Perl like Greek to you? |
3ef515df |
118 | use encoding "euc-jp"; # Jperl! |
119 | |
962111ca |
120 | # or you can even do this if your shell supports your native encoding |
3ef515df |
121 | |
962111ca |
122 | perl -Mencoding=latin2 -e '...' # Feeling centrally European? |
0ab8f81e |
123 | perl -Mencoding=euc-kr -e '...' # Or Korean? |
3ef515df |
124 | |
3ef515df |
125 | # more control |
126 | |
962111ca |
127 | # A simple euc-cn => utf-8 converter |
6d1c0808 |
128 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; |
3ef515df |
129 | |
130 | # "no encoding;" supported (but not scoped!) |
131 | no encoding; |
132 | |
aae85ceb |
133 | # an alternate way, Filter |
134 | use encoding "euc-jp", Filter=>1; |
aae85ceb |
135 | # now you can use kanji identifiers -- in euc-jp! |
136 | |
3ef515df |
137 | =head1 ABSTRACT |
138 | |
962111ca |
139 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode |
140 | support. You could apply C<substr()> and regexes even to complex CJK |
141 | characters -- so long as the script was written in UTF-8. But back |
0ab8f81e |
142 | then, text editors that supported UTF-8 were still rare and many users |
143 | instead chose to write scripts in legacy encodings, giving up a whole |
144 | new feature of Perl 5.6. |
3ef515df |
145 | |
0ab8f81e |
146 | Rewind to the future: starting from perl 5.8.0 with the B<encoding> |
962111ca |
147 | pragma, you can write your script in any encoding you like (so long |
148 | as the C<Encode> module supports it) and still enjoy Unicode support. |
0f29a567 |
149 | This pragma achieves that by doing the following: |
05ef2f67 |
150 | |
151 | =over |
152 | |
153 | =item * |
154 | |
155 | Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from |
156 | the encoding specified to utf8. In Perl 5.8.1 and later, literals in |
157 | C<tr///> and C<DATA> pseudo-filehandle are also converted. |
158 | |
159 | =item * |
160 | |
161 | Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding |
162 | specified. |
163 | |
164 | =back |
165 | |
166 | =head2 Literal Conversions |
167 | |
0ab8f81e |
168 | You can write code in EUC-JP as follows: |
3ef515df |
169 | |
170 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
171 | #<-char-><-char-> # 4 octets |
172 | s/\bCamel\b/$Rakuda/; |
173 | |
174 | And with C<use encoding "euc-jp"> in effect, it is the same thing as |
962111ca |
175 | the code in UTF-8: |
3ef515df |
176 | |
32b9ed1f |
177 | my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters |
3ef515df |
178 | s/\bCamel\b/$Rakuda/; |
179 | |
05ef2f67 |
180 | =head2 PerlIO layers for C<STD(IN|OUT)> |
181 | |
182 | The B<encoding> pragma also modifies the filehandle layers of |
4b291ae6 |
183 | STDIN and STDOUT to the specified encoding. Therefore, |
3ef515df |
184 | |
185 | use encoding "euc-jp"; |
186 | my $message = "Camel is the symbol of perl.\n"; |
187 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
188 | $message =~ s/\bCamel\b/$Rakuda/; |
189 | print $message; |
190 | |
962111ca |
191 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", |
192 | not "\x{99F1}\x{99DD} is the symbol of perl.\n". |
3ef515df |
193 | |
0ab8f81e |
194 | You can override this by giving extra arguments; see below. |
3ef515df |
195 | |
05ef2f67 |
196 | =head1 FEATURES THAT REQUIRE 5.8.1 |
197 | |
198 | Some of the features offered by this pragma requires perl 5.8.1. Most |
0f29a567 |
199 | of these are done by Inaba Hiroto. Any other features and changes |
05ef2f67 |
200 | are good for 5.8.0. |
201 | |
202 | =over |
203 | |
204 | =item "NON-EUC" doublebyte encodings |
205 | |
0f29a567 |
206 | Because perl needs to parse script before applying this pragma, such |
05ef2f67 |
207 | encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; |
208 | \x5c) in the second byte fails because the second byte may |
0f29a567 |
209 | accidentally escape the quoting character that follows. Perl 5.8.1 |
05ef2f67 |
210 | or later fixes this problem. |
211 | |
212 | =item tr// |
213 | |
214 | C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 |
215 | See the section below for details. |
216 | |
217 | =item DATA pseudo-filehandle |
218 | |
219 | Another feature that was overlooked was C<DATA>. |
220 | |
221 | =back |
222 | |
3ef515df |
223 | =head1 USAGE |
224 | |
225 | =over 4 |
226 | |
227 | =item use encoding [I<ENCNAME>] ; |
228 | |
05ef2f67 |
229 | Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} |
230 | exists and non-zero, PerlIO layers of STDIN and STDOUT are set to |
231 | ":encoding(I<ENCNAME>)". |
232 | |
233 | Note that STDERR WILL NOT be changed. |
234 | |
235 | Also note that non-STD file handles remain unaffected. Use C<use |
236 | open> or C<binmode> to change layers of those. |
3ef515df |
237 | |
238 | If no encoding is specified, the environment variable L<PERL_ENCODING> |
962111ca |
239 | is consulted. If no encoding can be found, the error C<Unknown encoding |
240 | 'I<ENCNAME>'> will be thrown. |
3ef515df |
241 | |
aae85ceb |
242 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; |
3ef515df |
243 | |
0ab8f81e |
244 | You can also individually set encodings of STDIN and STDOUT via the |
32b9ed1f |
245 | C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the |
246 | first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding |
aae85ceb |
247 | completely off. |
3ef515df |
248 | |
05ef2f67 |
249 | When ${^UNICODE} exists and non-zero, these options will completely |
250 | ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See |
251 | L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for |
252 | details (perl 5.8.1 and later). |
253 | |
151b5d36 |
254 | =item use encoding I<ENCNAME> Filter=E<gt>1; |
255 | |
256 | This turns the encoding pragma into a source filter. While the |
257 | default approach just decodes interpolated literals (in qq() and |
258 | qr()), this will apply a source filter to the entire source code. See |
05ef2f67 |
259 | L</"The Filter Option"> below for details. |
151b5d36 |
260 | |
3ef515df |
261 | =item no encoding; |
262 | |
05ef2f67 |
263 | Unsets the script encoding. The layers of STDIN, STDOUT are |
962111ca |
264 | reset to ":raw" (the default unprocessed raw stream of bytes). |
3ef515df |
265 | |
266 | =back |
267 | |
151b5d36 |
268 | =head1 The Filter Option |
269 | |
270 | The magic of C<use encoding> is not applied to the names of |
271 | identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human |
272 | is a single Han ideograph) work, you still need to write your script |
273 | in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. |
274 | |
151b5d36 |
275 | What does this mean? Your source code behaves as if it is written in |
276 | UTF-8 with 'use utf8' in effect. So even if your editor only supports |
277 | Shift_JIS, for example, you can still try examples in Chapter 15 of |
278 | C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 |
279 | identifiers. |
280 | |
281 | This option is significantly slower and (as of this writing) non-ASCII |
282 | identifiers are not very stable WITHOUT this option and with the |
283 | source code written in UTF-8. |
284 | |
285 | =head2 Filter-related changes at Encode version 1.87 |
286 | |
287 | =over |
288 | |
289 | =item * |
290 | |
291 | The Filter option now sets STDIN and STDOUT like non-filter options. |
292 | And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like |
293 | non-filter version. |
294 | |
295 | =item * |
296 | |
297 | C<use utf8> is implicitly declared so you no longer have to C<use |
298 | utf8> to C<${"\x{4eba}"}++>. |
299 | |
300 | =back |
301 | |
3ef515df |
302 | =head1 CAVEATS |
303 | |
304 | =head2 NOT SCOPED |
305 | |
306 | The pragma is a per script, not a per block lexical. Only the last |
621b0f8d |
307 | C<use encoding> or C<no encoding> matters, and it affects |
308 | B<the whole script>. However, the <no encoding> pragma is supported and |
309 | B<use encoding> can appear as many times as you want in a given script. |
310 | The multiple use of this pragma is discouraged. |
311 | |
0f29a567 |
312 | By the same reason, the use this pragma inside modules is also |
313 | discouraged (though not as strongly discouranged as the case above. |
314 | See below). |
05ef2f67 |
315 | |
316 | If you still have to write a module with this pragma, be very careful |
317 | of the load order. See the codes below; |
318 | |
319 | # called module |
320 | package Module_IN_BAR; |
321 | use encoding "bar"; |
322 | # stuff in "bar" encoding here |
323 | 1; |
324 | |
325 | # caller script |
326 | use encoding "foo" |
327 | use Module_IN_BAR; |
328 | # surprise! use encoding "bar" is in effect. |
329 | |
330 | The best way to avoid this oddity is to use this pragma RIGHT AFTER |
331 | other modules are loaded. i.e. |
332 | |
333 | use Module_IN_BAR; |
334 | use encoding "foo"; |
3ef515df |
335 | |
336 | =head2 DO NOT MIX MULTIPLE ENCODINGS |
337 | |
338 | Notice that only literals (string or regular expression) having only |
339 | legacy code points are affected: if you mix data like this |
340 | |
341 | \xDF\x{100} |
342 | |
343 | the data is assumed to be in (Latin 1 and) Unicode, not in your native |
344 | encoding. In other words, this will match in "greek": |
345 | |
346 | "\xDF" =~ /\x{3af}/ |
347 | |
348 | but this will not |
349 | |
350 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ |
351 | |
962111ca |
352 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on |
353 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL |
354 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You |
355 | should not be mixing your legacy data and Unicode in the same string. |
3ef515df |
356 | |
357 | This pragma also affects encoding of the 0x80..0xFF code point range: |
358 | normally characters in that range are left as eight-bit bytes (unless |
359 | they are combined with characters with code points 0x100 or larger, |
360 | in which case all characters need to become UTF-8 encoded), but if |
361 | the C<encoding> pragma is present, even the 0x80..0xFF range always |
362 | gets UTF-8 encoded. |
363 | |
364 | After all, the best thing about this pragma is that you don't have to |
0ab8f81e |
365 | resort to \x{....} just to spell your name in a native encoding. |
366 | So feel free to put your strings in your encoding in quotes and |
367 | regexes. |
3ef515df |
368 | |
151b5d36 |
369 | =head2 tr/// with ranges |
4b291ae6 |
370 | |
371 | The B<encoding> pragma works by decoding string literals in |
151b5d36 |
372 | C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this |
4b291ae6 |
373 | does not apply to C<tr///>. Therefore, |
374 | |
375 | use encoding 'euc-jp'; |
376 | #.... |
377 | $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; |
378 | # -------- -------- -------- -------- |
379 | |
380 | Does not work as |
381 | |
382 | $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; |
383 | |
384 | =over |
385 | |
386 | =item Legend of characters above |
387 | |
388 | utf8 euc-jp charnames::viacode() |
389 | ----------------------------------------- |
390 | \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A |
391 | \x{3093} \xA4\xF3 HIRAGANA LETTER N |
392 | \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A |
393 | \x{30f3} \xA5\xF3 KATAKANA LETTER N |
394 | |
395 | =back |
396 | |
05ef2f67 |
397 | This counterintuitive behavior has been fixed in perl 5.8.1. |
151b5d36 |
398 | |
4b291ae6 |
399 | =head3 workaround to tr///; |
400 | |
151b5d36 |
401 | In perl 5.8.0, you can work aroud as follows; |
4b291ae6 |
402 | |
403 | use encoding 'euc-jp'; |
151b5d36 |
404 | # .... |
4b291ae6 |
405 | eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; |
406 | |
407 | Note the C<tr//> expression is surronded by C<qq{}>. The idea behind |
408 | is the same as classic idiom that makes C<tr///> 'interpolate'. |
409 | |
410 | tr/$from/$to/; # wrong! |
411 | eval qq{ tr/$from/$to/ }; # workaround. |
412 | |
413 | Nevertheless, in case of B<encoding> pragma even C<q//> is affected so |
414 | C<tr///> not being decoded was obviously against the will of Perl5 |
05ef2f67 |
415 | Porters so it has been fixed in Perl 5.8.1 or later. |
aae85ceb |
416 | |
3ef515df |
417 | =head1 EXAMPLE - Greekperl |
418 | |
419 | use encoding "iso 8859-7"; |
420 | |
0ab8f81e |
421 | # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. |
3ef515df |
422 | |
423 | $a = "\xDF"; |
424 | $b = "\x{100}"; |
425 | |
426 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf |
427 | |
428 | $c = $a . $b; |
429 | |
430 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". |
431 | |
432 | # chr() is affected, and ... |
433 | |
434 | print "mega\n" if ord(chr(0xdf)) == 0x3af; |
435 | |
436 | # ... ord() is affected by the encoding pragma ... |
437 | |
438 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; |
439 | |
440 | # ... as are eq and cmp ... |
441 | |
442 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); |
443 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; |
444 | |
445 | # ... but pack/unpack C are not affected, in case you still |
0ab8f81e |
446 | # want to go back to your native encoding |
3ef515df |
447 | |
448 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; |
449 | |
450 | =head1 KNOWN PROBLEMS |
451 | |
151b5d36 |
452 | =over |
453 | |
0f29a567 |
454 | =item literals in regex that are longer than 127 bytes |
151b5d36 |
455 | |
0ab8f81e |
456 | For native multibyte encodings (either fixed or variable length), |
3ef515df |
457 | the current implementation of the regular expressions may introduce |
0ab8f81e |
458 | recoding errors for regular expression literals longer than 127 bytes. |
3ef515df |
459 | |
05ef2f67 |
460 | =item EBCDIC |
151b5d36 |
461 | |
3ef515df |
462 | The encoding pragma is not supported on EBCDIC platforms. |
0ab8f81e |
463 | (Porters who are willing and able to remove this limitation are |
464 | welcome.) |
3ef515df |
465 | |
05ef2f67 |
466 | =item format |
467 | |
468 | This pragma doesn't work well with format because PerlIO does not |
469 | get along very well with it. When format contains non-ascii |
470 | characters it prints funny or gets "wide character warnings". |
471 | To understand it, try the code below. |
472 | |
473 | # Save this one in utf8 |
474 | # replace *non-ascii* with a non-ascii string |
475 | my $camel; |
476 | format STDOUT = |
477 | *non-ascii*@>>>>>>> |
478 | $camel |
479 | . |
480 | $camel = "*non-ascii*"; |
481 | binmode(STDOUT=>':encoding(utf8)'); # bang! |
482 | write; # funny |
483 | print $camel, "\n"; # fine |
484 | |
485 | Without binmode this happens to work but without binmode, print() |
486 | fails instead of write(). |
487 | |
488 | At any rate, the very use of format is questionable when it comes to |
489 | unicode characters since you have to consider such things as character |
490 | width (i.e. double-width for ideographs) and directions (i.e. BIDI for |
491 | Arabic and Hebrew). |
492 | |
151b5d36 |
493 | =back |
494 | |
05ef2f67 |
495 | =head1 HISTORY |
496 | |
497 | This pragma first appeared in Perl 5.8.0. For features that require |
498 | 5.8.1 and better, see above. |
499 | |
3ef515df |
500 | =head1 SEE ALSO |
501 | |
aae85ceb |
502 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, |
503 | |
504 | Ch. 15 of C<Programming Perl (3rd Edition)> |
505 | by Larry Wall, Tom Christiansen, Jon Orwant; |
506 | O'Reilly & Associates; ISBN 0-596-00027-8 |
3ef515df |
507 | |
508 | =cut |