Commit | Line | Data |
d1256cb1 |
1 | # $Id: encoding.pm,v 2.3 2006/05/03 18:24:10 dankogai Exp $ |
3ef515df |
2 | package encoding; |
d1256cb1 |
3 | our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
3ef515df |
4 | |
5 | use Encode; |
046f36bf |
6 | use strict; |
b1aeb384 |
7 | |
8f139f4c |
# Debug switch.  The empty prototype plus constant body makes this an
# inlinable constant sub, so "DEBUG and ..." statements are cheap when off.
sub DEBUG () { 0 }

# Refuse to load on EBCDIC platforms, where "A" has code point 193.
BEGIN {
    my $is_ebcdic = ( ord("A") == 193 );
    if ($is_ebcdic) {
        require Carp;
        Carp::croak("encoding: pragma does not support EBCDIC platforms");
    }
}

# True when PerlIO::encoding (version 0.02 or later) could be loaded;
# import() and unimport() consult this before touching STDIN/STDOUT layers.
our $HAS_PERLIO = 0;
eval { require PerlIO::encoding };
if ( !$@ ) {
    $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
}
b2704119 |
22 | |
d1256cb1 |
# Tell import() whether setting ${^ENCODING} must be skipped for the
# encoding called $name.
#
# Returns 1 only on perl 5.8.0 (not a patched maintenance build) when the
# name is one of the UTF/UCS encodings listed below; returns 0 otherwise.
sub _exception {
    my ($encoding_name) = @_;

    # 5.8.1 or higher: never an exception.
    return 0 if $] > 5.008;

    # Only the UTF/UCS family is affected.
    my @unicode_formats = qw(
        utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
        UTF-32 UTF-32BE UTF-32LE
    );
    return 0 unless grep { $_ eq $encoding_name } @unicode_formats;

    require Config;
    Config->import();
    our %Config;

    # A set perl_patchlevel marks a maintperl build, which is not an exception.
    return 0 if $Config{perl_patchlevel};
    return 1;
}
fa6f41cf |
35 | |
d1256cb1 |
# Returns a non-zero value when the hint bits published by locale.pm
# ($locale::hint_bits) are set in $^H, and 0 otherwise (including when
# locale.pm has not been loaded and the variable is unset).
sub in_locale {
    my $locale_hint_bits = $locale::hint_bits || 0;
    return $^H & $locale_hint_bits;
}
b1aeb384 |
37 | |
# Work out the character encoding implied by the current locale.
#
# Resolution order:
#   1. langinfo(CODESET) from I18N::Langinfo, when that module can be loaded.
#   2. If step 1 produced nothing and in_locale() is true, the part after
#      the dot in $ENV{LC_ALL}, then in $ENV{LANG}.
#   3. If step 1 produced nothing and in_locale() is false, 'utf8' when
#      LC_ALL or LANG mentions "utf8" / "utf-8" (case-insensitive).
#
# A bare "euc" codeset obtained in step 2 is refined to euc-jp / euc-kr /
# euc-cn / euc-tw from the language/country part; if that part does not
# identify one of those, the sub croaks.
#
# Returns the encoding name, or a false value (undef or the empty string)
# when nothing could be determined.
sub _get_locale_encoding {
    my $locale_encoding;

    # I18N::Langinfo isn't available everywhere
    eval {
        require I18N::Langinfo;
        I18N::Langinfo->import(qw(langinfo CODESET));
        $locale_encoding = langinfo( CODESET() );
    };

    my $country_language;

    no warnings 'uninitialized';

    # The negation must be parenthesised: "not" binds looser than "&&", so
    # "not $locale_encoding && in_locale()" negated the whole conjunction.
    # That made this branch run even when langinfo had already supplied a
    # codeset (overriding it from the environment) and left the UTF-8
    # heuristic below unreachable.
    if ( ( not $locale_encoding ) && in_locale() ) {
        if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/ ) {
            ( $country_language, $locale_encoding ) = ( $1, $2 );
        }
        elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.]+)$/ ) {
            ( $country_language, $locale_encoding ) = ( $1, $2 );
        }

        # LANGUAGE affects only LC_MESSAGES only on glibc
    }
    elsif ( not $locale_encoding ) {
        if (   $ENV{LC_ALL} =~ /\butf-?8\b/i
            || $ENV{LANG} =~ /\butf-?8\b/i )
        {
            $locale_encoding = 'utf8';
        }

        # Could do more heuristics based on the country and language
        # parts of LC_ALL and LANG (the parts before the dot (if any)),
        # since we have Locale::Country and Locale::Language available.
        # TODO: get a database of Language -> Encoding mappings
        # (the Estonian database at http://www.eki.ee/letter/
        # would be excellent!) --jhi
    }

    # A bare "euc" is ambiguous; refine it from the language/country part.
    # NOTE(review): in each pattern below the alternation binds looser than
    # the anchors, so e.g. /^ja_JP|japan(?:ese)?$/ means "starts with ja_JP"
    # OR "ends with japan/japanese".  Left unchanged to preserve matching of
    # values such as "ja_JP@modifier" -- confirm whether that is intended.
    if (   defined $locale_encoding
        && lc($locale_encoding) eq 'euc'
        && defined $country_language )
    {
        if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
            $locale_encoding = 'euc-jp';
        }
        elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
            $locale_encoding = 'euc-kr';
        }
        elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
            $locale_encoding = 'euc-cn';
        }
        elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
            $locale_encoding = 'euc-tw';
        }
        else {
            require Carp;
            Carp::croak(
                "encoding: Locale encoding '$locale_encoding' too ambiguous"
            );
        }
    }

    return $locale_encoding;
}
102 | |
3ef515df |
# Implements "use encoding NAME [, STDIN => ENC, STDOUT => ENC, Filter => 1]".
#
# Arguments (after the class name):
#   $name  - encoding name, ':locale' to derive it from the locale, or the
#            private token ':_get_locale_encoding', which only exports
#            _get_locale_encoding() into the caller's package and returns.
#            When undefined, $ENV{PERL_ENCODING} is used.
#   %arg   - optional STDIN / STDOUT encodings and the Filter flag.
#
# Effects:
#   * Without Filter: sets ${^ENCODING} (unless _exception() says not to);
#     returns early when PerlIO::encoding is unavailable.
#   * With Filter: clears ${^ENCODING}, turns on the utf8 hint bits and
#     tries to install a source filter that decodes the source text.
#     Failure to install the filter is deliberately not fatal.
#   * Unless ${^UNICODE} is set and non-zero, pushes
#     ":raw :encoding(...)" onto STDIN and STDOUT.  A handle whose key
#     exists in %arg with a false value is left untouched.
#
# Croaks on an unknown encoding name or when setting a layer dies.
# Returns 1 (or nothing for the ':_get_locale_encoding' form).
sub import {
    my $class = shift;
    my $name  = shift;
    if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
        my $caller = caller();
        {
            no strict 'refs';
            *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
        }
        return;
    }
    $name = _get_locale_encoding() if $name eq ':locale';
    my %arg = @_;
    $name = $ENV{PERL_ENCODING} unless defined $name;
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("encoding: Unknown encoding '$name'");
    }
    $name = $enc->name;    # canonize
    unless ( $arg{Filter} ) {
        DEBUG and warn "_exception($name) = ", _exception($name);
        _exception($name) or ${^ENCODING} = $enc;
        $HAS_PERLIO or return 1;
    }
    else {
        defined( ${^ENCODING} ) and undef ${^ENCODING};

        # implicitly 'use utf8'
        require utf8;    # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;

        # Best effort: if Filter::Util::Call cannot be loaded the filter is
        # simply not installed.
        eval {
            require Filter::Util::Call;
            Filter::Util::Call->import;
            filter_add(
                sub {
                    my $status = filter_read();
                    if ( $status > 0 ) {
                        $_ = $enc->decode( $_, 1 );
                        DEBUG and warn $_;
                    }
                    $status;
                }
            );
        };
        $@ eq '' and DEBUG and warn "Filter installed";
    }
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
    for my $h (qw(STDIN STDOUT)) {
        my $layers;
        if ( $arg{$h} ) {
            unless ( defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak(
                    "encoding: Unknown encoding for $h, '$arg{$h}'");
            }
            $layers = ":raw :encoding($arg{$h})";
        }
        elsif ( !exists $arg{$h} ) {
            $layers = ":raw :encoding($name)";
        }

        # Key present but false (e.g. STDIN => undef): leave the handle alone.
        next unless defined $layers;

        # Judge success by the eval's own return value rather than by the
        # global $@.  $@ may still hold an error from the Filter eval above
        # (or from any earlier code), which previously caused a spurious
        # croak on iterations where no eval ran at all.
        my $ok = eval { binmode( $h, $layers ); 1 };
        unless ($ok) {
            my $err = $@ || "encoding: could not set layers on $h";
            require Carp;
            Carp::croak($err);
        }
    }
    return 1;    # I doubt if we need it, though
}
175 | |
d1256cb1 |
# Implements "no encoding": clears ${^ENCODING}, resets STDIN and STDOUT
# (to ":raw" when PerlIO::encoding is available, plain binmode otherwise)
# and, if Filter::Util::Call has been loaded, attempts to remove the
# source filter.  Errors from filter_del() are ignored.
sub unimport {
    no warnings;
    undef ${^ENCODING};
    for my $handle ( \*STDIN, \*STDOUT ) {
        if ($HAS_PERLIO) {
            binmode( $handle, ":raw" );
        }
        else {
            binmode($handle);
        }
    }
    if ( $INC{"Filter/Util/Call.pm"} ) {
        eval { filter_del() };
    }
}
191 | |
192 | 1; |
193 | __END__ |
85982a32 |
194 | |
3ef515df |
195 | =pod |
196 | |
197 | =head1 NAME |
198 | |
0ab8f81e |
199 | encoding - allows you to write your script in non-ascii or non-utf8 |
3ef515df |
200 | |
201 | =head1 SYNOPSIS |
202 | |
962111ca |
203 | use encoding "greek"; # Perl like Greek to you? |
3ef515df |
204 | use encoding "euc-jp"; # Jperl! |
205 | |
962111ca |
206 | # or you can even do this if your shell supports your native encoding |
3ef515df |
207 | |
962111ca |
208 | perl -Mencoding=latin2 -e '...' # Feeling centrally European? |
0ab8f81e |
209 | perl -Mencoding=euc-kr -e '...' # Or Korean? |
3ef515df |
210 | |
3ef515df |
211 | # more control |
212 | |
962111ca |
213 | # A simple euc-cn => utf-8 converter |
6d1c0808 |
214 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; |
3ef515df |
215 | |
216 | # "no encoding;" supported (but not scoped!) |
217 | no encoding; |
218 | |
aae85ceb |
219 | # an alternate way, Filter |
220 | use encoding "euc-jp", Filter=>1; |
aae85ceb |
221 | # now you can use kanji identifiers -- in euc-jp! |
222 | |
b1aeb384 |
223 | # switch on locale - |
224 | # note that this probably means that unless you have a complete control |
225 | # over the environments the application is ever going to be run, you should |
226 | # NOT use the feature of encoding pragma allowing you to write your script |
227 | # in any recognized encoding because changing locale settings will wreck |
228 | # the script; you can of course still use the other features of the pragma. |
229 | use encoding ':locale'; |
230 | |
3ef515df |
231 | =head1 ABSTRACT |
232 | |
962111ca |
233 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode |
234 | support. You could apply C<substr()> and regexes even to complex CJK |
235 | characters -- so long as the script was written in UTF-8. But back |
0ab8f81e |
236 | then, text editors that supported UTF-8 were still rare and many users |
237 | instead chose to write scripts in legacy encodings, giving up a whole |
238 | new feature of Perl 5.6. |
3ef515df |
239 | |
0ab8f81e |
240 | Rewind to the future: starting from perl 5.8.0 with the B<encoding> |
962111ca |
241 | pragma, you can write your script in any encoding you like (so long |
242 | as the C<Encode> module supports it) and still enjoy Unicode support. |
0f29a567 |
243 | This pragma achieves that by doing the following: |
05ef2f67 |
244 | |
245 | =over |
246 | |
247 | =item * |
248 | |
249 | Internally converts all literals (C<q//, qq//, qr//, qw//, qx//>) from |
250 | the encoding specified to utf8. In Perl 5.8.1 and later, literals in |
251 | C<tr///> and C<DATA> pseudo-filehandle are also converted. |
252 | |
253 | =item * |
254 | |
255 | Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding |
256 | specified. |
257 | |
258 | =back |
259 | |
260 | =head2 Literal Conversions |
261 | |
0ab8f81e |
262 | You can write code in EUC-JP as follows: |
3ef515df |
263 | |
264 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
265 | #<-char-><-char-> # 4 octets |
266 | s/\bCamel\b/$Rakuda/; |
267 | |
268 | And with C<use encoding "euc-jp"> in effect, it is the same thing as |
962111ca |
269 | the code in UTF-8: |
3ef515df |
270 | |
32b9ed1f |
271 | my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters |
3ef515df |
272 | s/\bCamel\b/$Rakuda/; |
273 | |
05ef2f67 |
274 | =head2 PerlIO layers for C<STD(IN|OUT)> |
275 | |
276 | The B<encoding> pragma also modifies the filehandle layers of |
4b291ae6 |
277 | STDIN and STDOUT to the specified encoding. Therefore, |
3ef515df |
278 | |
279 | use encoding "euc-jp"; |
280 | my $message = "Camel is the symbol of perl.\n"; |
281 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
282 | $message =~ s/\bCamel\b/$Rakuda/; |
283 | print $message; |
284 | |
962111ca |
285 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", |
286 | not "\x{99F1}\x{99DD} is the symbol of perl.\n". |
3ef515df |
287 | |
0ab8f81e |
288 | You can override this by giving extra arguments; see below. |
3ef515df |
289 | |
990e18f7 |
290 | =head2 Implicit upgrading for byte strings |
291 | |
292 | By default, if strings operating under byte semantics and strings |
293 | with Unicode character data are concatenated, the new string will |
294 | be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. |
295 | |
296 | The B<encoding> pragma changes this to use the specified encoding |
297 | instead. For example: |
298 | |
299 | use encoding 'utf8'; |
300 | my $string = chr(20000); # a Unicode string |
301 | utf8::encode($string); # now it's a UTF-8 encoded byte string |
302 | # concatenate with another Unicode string |
303 | print length($string . chr(20000)); |
304 | |
305 | Will print C<2>, because C<$string> is upgraded as UTF-8. Without |
306 | C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> |
307 | is three octets when interpreted as Latin-1. |
308 | |
05ef2f67 |
309 | =head1 FEATURES THAT REQUIRE 5.8.1 |
310 | |
311 | Some of the features offered by this pragma require perl 5.8.1. Most |
0f29a567 |
312 | of these are done by Inaba Hiroto. Any other features and changes |
05ef2f67 |
313 | are good for 5.8.0. |
314 | |
315 | =over |
316 | |
317 | =item "NON-EUC" doublebyte encodings |
318 | |
0f29a567 |
319 | Because perl needs to parse script before applying this pragma, such |
05ef2f67 |
320 | encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; |
321 | \x5c) in the second byte fail because the second byte may |
0f29a567 |
322 | accidentally escape the quoting character that follows. Perl 5.8.1 |
05ef2f67 |
323 | or later fixes this problem. |
324 | |
325 | =item tr// |
326 | |
327 | C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0. |
328 | See the section below for details. |
329 | |
330 | =item DATA pseudo-filehandle |
331 | |
332 | Another feature that was overlooked was C<DATA>. |
333 | |
334 | =back |
335 | |
3ef515df |
336 | =head1 USAGE |
337 | |
338 | =over 4 |
339 | |
340 | =item use encoding [I<ENCNAME>] ; |
341 | |
05ef2f67 |
342 | Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} |
343 | exists and is non-zero, PerlIO layers of STDIN and STDOUT are set to |
344 | ":encoding(I<ENCNAME>)". |
345 | |
346 | Note that STDERR WILL NOT be changed. |
347 | |
348 | Also note that non-STD file handles remain unaffected. Use C<use |
349 | open> or C<binmode> to change layers of those. |
3ef515df |
350 | |
351 | If no encoding is specified, the environment variable C<PERL_ENCODING> |
962111ca |
352 | is consulted. If no encoding can be found, the error C<Unknown encoding |
353 | 'I<ENCNAME>'> will be thrown. |
3ef515df |
354 | |
aae85ceb |
355 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; |
3ef515df |
356 | |
0ab8f81e |
357 | You can also individually set encodings of STDIN and STDOUT via the |
32b9ed1f |
358 | C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the |
359 | first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding |
aae85ceb |
360 | completely off. |
3ef515df |
361 | |
05ef2f67 |
362 | When ${^UNICODE} exists and is non-zero, these options will be completely |
363 | ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See |
364 | L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for |
365 | details (perl 5.8.1 and later). |
366 | |
151b5d36 |
367 | =item use encoding I<ENCNAME> Filter=E<gt>1; |
368 | |
369 | This turns the encoding pragma into a source filter. While the |
370 | default approach just decodes interpolated literals (in qq() and |
371 | qr()), this will apply a source filter to the entire source code. See |
05ef2f67 |
372 | L</"The Filter Option"> below for details. |
151b5d36 |
373 | |
3ef515df |
374 | =item no encoding; |
375 | |
05ef2f67 |
376 | Unsets the script encoding. The layers of STDIN, STDOUT are |
962111ca |
377 | reset to ":raw" (the default unprocessed raw stream of bytes). |
3ef515df |
378 | |
379 | =back |
380 | |
151b5d36 |
381 | =head1 The Filter Option |
382 | |
383 | The magic of C<use encoding> is not applied to the names of |
384 | identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human |
385 | is a single Han ideograph) work, you still need to write your script |
386 | in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. |
387 | |
151b5d36 |
388 | What does this mean? Your source code behaves as if it is written in |
389 | UTF-8 with 'use utf8' in effect. So even if your editor only supports |
390 | Shift_JIS, for example, you can still try examples in Chapter 15 of |
391 | C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 |
392 | identifiers. |
393 | |
394 | This option is significantly slower and (as of this writing) non-ASCII |
395 | identifiers are not very stable WITHOUT this option and with the |
396 | source code written in UTF-8. |
397 | |
398 | =head2 Filter-related changes at Encode version 1.87 |
399 | |
400 | =over |
401 | |
402 | =item * |
403 | |
404 | The Filter option now sets STDIN and STDOUT like non-filter options. |
405 | And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like |
406 | non-filter version. |
407 | |
408 | =item * |
409 | |
410 | C<use utf8> is implicitly declared so you no longer have to C<use |
411 | utf8> to C<${"\x{4eba}"}++>. |
412 | |
413 | =back |
414 | |
3ef515df |
415 | =head1 CAVEATS |
416 | |
417 | =head2 NOT SCOPED |
418 | |
419 | The pragma is a per script, not a per block lexical. Only the last |
621b0f8d |
420 | C<use encoding> or C<no encoding> matters, and it affects |
421 | B<the whole script>. However, the C<no encoding> pragma is supported and |
422 | B<use encoding> can appear as many times as you want in a given script. |
423 | The multiple use of this pragma is discouraged. |
424 | |
0f29a567 |
425 | For the same reason, the use of this pragma inside modules is also |
3c4b39be |
426 | discouraged (though not as strongly discouraged as the case above. |
0f29a567 |
427 | See below). |
05ef2f67 |
428 | |
429 | If you still have to write a module with this pragma, be very careful |
430 | of the load order. See the code below: |
431 | |
432 | # called module |
433 | package Module_IN_BAR; |
434 | use encoding "bar"; |
435 | # stuff in "bar" encoding here |
436 | 1; |
437 | |
438 | # caller script |
439 | use encoding "foo"; |
440 | use Module_IN_BAR; |
441 | # surprise! use encoding "bar" is in effect. |
442 | |
443 | The best way to avoid this oddity is to use this pragma RIGHT AFTER |
444 | other modules are loaded. i.e. |
445 | |
446 | use Module_IN_BAR; |
447 | use encoding "foo"; |
3ef515df |
448 | |
449 | =head2 DO NOT MIX MULTIPLE ENCODINGS |
450 | |
451 | Notice that only literals (string or regular expression) having only |
452 | legacy code points are affected: if you mix data like this |
453 | |
d1256cb1 |
454 | \xDF\x{100} |
3ef515df |
455 | |
456 | the data is assumed to be in (Latin 1 and) Unicode, not in your native |
457 | encoding. In other words, this will match in "greek": |
458 | |
d1256cb1 |
459 | "\xDF" =~ /\x{3af}/ |
3ef515df |
460 | |
461 | but this will not |
462 | |
d1256cb1 |
463 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ |
3ef515df |
464 | |
962111ca |
465 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on |
466 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL |
467 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You |
468 | should not be mixing your legacy data and Unicode in the same string. |
3ef515df |
469 | |
470 | This pragma also affects encoding of the 0x80..0xFF code point range: |
471 | normally characters in that range are left as eight-bit bytes (unless |
472 | they are combined with characters with code points 0x100 or larger, |
473 | in which case all characters need to become UTF-8 encoded), but if |
474 | the C<encoding> pragma is present, even the 0x80..0xFF range always |
475 | gets UTF-8 encoded. |
476 | |
477 | After all, the best thing about this pragma is that you don't have to |
0ab8f81e |
478 | resort to \x{....} just to spell your name in a native encoding. |
479 | So feel free to put your strings in your encoding in quotes and |
480 | regexes. |
3ef515df |
481 | |
151b5d36 |
482 | =head2 tr/// with ranges |
4b291ae6 |
483 | |
484 | The B<encoding> pragma works by decoding string literals in |
151b5d36 |
485 | C<q//, qq//, qr//, qw//, qx//> and so forth. In perl 5.8.0, this |
4b291ae6 |
486 | does not apply to C<tr///>. Therefore, |
487 | |
488 | use encoding 'euc-jp'; |
489 | #.... |
490 | $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; |
491 | # -------- -------- -------- -------- |
492 | |
493 | Does not work as |
494 | |
495 | $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; |
496 | |
497 | =over |
498 | |
499 | =item Legend of characters above |
500 | |
501 | utf8 euc-jp charnames::viacode() |
502 | ----------------------------------------- |
503 | \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A |
504 | \x{3093} \xA4\xF3 HIRAGANA LETTER N |
505 | \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A |
506 | \x{30f3} \xA5\xF3 KATAKANA LETTER N |
507 | |
508 | =back |
509 | |
05ef2f67 |
510 | This counterintuitive behavior has been fixed in perl 5.8.1. |
151b5d36 |
511 | |
4b291ae6 |
512 | =head3 workaround to tr///; |
513 | |
ce16148b |
514 | In perl 5.8.0, you can work around as follows; |
4b291ae6 |
515 | |
516 | use encoding 'euc-jp'; |
151b5d36 |
517 | # .... |
4b291ae6 |
518 | eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; |
519 | |
ce16148b |
520 | Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind |
4b291ae6 |
521 | it is the same as the classic idiom that makes C<tr///> 'interpolate'. |
522 | |
523 | tr/$from/$to/; # wrong! |
524 | eval qq{ tr/$from/$to/ }; # workaround. |
525 | |
526 | Nevertheless, in case of B<encoding> pragma even C<q//> is affected so |
527 | C<tr///> not being decoded was obviously against the will of Perl5 |
05ef2f67 |
528 | Porters so it has been fixed in Perl 5.8.1 or later. |
aae85ceb |
529 | |
3ef515df |
530 | =head1 EXAMPLE - Greekperl |
531 | |
532 | use encoding "iso 8859-7"; |
533 | |
0ab8f81e |
534 | # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. |
3ef515df |
535 | |
536 | $a = "\xDF"; |
537 | $b = "\x{100}"; |
538 | |
539 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf |
540 | |
541 | $c = $a . $b; |
542 | |
543 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". |
544 | |
545 | # chr() is affected, and ... |
546 | |
547 | print "mega\n" if ord(chr(0xdf)) == 0x3af; |
548 | |
549 | # ... ord() is affected by the encoding pragma ... |
550 | |
551 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; |
552 | |
553 | # ... as are eq and cmp ... |
554 | |
555 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); |
556 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; |
557 | |
558 | # ... but pack/unpack C are not affected, in case you still |
0ab8f81e |
559 | # want to go back to your native encoding |
3ef515df |
560 | |
561 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; |
562 | |
563 | =head1 KNOWN PROBLEMS |
564 | |
151b5d36 |
565 | =over |
566 | |
0f29a567 |
567 | =item literals in regex that are longer than 127 bytes |
151b5d36 |
568 | |
0ab8f81e |
569 | For native multibyte encodings (either fixed or variable length), |
3ef515df |
570 | the current implementation of the regular expressions may introduce |
0ab8f81e |
571 | recoding errors for regular expression literals longer than 127 bytes. |
3ef515df |
572 | |
05ef2f67 |
573 | =item EBCDIC |
151b5d36 |
574 | |
3ef515df |
575 | The encoding pragma is not supported on EBCDIC platforms. |
0ab8f81e |
576 | (Porters who are willing and able to remove this limitation are |
577 | welcome.) |
3ef515df |
578 | |
05ef2f67 |
579 | =item format |
580 | |
581 | This pragma doesn't work well with format because PerlIO does not |
582 | get along very well with it. When format contains non-ascii |
583 | characters it prints funny or gets "wide character warnings". |
584 | To understand it, try the code below. |
585 | |
586 | # Save this one in utf8 |
587 | # replace *non-ascii* with a non-ascii string |
588 | my $camel; |
589 | format STDOUT = |
590 | *non-ascii*@>>>>>>> |
591 | $camel |
592 | . |
593 | $camel = "*non-ascii*"; |
594 | binmode(STDOUT=>':encoding(utf8)'); # bang! |
595 | write; # funny |
596 | print $camel, "\n"; # fine |
597 | |
598 | Without binmode this happens to work but without binmode, print() |
599 | fails instead of write(). |
600 | |
601 | At any rate, the very use of format is questionable when it comes to |
602 | unicode characters since you have to consider such things as character |
603 | width (i.e. double-width for ideographs) and directions (i.e. BIDI for |
604 | Arabic and Hebrew). |
605 | |
151b5d36 |
606 | =back |
607 | |
b1aeb384 |
608 | =head2 The Logic of :locale |
609 | |
610 | The logic of C<:locale> is as follows: |
611 | |
612 | =over 4 |
613 | |
614 | =item 1. |
615 | |
616 | If the platform supports the langinfo(CODESET) interface, the codeset |
617 | returned is used as the default encoding for the open pragma. |
618 | |
619 | =item 2. |
620 | |
621 | If 1. didn't work but we are under the locale pragma, the environment |
622 | variables LC_ALL and LANG (in that order) are matched for encodings |
623 | (the part after C<.>, if any), and if any found, that is used |
624 | as the default encoding for the open pragma. |
625 | |
626 | =item 3. |
627 | |
628 | If 1. and 2. didn't work, the environment variables LC_ALL and LANG |
629 | (in that order) are matched for anything looking like UTF-8, and if |
630 | any found, C<:utf8> is used as the default encoding for the open |
631 | pragma. |
632 | |
633 | =back |
634 | |
635 | If your locale environment variables (LC_ALL, LC_CTYPE, LANG) |
636 | contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), |
637 | the default encoding of your STDIN, STDOUT, and STDERR, and of |
638 | B<any subsequent file open>, is UTF-8. |
639 | |
05ef2f67 |
640 | =head1 HISTORY |
641 | |
642 | This pragma first appeared in Perl 5.8.0. For features that require |
643 | 5.8.1 and better, see above. |
644 | |
b1aeb384 |
645 | The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. |
646 | |
3ef515df |
647 | =head1 SEE ALSO |
648 | |
aae85ceb |
649 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, |
650 | |
651 | Ch. 15 of C<Programming Perl (3rd Edition)> |
652 | by Larry Wall, Tom Christiansen, Jon Orwant; |
653 | O'Reilly & Associates; ISBN 0-596-00027-8 |
3ef515df |
654 | |
655 | =cut |