Upgrade to Encode 1.91.
[p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm
CommitLineData
0f29a567 1# $Id: encoding.pm,v 1.44 2003/03/09 20:07:37 dankogai Exp dankogai $
3ef515df 2package encoding;
0f29a567 3our $VERSION = do { my @r = (q$Revision: 1.44 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df 4
5use Encode;
046f36bf 6use strict;
151b5d36 7our $DEBUG = 0;
3ef515df 8
9BEGIN {
10 if (ord("A") == 193) {
11 require Carp;
10c5ecbb 12 Carp::croak("encoding pragma does not support EBCDIC platforms");
3ef515df 13 }
14}
15
0ab8f81e 16our $HAS_PERLIO = 0;
17eval { require PerlIO::encoding };
18unless ($@){
19 $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
20}
b2704119 21
151b5d36 22sub _exception{
23 my $name = shift;
24 $] > 5.008 and return 0; # 5.8.1 then no
25 my %utfs = map {$_=>1}
26 qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
27 UTF-32 UTF-32BE UTF-32LE);
28 $utfs{$name} or return 0; # UTFs or no
29 require Config; Config->import(); our %Config;
30 return $Config{perl_patchlevel} == 0 # maintperl then no
31}
fa6f41cf 32
3ef515df 33sub import {
34 my $class = shift;
35 my $name = shift;
36 my %arg = @_;
37 $name ||= $ENV{PERL_ENCODING};
3ef515df 38 my $enc = find_encoding($name);
39 unless (defined $enc) {
40 require Carp;
10c5ecbb 41 Carp::croak("Unknown encoding '$name'");
3ef515df 42 }
fa6f41cf 43 $name = $enc->name; # canonize
0f7c507f 44 unless ($arg{Filter}) {
151b5d36 45 $DEBUG and warn "_exception($name) = ", _exception($name);
46 _exception($name) or ${^ENCODING} = $enc;
85982a32 47 $HAS_PERLIO or return 1;
aae85ceb 48 }else{
49 defined(${^ENCODING}) and undef ${^ENCODING};
151b5d36 50 # implicitly 'use utf8'
51 require utf8; # to fetch $utf8::hint_bits;
52 $^H |= $utf8::hint_bits;
aae85ceb 53 eval {
54 require Filter::Util::Call ;
55 Filter::Util::Call->import ;
aae85ceb 56 filter_add(sub{
151b5d36 57 my $status = filter_read();
58 if ($status > 0){
59 # $DEBUG and warn $_;
aae85ceb 60 $_ = $enc->decode($_, 1);
151b5d36 61 $DEBUG and warn $_;
aae85ceb 62 }
63 $status ;
64 });
65 };
6be7c101 66 } $DEBUG and warn "Filter installed";
05ef2f67 67 defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
151b5d36 68 for my $h (qw(STDIN STDOUT)){
69 if ($arg{$h}){
70 unless (defined find_encoding($arg{$h})) {
71 require Carp;
72 Carp::croak("Unknown encoding for $h, '$arg{$h}'");
73 }
6be7c101 74 eval { binmode($h, ":raw :encoding($arg{$h})") };
151b5d36 75 }else{
76 unless (exists $arg{$h}){
77 eval {
78 no warnings 'uninitialized';
6be7c101 79 binmode($h, ":raw :encoding($name)");
151b5d36 80 };
81 }
82 }
83 if ($@){
84 require Carp;
85 Carp::croak($@);
86 }
3ef515df 87 }
88 return 1; # I doubt if we need it, though
89}
90
91sub unimport{
92 no warnings;
93 undef ${^ENCODING};
621b0f8d 94 if ($HAS_PERLIO){
95 binmode(STDIN, ":raw");
96 binmode(STDOUT, ":raw");
97 }else{
6be7c101 98 binmode(STDIN);
99 binmode(STDOUT);
621b0f8d 100 }
aae85ceb 101 if ($INC{"Filter/Util/Call.pm"}){
102 eval { filter_del() };
103 }
3ef515df 104}
105
1061;
107__END__
85982a32 108
3ef515df 109=pod
110
111=head1 NAME
112
0ab8f81e 113encoding - allows you to write your script in non-ascii or non-utf8
3ef515df 114
115=head1 SYNOPSIS
116
962111ca 117 use encoding "greek"; # Perl like Greek to you?
3ef515df 118 use encoding "euc-jp"; # Jperl!
119
962111ca 120 # or you can even do this if your shell supports your native encoding
3ef515df 121
962111ca 122 perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e 123 perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df 124
3ef515df 125 # more control
126
962111ca 127 # A simple euc-cn => utf-8 converter
6d1c0808 128 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df 129
130 # "no encoding;" supported (but not scoped!)
131 no encoding;
132
aae85ceb 133 # an alternate way, Filter
134 use encoding "euc-jp", Filter=>1;
aae85ceb 135 # now you can use kanji identifiers -- in euc-jp!
136
3ef515df 137=head1 ABSTRACT
138
962111ca 139Let's start with a bit of history: Perl 5.6.0 introduced Unicode
140support. You could apply C<substr()> and regexes even to complex CJK
141characters -- so long as the script was written in UTF-8. But back
0ab8f81e 142then, text editors that supported UTF-8 were still rare and many users
143instead chose to write scripts in legacy encodings, giving up a whole
144new feature of Perl 5.6.
3ef515df 145
0ab8f81e 146Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca 147pragma, you can write your script in any encoding you like (so long
148as the C<Encode> module supports it) and still enjoy Unicode support.
0f29a567 149This pragma achieves that by doing the following:
05ef2f67 150
151=over
152
153=item *
154
155Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
156the encoding specified to utf8. In Perl 5.8.1 and later, literals in
157C<tr///> and C<DATA> pseudo-filehandle are also converted.
158
159=item *
160
161Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
162 specified.
163
164=back
165
166=head2 Literal Conversions
167
0ab8f81e 168You can write code in EUC-JP as follows:
3ef515df 169
170 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
171 #<-char-><-char-> # 4 octets
172 s/\bCamel\b/$Rakuda/;
173
174And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca 175the code in UTF-8:
3ef515df 176
32b9ed1f 177 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df 178 s/\bCamel\b/$Rakuda/;
179
05ef2f67 180=head2 PerlIO layers for C<STD(IN|OUT)>
181
182The B<encoding> pragma also modifies the filehandle layers of
4b291ae6 183STDIN and STDOUT to the specified encoding. Therefore,
3ef515df 184
185 use encoding "euc-jp";
186 my $message = "Camel is the symbol of perl.\n";
187 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
188 $message =~ s/\bCamel\b/$Rakuda/;
189 print $message;
190
962111ca 191Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
192not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df 193
0ab8f81e 194You can override this by giving extra arguments; see below.
3ef515df 195
05ef2f67 196=head1 FEATURES THAT REQUIRE 5.8.1
197
198Some of the features offered by this pragma requires perl 5.8.1. Most
0f29a567 199of these are done by Inaba Hiroto. Any other features and changes
05ef2f67 200are good for 5.8.0.
201
202=over
203
204=item "NON-EUC" doublebyte encodings
205
0f29a567 206Because perl needs to parse script before applying this pragma, such
05ef2f67 207encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
208\x5c) in the second byte fails because the second byte may
0f29a567 209accidentally escape the quoting character that follows. Perl 5.8.1
05ef2f67 210or later fixes this problem.
211
212=item tr//
213
214C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
215See the section below for details.
216
217=item DATA pseudo-filehandle
218
219Another feature that was overlooked was C<DATA>.
220
221=back
222
3ef515df 223=head1 USAGE
224
225=over 4
226
227=item use encoding [I<ENCNAME>] ;
228
05ef2f67 229Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
230exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
231":encoding(I<ENCNAME>)".
232
233Note that STDERR WILL NOT be changed.
234
235Also note that non-STD file handles remain unaffected. Use C<use
236open> or C<binmode> to change layers of those.
3ef515df 237
238If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca 239is consulted. If no encoding can be found, the error C<Unknown encoding
240'I<ENCNAME>'> will be thrown.
3ef515df 241
aae85ceb 242=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df 243
0ab8f81e 244You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f 245C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
246first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb 247completely off.
3ef515df 248
05ef2f67 249When ${^UNICODE} exists and non-zero, these options will completely
250ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See
251L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
252details (perl 5.8.1 and later).
253
151b5d36 254=item use encoding I<ENCNAME> Filter=E<gt>1;
255
256This turns the encoding pragma into a source filter. While the
257default approach just decodes interpolated literals (in qq() and
258qr()), this will apply a source filter to the entire source code. See
05ef2f67 259L</"The Filter Option"> below for details.
151b5d36 260
3ef515df 261=item no encoding;
262
05ef2f67 263Unsets the script encoding. The layers of STDIN, STDOUT are
962111ca 264reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df 265
266=back
267
151b5d36 268=head1 The Filter Option
269
270The magic of C<use encoding> is not applied to the names of
271identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
272is a single Han ideograph) work, you still need to write your script
273in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
274
151b5d36 275What does this mean? Your source code behaves as if it is written in
276UTF-8 with 'use utf8' in effect. So even if your editor only supports
277Shift_JIS, for example, you can still try examples in Chapter 15 of
278C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
279identifiers.
280
281This option is significantly slower and (as of this writing) non-ASCII
282identifiers are not very stable WITHOUT this option and with the
283source code written in UTF-8.
284
285=head2 Filter-related changes at Encode version 1.87
286
287=over
288
289=item *
290
291The Filter option now sets STDIN and STDOUT like non-filter options.
292And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
293non-filter version.
294
295=item *
296
297C<use utf8> is implicitly declared so you no longer have to C<use
298utf8> to C<${"\x{4eba}"}++>.
299
300=back
301
3ef515df 302=head1 CAVEATS
303
304=head2 NOT SCOPED
305
306The pragma is a per script, not a per block lexical. Only the last
621b0f8d 307C<use encoding> or C<no encoding> matters, and it affects
308B<the whole script>. However, the <no encoding> pragma is supported and
309B<use encoding> can appear as many times as you want in a given script.
310The multiple use of this pragma is discouraged.
311
0f29a567 312By the same reason, the use this pragma inside modules is also
313discouraged (though not as strongly discouranged as the case above.
314See below).
05ef2f67 315
316If you still have to write a module with this pragma, be very careful
317of the load order. See the codes below;
318
319 # called module
320 package Module_IN_BAR;
321 use encoding "bar";
322 # stuff in "bar" encoding here
323 1;
324
325 # caller script
326 use encoding "foo"
327 use Module_IN_BAR;
328 # surprise! use encoding "bar" is in effect.
329
330The best way to avoid this oddity is to use this pragma RIGHT AFTER
331other modules are loaded. i.e.
332
333 use Module_IN_BAR;
334 use encoding "foo";
3ef515df 335
336=head2 DO NOT MIX MULTIPLE ENCODINGS
337
338Notice that only literals (string or regular expression) having only
339legacy code points are affected: if you mix data like this
340
341 \xDF\x{100}
342
343the data is assumed to be in (Latin 1 and) Unicode, not in your native
344encoding. In other words, this will match in "greek":
345
346 "\xDF" =~ /\x{3af}/
347
348but this will not
349
350 "\xDF\x{100}" =~ /\x{3af}\x{100}/
351
962111ca 352since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
353the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
354LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
355should not be mixing your legacy data and Unicode in the same string.
3ef515df 356
357This pragma also affects encoding of the 0x80..0xFF code point range:
358normally characters in that range are left as eight-bit bytes (unless
359they are combined with characters with code points 0x100 or larger,
360in which case all characters need to become UTF-8 encoded), but if
361the C<encoding> pragma is present, even the 0x80..0xFF range always
362gets UTF-8 encoded.
363
364After all, the best thing about this pragma is that you don't have to
0ab8f81e 365resort to \x{....} just to spell your name in a native encoding.
366So feel free to put your strings in your encoding in quotes and
367regexes.
3ef515df 368
151b5d36 369=head2 tr/// with ranges
4b291ae6 370
371The B<encoding> pragma works by decoding string literals in
151b5d36 372C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
4b291ae6 373does not apply to C<tr///>. Therefore,
374
375 use encoding 'euc-jp';
376 #....
377 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
378 # -------- -------- -------- --------
379
380Does not work as
381
382 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
383
384=over
385
386=item Legend of characters above
387
388 utf8 euc-jp charnames::viacode()
389 -----------------------------------------
390 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
391 \x{3093} \xA4\xF3 HIRAGANA LETTER N
392 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
393 \x{30f3} \xA5\xF3 KATAKANA LETTER N
394
395=back
396
05ef2f67 397This counterintuitive behavior has been fixed in perl 5.8.1.
151b5d36 398
4b291ae6 399=head3 workaround to tr///;
400
151b5d36 401In perl 5.8.0, you can work aroud as follows;
4b291ae6 402
403 use encoding 'euc-jp';
151b5d36 404 # ....
4b291ae6 405 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
406
407Note the C<tr//> expression is surronded by C<qq{}>. The idea behind
408is the same as classic idiom that makes C<tr///> 'interpolate'.
409
410 tr/$from/$to/; # wrong!
411 eval qq{ tr/$from/$to/ }; # workaround.
412
413Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
414C<tr///> not being decoded was obviously against the will of Perl5
05ef2f67 415Porters so it has been fixed in Perl 5.8.1 or later.
aae85ceb 416
3ef515df 417=head1 EXAMPLE - Greekperl
418
419 use encoding "iso 8859-7";
420
0ab8f81e 421 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df 422
423 $a = "\xDF";
424 $b = "\x{100}";
425
426 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
427
428 $c = $a . $b;
429
430 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
431
432 # chr() is affected, and ...
433
434 print "mega\n" if ord(chr(0xdf)) == 0x3af;
435
436 # ... ord() is affected by the encoding pragma ...
437
438 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
439
440 # ... as are eq and cmp ...
441
442 print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
443 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
444
445 # ... but pack/unpack C are not affected, in case you still
0ab8f81e 446 # want to go back to your native encoding
3ef515df 447
448 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
449
450=head1 KNOWN PROBLEMS
451
151b5d36 452=over
453
0f29a567 454=item literals in regex that are longer than 127 bytes
151b5d36 455
0ab8f81e 456For native multibyte encodings (either fixed or variable length),
3ef515df 457the current implementation of the regular expressions may introduce
0ab8f81e 458recoding errors for regular expression literals longer than 127 bytes.
3ef515df 459
05ef2f67 460=item EBCDIC
151b5d36 461
3ef515df 462The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e 463(Porters who are willing and able to remove this limitation are
464welcome.)
3ef515df 465
05ef2f67 466=item format
467
468This pragma doesn't work well with format because PerlIO does not
469get along very well with it. When format contains non-ascii
470characters it prints funny or gets "wide character warnings".
471To understand it, try the code below.
472
473 # Save this one in utf8
474 # replace *non-ascii* with a non-ascii string
475 my $camel;
476 format STDOUT =
477 *non-ascii*@>>>>>>>
478 $camel
479 .
480 $camel = "*non-ascii*";
481 binmode(STDOUT=>':encoding(utf8)'); # bang!
482 write; # funny
483 print $camel, "\n"; # fine
484
485Without binmode this happens to work but without binmode, print()
486fails instead of write().
487
488At any rate, the very use of format is questionable when it comes to
489unicode characters since you have to consider such things as character
490width (i.e. double-width for ideographs) and directions (i.e. BIDI for
491Arabic and Hebrew).
492
151b5d36 493=back
494
05ef2f67 495=head1 HISTORY
496
497This pragma first appeared in Perl 5.8.0. For features that require
4985.8.1 and better, see above.
499
3ef515df 500=head1 SEE ALSO
501
aae85ceb 502L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
503
504Ch. 15 of C<Programming Perl (3rd Edition)>
505by Larry Wall, Tom Christiansen, Jon Orwant;
506O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df 507
508=cut