Add the test of [perl #8760] (fixed by change #18656).
[p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm
CommitLineData
3ef515df 1package encoding;
fa6f41cf 2our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df 3
4use Encode;
046f36bf 5use strict;
3ef515df 6
7BEGIN {
8 if (ord("A") == 193) {
9 require Carp;
10c5ecbb 10 Carp::croak("encoding pragma does not support EBCDIC platforms");
3ef515df 11 }
12}
13
0ab8f81e 14our $HAS_PERLIO = 0;
15eval { require PerlIO::encoding };
16unless ($@){
17 $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
18}
b2704119 19
fa6f41cf 20my %utfs = map {$_=>1}
21 qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE);
22
3ef515df 23sub import {
24 my $class = shift;
25 my $name = shift;
26 my %arg = @_;
27 $name ||= $ENV{PERL_ENCODING};
28
29 my $enc = find_encoding($name);
30 unless (defined $enc) {
31 require Carp;
10c5ecbb 32 Carp::croak("Unknown encoding '$name'");
3ef515df 33 }
fa6f41cf 34 $name = $enc->name; # canonize
0f7c507f 35 unless ($arg{Filter}) {
fa6f41cf 36 ${^ENCODING} = $enc unless $] <= 5.008 and $utfs{$name};
85982a32 37 $HAS_PERLIO or return 1;
aae85ceb 38 for my $h (qw(STDIN STDOUT)){
39 if ($arg{$h}){
b2704119 40 unless (defined find_encoding($arg{$h})) {
aae85ceb 41 require Carp;
10c5ecbb 42 Carp::croak("Unknown encoding for $h, '$arg{$h}'");
aae85ceb 43 }
0ab8f81e 44 eval { binmode($h, ":encoding($arg{$h})") };
aae85ceb 45 }else{
46 unless (exists $arg{$h}){
0ab8f81e 47 eval {
48 no warnings 'uninitialized';
49 binmode($h, ":encoding($name)");
50 };
aae85ceb 51 }
52 }
53 if ($@){
3ef515df 54 require Carp;
aae85ceb 55 Carp::croak($@);
3ef515df 56 }
3ef515df 57 }
aae85ceb 58 }else{
59 defined(${^ENCODING}) and undef ${^ENCODING};
60 eval {
61 require Filter::Util::Call ;
62 Filter::Util::Call->import ;
b2704119 63 binmode(STDIN);
64 binmode(STDOUT);
aae85ceb 65 filter_add(sub{
66 my $status;
67 if (($status = filter_read()) > 0){
68 $_ = $enc->decode($_, 1);
69 # warn $_;
70 }
71 $status ;
72 });
73 };
74 # warn "Filter installed";
3ef515df 75 }
76 return 1; # I doubt if we need it, though
77}
78
79sub unimport{
80 no warnings;
81 undef ${^ENCODING};
621b0f8d 82 if ($HAS_PERLIO){
83 binmode(STDIN, ":raw");
84 binmode(STDOUT, ":raw");
85 }else{
b2704119 86 binmode(STDIN);
87 binmode(STDOUT);
621b0f8d 88 }
aae85ceb 89 if ($INC{"Filter/Util/Call.pm"}){
90 eval { filter_del() };
91 }
3ef515df 92}
93
941;
95__END__
85982a32 96
3ef515df 97=pod
98
99=head1 NAME
100
0ab8f81e 101encoding - allows you to write your script in non-ascii or non-utf8
3ef515df 102
103=head1 SYNOPSIS
104
962111ca 105 use encoding "greek"; # Perl like Greek to you?
3ef515df 106 use encoding "euc-jp"; # Jperl!
107
962111ca 108 # or you can even do this if your shell supports your native encoding
3ef515df 109
962111ca 110 perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e 111 perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df 112
3ef515df 113 # more control
114
962111ca 115 # A simple euc-cn => utf-8 converter
6d1c0808 116 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df 117
118 # "no encoding;" supported (but not scoped!)
119 no encoding;
120
aae85ceb 121 # an alternate way, Filter
122 use encoding "euc-jp", Filter=>1;
123 use utf8;
124 # now you can use kanji identifiers -- in euc-jp!
125
3ef515df 126=head1 ABSTRACT
127
962111ca 128Let's start with a bit of history: Perl 5.6.0 introduced Unicode
129support. You could apply C<substr()> and regexes even to complex CJK
130characters -- so long as the script was written in UTF-8. But back
0ab8f81e 131then, text editors that supported UTF-8 were still rare and many users
132instead chose to write scripts in legacy encodings, giving up a whole
133new feature of Perl 5.6.
3ef515df 134
0ab8f81e 135Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca 136pragma, you can write your script in any encoding you like (so long
137as the C<Encode> module supports it) and still enjoy Unicode support.
0ab8f81e 138You can write code in EUC-JP as follows:
3ef515df 139
140 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
141 #<-char-><-char-> # 4 octets
142 s/\bCamel\b/$Rakuda/;
143
144And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca 145the code in UTF-8:
3ef515df 146
32b9ed1f 147 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df 148 s/\bCamel\b/$Rakuda/;
149
962111ca 150The B<encoding> pragma also modifies the filehandle disciplines of
4b291ae6 151STDIN and STDOUT to the specified encoding. Therefore,
3ef515df 152
153 use encoding "euc-jp";
154 my $message = "Camel is the symbol of perl.\n";
155 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
156 $message =~ s/\bCamel\b/$Rakuda/;
157 print $message;
158
962111ca 159Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
160not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df 161
0ab8f81e 162You can override this by giving extra arguments; see below.
3ef515df 163
164=head1 USAGE
165
166=over 4
167
168=item use encoding [I<ENCNAME>] ;
169
0ab8f81e 170Sets the script encoding to I<ENCNAME>. Filehandle disciplines of
171STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR
172will not be changed.
3ef515df 173
174If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca 175is consulted. If no encoding can be found, the error C<Unknown encoding
176'I<ENCNAME>'> will be thrown.
3ef515df 177
178Note that non-STD file handles remain unaffected. Use C<use open> or
179C<binmode> to change disciplines of those.
180
aae85ceb 181=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df 182
0ab8f81e 183You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f 184C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
185first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb 186completely off.
3ef515df 187
188=item no encoding;
189
0ab8f81e 190Unsets the script encoding. The disciplines of STDIN, STDOUT are
962111ca 191reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df 192
193=back
194
195=head1 CAVEATS
196
197=head2 NOT SCOPED
198
199The pragma is a per script, not a per block lexical. Only the last
621b0f8d 200C<use encoding> or C<no encoding> matters, and it affects
201B<the whole script>. However, the <no encoding> pragma is supported and
202B<use encoding> can appear as many times as you want in a given script.
203The multiple use of this pragma is discouraged.
204
205Because of this nature, the use of this pragma inside the module is
206strongly discouraged (because the influence of this pragma lasts not
207only for the module but the script that uses). But if you have to,
208make sure you say C<no encoding> at the end of the module so you
209contain the influence of the pragma within the module.
3ef515df 210
211=head2 DO NOT MIX MULTIPLE ENCODINGS
212
213Notice that only literals (string or regular expression) having only
214legacy code points are affected: if you mix data like this
215
216 \xDF\x{100}
217
218the data is assumed to be in (Latin 1 and) Unicode, not in your native
219encoding. In other words, this will match in "greek":
220
221 "\xDF" =~ /\x{3af}/
222
223but this will not
224
225 "\xDF\x{100}" =~ /\x{3af}\x{100}/
226
962111ca 227since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
228the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
229LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
230should not be mixing your legacy data and Unicode in the same string.
3ef515df 231
232This pragma also affects encoding of the 0x80..0xFF code point range:
233normally characters in that range are left as eight-bit bytes (unless
234they are combined with characters with code points 0x100 or larger,
235in which case all characters need to become UTF-8 encoded), but if
236the C<encoding> pragma is present, even the 0x80..0xFF range always
237gets UTF-8 encoded.
238
239After all, the best thing about this pragma is that you don't have to
0ab8f81e 240resort to \x{....} just to spell your name in a native encoding.
241So feel free to put your strings in your encoding in quotes and
242regexes.
3ef515df 243
4b291ae6 244=head2 tr/// with ranges remain unaffected
245
246The B<encoding> pragma works by decoding string literals in
247C<q//,qq//,qr//,qw///, qx//> and so forth. As of perl 5.8.0, this
248does not apply to C<tr///>. Therefore,
249
250 use encoding 'euc-jp';
251 #....
252 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
253 # -------- -------- -------- --------
254
255Does not work as
256
257 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
258
259=over
260
261=item Legend of characters above
262
263 utf8 euc-jp charnames::viacode()
264 -----------------------------------------
265 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
266 \x{3093} \xA4\xF3 HIRAGANA LETTER N
267 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
268 \x{30f3} \xA5\xF3 KATAKANA LETTER N
269
270=back
271
272=head3 workaround to tr///;
273
274You can, however, achieve the same as simply as follows;
275
276 use encoding 'euc-jp';
277 # ....
278 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
279
280Note the C<tr//> expression is surronded by C<qq{}>. The idea behind
281is the same as classic idiom that makes C<tr///> 'interpolate'.
282
283 tr/$from/$to/; # wrong!
284 eval qq{ tr/$from/$to/ }; # workaround.
285
286Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
287C<tr///> not being decoded was obviously against the will of Perl5
288Porters. In future version of perl, this counter-intuitive behaviour
289of C<tr///> will be fixed so C<eval qq{}> trick will be unneccesary.
290
962111ca 291=head1 Non-ASCII Identifiers and Filter option
aae85ceb 292
962111ca 293The magic of C<use encoding> is not applied to the names of
32b9ed1f 294identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
962111ca 295is a single Han ideograph) work, you still need to write your script
296in UTF-8 or use a source filter.
aae85ceb 297
0ab8f81e 298In other words, the same restriction as with Jperl applies.
aae85ceb 299
0ab8f81e 300If you dare to experiment, however, you can try the Filter option.
aae85ceb 301
302=over 4
303
304=item use encoding I<ENCNAME> Filter=E<gt>1;
305
0ab8f81e 306This turns the encoding pragma into a source filter. While the default
aae85ceb 307approach just decodes interpolated literals (in qq() and qr()), this
0ab8f81e 308will apply a source filter to the entire source code. In this case,
309STDIN and STDOUT remain untouched.
aae85ceb 310
311=back
312
962111ca 313What does this mean? Your source code behaves as if it is written in
0ab8f81e 314UTF-8. So even if your editor only supports Shift_JIS, for example,
315you can still try examples in Chapter 15 of C<Programming Perl, 3rd
316Ed.>. For instance, you can use UTF-8 identifiers.
aae85ceb 317
318This option is significantly slower and (as of this writing) non-ASCII
319identifiers are not very stable WITHOUT this option and with the
320source code written in UTF-8.
321
962111ca 322To make your script in legacy encoding work with minimum effort,
323do not use Filter=E<gt>1.
aae85ceb 324
3ef515df 325=head1 EXAMPLE - Greekperl
326
327 use encoding "iso 8859-7";
328
0ab8f81e 329 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df 330
331 $a = "\xDF";
332 $b = "\x{100}";
333
334 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
335
336 $c = $a . $b;
337
338 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
339
340 # chr() is affected, and ...
341
342 print "mega\n" if ord(chr(0xdf)) == 0x3af;
343
344 # ... ord() is affected by the encoding pragma ...
345
346 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
347
348 # ... as are eq and cmp ...
349
350 print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
351 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
352
353 # ... but pack/unpack C are not affected, in case you still
0ab8f81e 354 # want to go back to your native encoding
3ef515df 355
356 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
357
358=head1 KNOWN PROBLEMS
359
0ab8f81e 360For native multibyte encodings (either fixed or variable length),
3ef515df 361the current implementation of the regular expressions may introduce
0ab8f81e 362recoding errors for regular expression literals longer than 127 bytes.
3ef515df 363
364The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e 365(Porters who are willing and able to remove this limitation are
366welcome.)
3ef515df 367
368=head1 SEE ALSO
369
aae85ceb 370L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
371
372Ch. 15 of C<Programming Perl (3rd Edition)>
373by Larry Wall, Tom Christiansen, Jon Orwant;
374O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df 375
376=cut