Integrate #16510 from macperl;
[p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm
CommitLineData
3ef515df 1package encoding;
621b0f8d 2our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df 3
4use Encode;
046f36bf 5use strict;
3ef515df 6
7BEGIN {
8 if (ord("A") == 193) {
9 require Carp;
10c5ecbb 10 Carp::croak("encoding pragma does not support EBCDIC platforms");
3ef515df 11 }
12}
13
0ab8f81e 14our $HAS_PERLIO = 0;
15eval { require PerlIO::encoding };
16unless ($@){
17 $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
18}
b2704119 19
3ef515df 20sub import {
21 my $class = shift;
22 my $name = shift;
23 my %arg = @_;
24 $name ||= $ENV{PERL_ENCODING};
25
26 my $enc = find_encoding($name);
27 unless (defined $enc) {
28 require Carp;
10c5ecbb 29 Carp::croak("Unknown encoding '$name'");
3ef515df 30 }
aae85ceb 31 unless ($arg{Filter}){
32 ${^ENCODING} = $enc; # this is all you need, actually.
85982a32 33 $HAS_PERLIO or return 1;
aae85ceb 34 for my $h (qw(STDIN STDOUT)){
35 if ($arg{$h}){
b2704119 36 unless (defined find_encoding($arg{$h})) {
aae85ceb 37 require Carp;
10c5ecbb 38 Carp::croak("Unknown encoding for $h, '$arg{$h}'");
aae85ceb 39 }
0ab8f81e 40 eval { binmode($h, ":encoding($arg{$h})") };
aae85ceb 41 }else{
42 unless (exists $arg{$h}){
0ab8f81e 43 eval {
44 no warnings 'uninitialized';
45 binmode($h, ":encoding($name)");
46 };
aae85ceb 47 }
48 }
49 if ($@){
3ef515df 50 require Carp;
aae85ceb 51 Carp::croak($@);
3ef515df 52 }
3ef515df 53 }
aae85ceb 54 }else{
55 defined(${^ENCODING}) and undef ${^ENCODING};
56 eval {
57 require Filter::Util::Call ;
58 Filter::Util::Call->import ;
b2704119 59 binmode(STDIN);
60 binmode(STDOUT);
aae85ceb 61 filter_add(sub{
62 my $status;
63 if (($status = filter_read()) > 0){
64 $_ = $enc->decode($_, 1);
65 # warn $_;
66 }
67 $status ;
68 });
69 };
70 # warn "Filter installed";
3ef515df 71 }
72 return 1; # I doubt if we need it, though
73}
74
75sub unimport{
76 no warnings;
77 undef ${^ENCODING};
621b0f8d 78 if ($HAS_PERLIO){
79 binmode(STDIN, ":raw");
80 binmode(STDOUT, ":raw");
81 }else{
b2704119 82 binmode(STDIN);
83 binmode(STDOUT);
621b0f8d 84 }
aae85ceb 85 if ($INC{"Filter/Util/Call.pm"}){
86 eval { filter_del() };
87 }
3ef515df 88}
89
901;
91__END__
85982a32 92
3ef515df 93=pod
94
95=head1 NAME
96
0ab8f81e 97encoding - allows you to write your script in non-ascii or non-utf8
3ef515df 98
99=head1 SYNOPSIS
100
962111ca 101 use encoding "greek"; # Perl like Greek to you?
3ef515df 102 use encoding "euc-jp"; # Jperl!
103
962111ca 104 # or you can even do this if your shell supports your native encoding
3ef515df 105
962111ca 106 perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e 107 perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df 108
109 # or from the shebang line
110
962111ca 111 #!/your/path/to/perl -Mencoding="8859-6" # Arabian Nights
0ab8f81e 112 #!/your/path/to/perl -Mencoding=big5 # Taiwanese
3ef515df 113
114 # more control
115
962111ca 116 # A simple euc-cn => utf-8 converter
6d1c0808 117 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df 118
119 # "no encoding;" supported (but not scoped!)
120 no encoding;
121
aae85ceb 122 # an alternate way, Filter
123 use encoding "euc-jp", Filter=>1;
124 use utf8;
125 # now you can use kanji identifiers -- in euc-jp!
126
3ef515df 127=head1 ABSTRACT
128
962111ca 129Let's start with a bit of history: Perl 5.6.0 introduced Unicode
130support. You could apply C<substr()> and regexes even to complex CJK
131characters -- so long as the script was written in UTF-8. But back
0ab8f81e 132then, text editors that supported UTF-8 were still rare and many users
133instead chose to write scripts in legacy encodings, giving up a whole
134new feature of Perl 5.6.
3ef515df 135
0ab8f81e 136Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca 137pragma, you can write your script in any encoding you like (so long
138as the C<Encode> module supports it) and still enjoy Unicode support.
0ab8f81e 139You can write code in EUC-JP as follows:
3ef515df 140
141 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
142 #<-char-><-char-> # 4 octets
143 s/\bCamel\b/$Rakuda/;
144
145And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca 146the code in UTF-8:
3ef515df 147
32b9ed1f 148 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df 149 s/\bCamel\b/$Rakuda/;
150
962111ca 151The B<encoding> pragma also modifies the filehandle disciplines of
3ef515df 152STDIN, STDOUT, and STDERR to the specified encoding. Therefore,
153
154 use encoding "euc-jp";
155 my $message = "Camel is the symbol of perl.\n";
156 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
157 $message =~ s/\bCamel\b/$Rakuda/;
158 print $message;
159
962111ca 160Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
161not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df 162
0ab8f81e 163You can override this by giving extra arguments; see below.
3ef515df 164
165=head1 USAGE
166
167=over 4
168
169=item use encoding [I<ENCNAME>] ;
170
0ab8f81e 171Sets the script encoding to I<ENCNAME>. Filehandle disciplines of
172STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR
173will not be changed.
3ef515df 174
175If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca 176is consulted. If no encoding can be found, the error C<Unknown encoding
177'I<ENCNAME>'> will be thrown.
3ef515df 178
179Note that non-STD file handles remain unaffected. Use C<use open> or
180C<binmode> to change disciplines of those.
181
aae85ceb 182=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df 183
0ab8f81e 184You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f 185C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
186first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb 187completely off.
3ef515df 188
189=item no encoding;
190
0ab8f81e 191Unsets the script encoding. The disciplines of STDIN, STDOUT are
962111ca 192reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df 193
194=back
195
196=head1 CAVEATS
197
198=head2 NOT SCOPED
199
200The pragma is a per script, not a per block lexical. Only the last
621b0f8d 201C<use encoding> or C<no encoding> matters, and it affects
202B<the whole script>. However, the <no encoding> pragma is supported and
203B<use encoding> can appear as many times as you want in a given script.
204The multiple use of this pragma is discouraged.
205
206Because of this nature, the use of this pragma inside the module is
207strongly discouraged (because the influence of this pragma lasts not
208only for the module but the script that uses). But if you have to,
209make sure you say C<no encoding> at the end of the module so you
210contain the influence of the pragma within the module.
3ef515df 211
212=head2 DO NOT MIX MULTIPLE ENCODINGS
213
214Notice that only literals (string or regular expression) having only
215legacy code points are affected: if you mix data like this
216
217 \xDF\x{100}
218
219the data is assumed to be in (Latin 1 and) Unicode, not in your native
220encoding. In other words, this will match in "greek":
221
222 "\xDF" =~ /\x{3af}/
223
224but this will not
225
226 "\xDF\x{100}" =~ /\x{3af}\x{100}/
227
962111ca 228since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
229the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
230LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
231should not be mixing your legacy data and Unicode in the same string.
3ef515df 232
233This pragma also affects encoding of the 0x80..0xFF code point range:
234normally characters in that range are left as eight-bit bytes (unless
235they are combined with characters with code points 0x100 or larger,
236in which case all characters need to become UTF-8 encoded), but if
237the C<encoding> pragma is present, even the 0x80..0xFF range always
238gets UTF-8 encoded.
239
240After all, the best thing about this pragma is that you don't have to
0ab8f81e 241resort to \x{....} just to spell your name in a native encoding.
242So feel free to put your strings in your encoding in quotes and
243regexes.
3ef515df 244
962111ca 245=head1 Non-ASCII Identifiers and Filter option
aae85ceb 246
962111ca 247The magic of C<use encoding> is not applied to the names of
32b9ed1f 248identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
962111ca 249is a single Han ideograph) work, you still need to write your script
250in UTF-8 or use a source filter.
aae85ceb 251
0ab8f81e 252In other words, the same restriction as with Jperl applies.
aae85ceb 253
0ab8f81e 254If you dare to experiment, however, you can try the Filter option.
aae85ceb 255
256=over 4
257
258=item use encoding I<ENCNAME> Filter=E<gt>1;
259
0ab8f81e 260This turns the encoding pragma into a source filter. While the default
aae85ceb 261approach just decodes interpolated literals (in qq() and qr()), this
0ab8f81e 262will apply a source filter to the entire source code. In this case,
263STDIN and STDOUT remain untouched.
aae85ceb 264
265=back
266
962111ca 267What does this mean? Your source code behaves as if it is written in
0ab8f81e 268UTF-8. So even if your editor only supports Shift_JIS, for example,
269you can still try examples in Chapter 15 of C<Programming Perl, 3rd
270Ed.>. For instance, you can use UTF-8 identifiers.
aae85ceb 271
272This option is significantly slower and (as of this writing) non-ASCII
273identifiers are not very stable WITHOUT this option and with the
274source code written in UTF-8.
275
962111ca 276To make your script in legacy encoding work with minimum effort,
277do not use Filter=E<gt>1.
aae85ceb 278
3ef515df 279=head1 EXAMPLE - Greekperl
280
281 use encoding "iso 8859-7";
282
0ab8f81e 283 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df 284
285 $a = "\xDF";
286 $b = "\x{100}";
287
288 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
289
290 $c = $a . $b;
291
292 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
293
294 # chr() is affected, and ...
295
296 print "mega\n" if ord(chr(0xdf)) == 0x3af;
297
298 # ... ord() is affected by the encoding pragma ...
299
300 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
301
302 # ... as are eq and cmp ...
303
304 print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
305 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
306
307 # ... but pack/unpack C are not affected, in case you still
0ab8f81e 308 # want to go back to your native encoding
3ef515df 309
310 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
311
312=head1 KNOWN PROBLEMS
313
0ab8f81e 314For native multibyte encodings (either fixed or variable length),
3ef515df 315the current implementation of the regular expressions may introduce
0ab8f81e 316recoding errors for regular expression literals longer than 127 bytes.
3ef515df 317
318The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e 319(Porters who are willing and able to remove this limitation are
320welcome.)
3ef515df 321
322=head1 SEE ALSO
323
aae85ceb 324L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
325
326Ch. 15 of C<Programming Perl (3rd Edition)>
327by Larry Wall, Tom Christiansen, Jon Orwant;
328O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df 329
330=cut