Upgrade to Encode 1.33, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm
CommitLineData
3ef515df 1package encoding;
f2a2953c 2our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df 3
4use Encode;
5
6BEGIN {
7 if (ord("A") == 193) {
8 require Carp;
9 Carp::croak "encoding pragma does not support EBCDIC platforms";
10 }
11}
12
13sub import {
14 my $class = shift;
15 my $name = shift;
16 my %arg = @_;
17 $name ||= $ENV{PERL_ENCODING};
18
19 my $enc = find_encoding($name);
20 unless (defined $enc) {
21 require Carp;
22 Carp::croak "Unknown encoding '$name'";
23 }
24 ${^ENCODING} = $enc; # this is all you need, actually.
25
26 # $_OPEN_ORIG = ${^OPEN};
27 for my $h (qw(STDIN STDOUT STDERR)){
28 if ($arg{$h}){
29 unless (defined find_encoding($name)) {
30 require Carp;
31 Carp::croak "Unknown encoding for $fhname, '$arg{$h}'";
32 }
33 eval qq{ binmode($h, ":encoding($arg{h})") };
34 }else{
35 eval qq{ binmode($h, ":encoding($name)") };
36 }
37 if ($@){
38 require Carp;
39 Carp::croak($@);
40 }
41 }
42 return 1; # I doubt if we need it, though
43}
44
45sub unimport{
46 no warnings;
47 undef ${^ENCODING};
48 binmode(STDIN, ":raw");
49 binmode(STDOUT, ":raw");
f2a2953c 50 # Leaves STDERR alone.
51 # binmode(STDERR, ":raw");
3ef515df 52}
53
541;
55__END__
56=pod
57
58=head1 NAME
59
60encoding - allows you to write your script in non-asii or non-utf8
61
62=head1 SYNOPSIS
63
64 use encoding "euc-jp"; # Jperl!
65
66 # or you can even do this if your shell supports euc-jp
67
68 > perl -Mencoding=euc-jp -e '...'
69
70 # or from the shebang line
71
72 #!/your/path/to/perl -Mencoding=euc-jp
73
74 # more control
75
76 # A simple euc-jp => utf-8 converter
77 use encoding "euc-jp", STDOUT => "utf8"; while(<>){print};
78
79 # "no encoding;" supported (but not scoped!)
80 no encoding;
81
82=head1 ABSTRACT
83
84Perl 5.6.0 has introduced Unicode support. You could apply
85C<substr()> and regexes even to complex CJK characters -- so long as
86the script was written in UTF-8. But back then text editors that
87support UTF-8 was still rare and many users rather chose to writer
88scripts in legacy encodings, given up whole new feature of Perl 5.6.
89
90With B<encoding> pragma, you can write your script in any encoding you like
91(so long as the C<Encode> module supports it) and still enjoy Unicode
92support. You can write a code in EUC-JP as follows;
93
94 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
95 #<-char-><-char-> # 4 octets
96 s/\bCamel\b/$Rakuda/;
97
98And with C<use encoding "euc-jp"> in effect, it is the same thing as
99the code in UTF-8 as follow.
100
101 my $Rakuda = "\x{99F1}\x{99DD}"; # who Unicode Characters
102 s/\bCamel\b/$Rakuda/;
103
104The B<encoding> pragma also modifies the file handle disciplines of
105STDIN, STDOUT, and STDERR to the specified encoding. Therefore,
106
107 use encoding "euc-jp";
108 my $message = "Camel is the symbol of perl.\n";
109 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
110 $message =~ s/\bCamel\b/$Rakuda/;
111 print $message;
112
113Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not
114"\x{99F1}\x{99DD} is the symbol of perl.\n".
115
116You can override this by giving extra arguments. See below.
117
118=head1 USAGE
119
120=over 4
121
122=item use encoding [I<ENCNAME>] ;
123
124Sets the script encoding to I<ENCNAME> and file handle disciplines of
f2a2953c 125STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will not
126be changed.
3ef515df 127
128If no encoding is specified, the environment variable L<PERL_ENCODING>
129is consulted. If no encoding can be found, C<Unknown encoding 'I<ENCNAME>'>
130error will be thrown.
131
132Note that non-STD file handles remain unaffected. Use C<use open> or
133C<binmode> to change disciplines of those.
134
135=item use encoding I<ENCNAME> [ STDIN => I<ENCNAME_IN> ...] ;
136
137You can also individually set encodings of STDIN, STDOUT, and STDERR
138via STDI<FH> => I<ENCNAME_FH> form. In this case, you cannot omit the
139first I<ENCNAME>.
140
141=item no encoding;
142
f2a2953c 143Unsets the script encoding and the disciplines of STDIN, STDOUT are
144reset to ":raw".
3ef515df 145
146=back
147
148=head1 CAVEATS
149
150=head2 NOT SCOPED
151
152The pragma is a per script, not a per block lexical. Only the last
153C<use encoding> or C<matters, and it affects B<the whole script>.
154Though <no encoding> pragma is supported and C<use encoding> can
155appear as many times as you want in a given script, the multiple use
156of this pragma is discouraged.
157
158=head2 DO NOT MIX MULTIPLE ENCODINGS
159
160Notice that only literals (string or regular expression) having only
161legacy code points are affected: if you mix data like this
162
163 \xDF\x{100}
164
165the data is assumed to be in (Latin 1 and) Unicode, not in your native
166encoding. In other words, this will match in "greek":
167
168 "\xDF" =~ /\x{3af}/
169
170but this will not
171
172 "\xDF\x{100}" =~ /\x{3af}\x{100}/
173
174since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
175because of the C<\x{100}> on the left. You should not be mixing your
176legacy data and Unicode in the same string.
177
178This pragma also affects encoding of the 0x80..0xFF code point range:
179normally characters in that range are left as eight-bit bytes (unless
180they are combined with characters with code points 0x100 or larger,
181in which case all characters need to become UTF-8 encoded), but if
182the C<encoding> pragma is present, even the 0x80..0xFF range always
183gets UTF-8 encoded.
184
185After all, the best thing about this pragma is that you don't have to
186resort to \x... just to spell your name in native encoding. So feel
187free to put your strings in your encoding in quotes and regexes.
188
189=head1 EXAMPLE - Greekperl
190
191 use encoding "iso 8859-7";
192
193 # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
194
195 $a = "\xDF";
196 $b = "\x{100}";
197
198 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
199
200 $c = $a . $b;
201
202 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
203
204 # chr() is affected, and ...
205
206 print "mega\n" if ord(chr(0xdf)) == 0x3af;
207
208 # ... ord() is affected by the encoding pragma ...
209
210 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
211
212 # ... as are eq and cmp ...
213
214 print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
215 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
216
217 # ... but pack/unpack C are not affected, in case you still
218 # want back to your native encoding
219
220 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
221
222=head1 KNOWN PROBLEMS
223
224For native multibyte encodings (either fixed or variable length)
225the current implementation of the regular expressions may introduce
226recoding errors for longer regular expression literals than 127 bytes.
227
228The encoding pragma is not supported on EBCDIC platforms.
229(Porters wanted.)
230
231=head1 SEE ALSO
232
233L<perlunicode>, L<Encode>, L<open>
234
235=cut