Commit | Line | Data |
3ef515df |
1 | package encoding; |
85982a32 |
2 | our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
3ef515df |
3 | |
4 | use Encode; |
046f36bf |
5 | use strict; |
3ef515df |
6 | |
7 | BEGIN { |
8 | if (ord("A") == 193) { |
9 | require Carp; |
10 | Carp::croak "encoding pragma does not support EBCDIC platforms"; |
11 | } |
12 | } |
13 | |
85982a32 |
14 | our $HAS_PERLIO = exists $INC{"PerlIO/encoding.pm"}; |
15 | $HAS_PERLIO or binmode(STDIN); |
b2704119 |
16 | |
3ef515df |
17 | sub import { |
18 | my $class = shift; |
19 | my $name = shift; |
20 | my %arg = @_; |
21 | $name ||= $ENV{PERL_ENCODING}; |
22 | |
23 | my $enc = find_encoding($name); |
24 | unless (defined $enc) { |
25 | require Carp; |
26 | Carp::croak "Unknown encoding '$name'"; |
27 | } |
aae85ceb |
28 | unless ($arg{Filter}){ |
29 | ${^ENCODING} = $enc; # this is all you need, actually. |
85982a32 |
30 | $HAS_PERLIO or return 1; |
aae85ceb |
31 | for my $h (qw(STDIN STDOUT)){ |
32 | if ($arg{$h}){ |
b2704119 |
33 | unless (defined find_encoding($arg{$h})) { |
aae85ceb |
34 | require Carp; |
35 | Carp::croak "Unknown encoding for $h, '$arg{$h}'"; |
36 | } |
37 | eval qq{ binmode($h, ":encoding($arg{$h})") }; |
38 | }else{ |
39 | unless (exists $arg{$h}){ |
40 | eval qq{ binmode($h, ":encoding($name)") }; |
41 | } |
42 | } |
43 | if ($@){ |
3ef515df |
44 | require Carp; |
aae85ceb |
45 | Carp::croak($@); |
3ef515df |
46 | } |
3ef515df |
47 | } |
aae85ceb |
48 | }else{ |
49 | defined(${^ENCODING}) and undef ${^ENCODING}; |
50 | eval { |
51 | require Filter::Util::Call ; |
52 | Filter::Util::Call->import ; |
b2704119 |
53 | binmode(STDIN); |
54 | binmode(STDOUT); |
aae85ceb |
55 | filter_add(sub{ |
56 | my $status; |
57 | if (($status = filter_read()) > 0){ |
58 | $_ = $enc->decode($_, 1); |
59 | # warn $_; |
60 | } |
61 | $status ; |
62 | }); |
63 | }; |
64 | # warn "Filter installed"; |
3ef515df |
65 | } |
66 | return 1; # I doubt if we need it, though |
67 | } |
68 | |
69 | sub unimport{ |
70 | no warnings; |
71 | undef ${^ENCODING}; |
b2704119 |
72 | binmode(STDIN); |
73 | binmode(STDOUT); |
aae85ceb |
74 | if ($INC{"Filter/Util/Call.pm"}){ |
75 | eval { filter_del() }; |
76 | } |
3ef515df |
77 | } |
78 | |
79 | 1; |
80 | __END__ |
85982a32 |
81 | |
3ef515df |
82 | =pod |
83 | |
84 | =head1 NAME |
85 | |
86 | encoding - allows you to write your script in non-asii or non-utf8 |
87 | |
88 | =head1 SYNOPSIS |
89 | |
962111ca |
90 | use encoding "greek"; # Perl like Greek to you? |
3ef515df |
91 | use encoding "euc-jp"; # Jperl! |
92 | |
962111ca |
93 | # or you can even do this if your shell supports your native encoding |
3ef515df |
94 | |
962111ca |
95 | perl -Mencoding=latin2 -e '...' # Feeling centrally European? |
96 | perl -Mencoding=euc-ko -e '...' |
3ef515df |
97 | |
98 | # or from the shebang line |
99 | |
962111ca |
100 | #!/your/path/to/perl -Mencoding="8859-6" # Arabian Nights |
101 | #!/your/path/to/perl -Mencoding=euc-tw |
3ef515df |
102 | |
103 | # more control |
104 | |
962111ca |
105 | # A simple euc-cn => utf-8 converter |
106 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; |
3ef515df |
107 | |
108 | # "no encoding;" supported (but not scoped!) |
109 | no encoding; |
110 | |
aae85ceb |
111 | # an alternate way, Filter |
112 | use encoding "euc-jp", Filter=>1; |
113 | use utf8; |
114 | # now you can use kanji identifiers -- in euc-jp! |
115 | |
3ef515df |
116 | =head1 ABSTRACT |
117 | |
962111ca |
118 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode |
119 | support. You could apply C<substr()> and regexes even to complex CJK |
120 | characters -- so long as the script was written in UTF-8. But back |
121 | then text editors that supported UTF-8 were still rare and many users |
122 | rather chose to write scripts in legacy encodings, given up whole new |
123 | feature of Perl 5.6. |
3ef515df |
124 | |
962111ca |
125 | Rewind to the future: starting from perl 5.8.0 with B<encoding> |
126 | pragma, you can write your script in any encoding you like (so long |
127 | as the C<Encode> module supports it) and still enjoy Unicode support. |
128 | You can write a code in EUC-JP as follows: |
3ef515df |
129 | |
130 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
131 | #<-char-><-char-> # 4 octets |
132 | s/\bCamel\b/$Rakuda/; |
133 | |
134 | And with C<use encoding "euc-jp"> in effect, it is the same thing as |
962111ca |
135 | the code in UTF-8: |
3ef515df |
136 | |
137 | my $Rakuda = "\x{99F1}\x{99DD}"; # who Unicode Characters |
138 | s/\bCamel\b/$Rakuda/; |
139 | |
962111ca |
140 | The B<encoding> pragma also modifies the filehandle disciplines of |
3ef515df |
141 | STDIN, STDOUT, and STDERR to the specified encoding. Therefore, |
142 | |
143 | use encoding "euc-jp"; |
144 | my $message = "Camel is the symbol of perl.\n"; |
145 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
146 | $message =~ s/\bCamel\b/$Rakuda/; |
147 | print $message; |
148 | |
962111ca |
149 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", |
150 | not "\x{99F1}\x{99DD} is the symbol of perl.\n". |
3ef515df |
151 | |
962111ca |
152 | You can override this by giving extra arguments, see below. |
3ef515df |
153 | |
154 | =head1 USAGE |
155 | |
156 | =over 4 |
157 | |
158 | =item use encoding [I<ENCNAME>] ; |
159 | |
962111ca |
160 | Sets the script encoding to I<ENCNAME> and filehandle disciplines of |
161 | STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will |
162 | not be changed. |
3ef515df |
163 | |
164 | If no encoding is specified, the environment variable L<PERL_ENCODING> |
962111ca |
165 | is consulted. If no encoding can be found, the error C<Unknown encoding |
166 | 'I<ENCNAME>'> will be thrown. |
3ef515df |
167 | |
168 | Note that non-STD file handles remain unaffected. Use C<use open> or |
169 | C<binmode> to change disciplines of those. |
170 | |
aae85ceb |
171 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; |
3ef515df |
172 | |
aae85ceb |
173 | You can also individually set encodings of STDIN and STDOUT via |
174 | STDI<FH> =E<gt> I<ENCNAME_FH> form. In this case, you cannot omit the |
962111ca |
175 | first I<ENCNAME>. C<STDI<FH> =E<gt> undef> turns the IO transcoding |
aae85ceb |
176 | completely off. |
3ef515df |
177 | |
178 | =item no encoding; |
179 | |
f2a2953c |
180 | Unsets the script encoding and the disciplines of STDIN, STDOUT are |
962111ca |
181 | reset to ":raw" (the default unprocessed raw stream of bytes). |
3ef515df |
182 | |
183 | =back |
184 | |
185 | =head1 CAVEATS |
186 | |
187 | =head2 NOT SCOPED |
188 | |
189 | The pragma is a per script, not a per block lexical. Only the last |
190 | C<use encoding> or C<matters, and it affects B<the whole script>. |
962111ca |
191 | However, <no encoding> pragma is supported and C<use encoding> can |
192 | appear as many times as you want in a given script. The multiple use |
3ef515df |
193 | of this pragma is discouraged. |
194 | |
195 | =head2 DO NOT MIX MULTIPLE ENCODINGS |
196 | |
197 | Notice that only literals (string or regular expression) having only |
198 | legacy code points are affected: if you mix data like this |
199 | |
200 | \xDF\x{100} |
201 | |
202 | the data is assumed to be in (Latin 1 and) Unicode, not in your native |
203 | encoding. In other words, this will match in "greek": |
204 | |
205 | "\xDF" =~ /\x{3af}/ |
206 | |
207 | but this will not |
208 | |
209 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ |
210 | |
962111ca |
211 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on |
212 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL |
213 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You |
214 | should not be mixing your legacy data and Unicode in the same string. |
3ef515df |
215 | |
216 | This pragma also affects encoding of the 0x80..0xFF code point range: |
217 | normally characters in that range are left as eight-bit bytes (unless |
218 | they are combined with characters with code points 0x100 or larger, |
219 | in which case all characters need to become UTF-8 encoded), but if |
220 | the C<encoding> pragma is present, even the 0x80..0xFF range always |
221 | gets UTF-8 encoded. |
222 | |
223 | After all, the best thing about this pragma is that you don't have to |
962111ca |
224 | resort to \x... just to spell your name in native a encoding. So feel |
3ef515df |
225 | free to put your strings in your encoding in quotes and regexes. |
226 | |
962111ca |
227 | =head1 Non-ASCII Identifiers and Filter option |
aae85ceb |
228 | |
962111ca |
229 | The magic of C<use encoding> is not applied to the names of |
230 | identifiers. In order to make C<${"4eba"}++> ($human++, where human |
231 | is a single Han ideograph) work, you still need to write your script |
232 | in UTF-8 or use a source filter. |
aae85ceb |
233 | |
234 | In other words, the same restriction as Jperl applies. |
235 | |
962111ca |
236 | If you dare to experiment, however, you can try Filter option. |
aae85ceb |
237 | |
238 | =over 4 |
239 | |
240 | =item use encoding I<ENCNAME> Filter=E<gt>1; |
241 | |
242 | This turns encoding pragma into source filter. While the default |
243 | approach just decodes interpolated literals (in qq() and qr()), this |
244 | will apply source filter to entire source code. In this case, STDIN |
245 | and STDOUT remain untouched. |
246 | |
247 | =back |
248 | |
962111ca |
249 | What does this mean? Your source code behaves as if it is written in |
250 | UTF-8. So even if your editor only supports Shift_JIS, for example. |
251 | You can still try examples in Chapter 15 of C<Programming Perl, 3rd |
252 | Ed.> For instance, you can use UTF-8 identifiers. |
aae85ceb |
253 | |
254 | This option is significantly slower and (as of this writing) non-ASCII |
255 | identifiers are not very stable WITHOUT this option and with the |
256 | source code written in UTF-8. |
257 | |
962111ca |
258 | To make your script in legacy encoding work with minimum effort, |
259 | do not use Filter=E<gt>1. |
aae85ceb |
260 | |
3ef515df |
261 | =head1 EXAMPLE - Greekperl |
262 | |
263 | use encoding "iso 8859-7"; |
264 | |
265 | # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. |
266 | |
267 | $a = "\xDF"; |
268 | $b = "\x{100}"; |
269 | |
270 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf |
271 | |
272 | $c = $a . $b; |
273 | |
274 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". |
275 | |
276 | # chr() is affected, and ... |
277 | |
278 | print "mega\n" if ord(chr(0xdf)) == 0x3af; |
279 | |
280 | # ... ord() is affected by the encoding pragma ... |
281 | |
282 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; |
283 | |
284 | # ... as are eq and cmp ... |
285 | |
286 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); |
287 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; |
288 | |
289 | # ... but pack/unpack C are not affected, in case you still |
290 | # want back to your native encoding |
291 | |
292 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; |
293 | |
294 | =head1 KNOWN PROBLEMS |
295 | |
296 | For native multibyte encodings (either fixed or variable length) |
297 | the current implementation of the regular expressions may introduce |
298 | recoding errors for longer regular expression literals than 127 bytes. |
299 | |
300 | The encoding pragma is not supported on EBCDIC platforms. |
301 | (Porters wanted.) |
302 | |
303 | =head1 SEE ALSO |
304 | |
aae85ceb |
305 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, |
306 | |
307 | Ch. 15 of C<Programming Perl (3rd Edition)> |
308 | by Larry Wall, Tom Christiansen, Jon Orwant; |
309 | O'Reilly & Associates; ISBN 0-596-00027-8 |
3ef515df |
310 | |
311 | =cut |