Commit | Line | Data |
3ef515df |
1 | package encoding; |
f2a2953c |
2 | our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
3ef515df |
3 | |
4 | use Encode; |
5 | |
6 | BEGIN { |
7 | if (ord("A") == 193) { |
8 | require Carp; |
9 | Carp::croak "encoding pragma does not support EBCDIC platforms"; |
10 | } |
11 | } |
12 | |
13 | sub import { |
14 | my $class = shift; |
15 | my $name = shift; |
16 | my %arg = @_; |
17 | $name ||= $ENV{PERL_ENCODING}; |
18 | |
19 | my $enc = find_encoding($name); |
20 | unless (defined $enc) { |
21 | require Carp; |
22 | Carp::croak "Unknown encoding '$name'"; |
23 | } |
24 | ${^ENCODING} = $enc; # this is all you need, actually. |
25 | |
26 | # $_OPEN_ORIG = ${^OPEN}; |
27 | for my $h (qw(STDIN STDOUT STDERR)){ |
28 | if ($arg{$h}){ |
29 | unless (defined find_encoding($name)) { |
30 | require Carp; |
31 | Carp::croak "Unknown encoding for $fhname, '$arg{$h}'"; |
32 | } |
33 | eval qq{ binmode($h, ":encoding($arg{h})") }; |
34 | }else{ |
35 | eval qq{ binmode($h, ":encoding($name)") }; |
36 | } |
37 | if ($@){ |
38 | require Carp; |
39 | Carp::croak($@); |
40 | } |
41 | } |
42 | return 1; # I doubt if we need it, though |
43 | } |
44 | |
45 | sub unimport{ |
46 | no warnings; |
47 | undef ${^ENCODING}; |
48 | binmode(STDIN, ":raw"); |
49 | binmode(STDOUT, ":raw"); |
f2a2953c |
50 | # Leaves STDERR alone. |
51 | # binmode(STDERR, ":raw"); |
3ef515df |
52 | } |
53 | |
54 | 1; |
55 | __END__ |
56 | =pod |
57 | |
58 | =head1 NAME |
59 | |
60 | encoding - allows you to write your script in non-asii or non-utf8 |
61 | |
62 | =head1 SYNOPSIS |
63 | |
64 | use encoding "euc-jp"; # Jperl! |
65 | |
66 | # or you can even do this if your shell supports euc-jp |
67 | |
68 | > perl -Mencoding=euc-jp -e '...' |
69 | |
70 | # or from the shebang line |
71 | |
72 | #!/your/path/to/perl -Mencoding=euc-jp |
73 | |
74 | # more control |
75 | |
76 | # A simple euc-jp => utf-8 converter |
77 | use encoding "euc-jp", STDOUT => "utf8"; while(<>){print}; |
78 | |
79 | # "no encoding;" supported (but not scoped!) |
80 | no encoding; |
81 | |
82 | =head1 ABSTRACT |
83 | |
84 | Perl 5.6.0 has introduced Unicode support. You could apply |
85 | C<substr()> and regexes even to complex CJK characters -- so long as |
86 | the script was written in UTF-8. But back then text editors that |
87 | support UTF-8 was still rare and many users rather chose to writer |
88 | scripts in legacy encodings, given up whole new feature of Perl 5.6. |
89 | |
90 | With B<encoding> pragma, you can write your script in any encoding you like |
91 | (so long as the C<Encode> module supports it) and still enjoy Unicode |
92 | support. You can write a code in EUC-JP as follows; |
93 | |
94 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
95 | #<-char-><-char-> # 4 octets |
96 | s/\bCamel\b/$Rakuda/; |
97 | |
98 | And with C<use encoding "euc-jp"> in effect, it is the same thing as |
99 | the code in UTF-8 as follow. |
100 | |
101 | my $Rakuda = "\x{99F1}\x{99DD}"; # who Unicode Characters |
102 | s/\bCamel\b/$Rakuda/; |
103 | |
104 | The B<encoding> pragma also modifies the file handle disciplines of |
105 | STDIN, STDOUT, and STDERR to the specified encoding. Therefore, |
106 | |
107 | use encoding "euc-jp"; |
108 | my $message = "Camel is the symbol of perl.\n"; |
109 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
110 | $message =~ s/\bCamel\b/$Rakuda/; |
111 | print $message; |
112 | |
113 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not |
114 | "\x{99F1}\x{99DD} is the symbol of perl.\n". |
115 | |
116 | You can override this by giving extra arguments. See below. |
117 | |
118 | =head1 USAGE |
119 | |
120 | =over 4 |
121 | |
122 | =item use encoding [I<ENCNAME>] ; |
123 | |
124 | Sets the script encoding to I<ENCNAME> and file handle disciplines of |
f2a2953c |
125 | STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will not |
126 | be changed. |
3ef515df |
127 | |
128 | If no encoding is specified, the environment variable L<PERL_ENCODING> |
129 | is consulted. If no encoding can be found, C<Unknown encoding 'I<ENCNAME>'> |
130 | error will be thrown. |
131 | |
132 | Note that non-STD file handles remain unaffected. Use C<use open> or |
133 | C<binmode> to change disciplines of those. |
134 | |
135 | =item use encoding I<ENCNAME> [ STDIN => I<ENCNAME_IN> ...] ; |
136 | |
137 | You can also individually set encodings of STDIN, STDOUT, and STDERR |
138 | via STDI<FH> => I<ENCNAME_FH> form. In this case, you cannot omit the |
139 | first I<ENCNAME>. |
140 | |
141 | =item no encoding; |
142 | |
f2a2953c |
143 | Unsets the script encoding and the disciplines of STDIN, STDOUT are |
144 | reset to ":raw". |
3ef515df |
145 | |
146 | =back |
147 | |
148 | =head1 CAVEATS |
149 | |
150 | =head2 NOT SCOPED |
151 | |
152 | The pragma is a per script, not a per block lexical. Only the last |
153 | C<use encoding> or C<matters, and it affects B<the whole script>. |
154 | Though <no encoding> pragma is supported and C<use encoding> can |
155 | appear as many times as you want in a given script, the multiple use |
156 | of this pragma is discouraged. |
157 | |
158 | =head2 DO NOT MIX MULTIPLE ENCODINGS |
159 | |
160 | Notice that only literals (string or regular expression) having only |
161 | legacy code points are affected: if you mix data like this |
162 | |
163 | \xDF\x{100} |
164 | |
165 | the data is assumed to be in (Latin 1 and) Unicode, not in your native |
166 | encoding. In other words, this will match in "greek": |
167 | |
168 | "\xDF" =~ /\x{3af}/ |
169 | |
170 | but this will not |
171 | |
172 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ |
173 | |
174 | since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}> |
175 | because of the C<\x{100}> on the left. You should not be mixing your |
176 | legacy data and Unicode in the same string. |
177 | |
178 | This pragma also affects encoding of the 0x80..0xFF code point range: |
179 | normally characters in that range are left as eight-bit bytes (unless |
180 | they are combined with characters with code points 0x100 or larger, |
181 | in which case all characters need to become UTF-8 encoded), but if |
182 | the C<encoding> pragma is present, even the 0x80..0xFF range always |
183 | gets UTF-8 encoded. |
184 | |
185 | After all, the best thing about this pragma is that you don't have to |
186 | resort to \x... just to spell your name in native encoding. So feel |
187 | free to put your strings in your encoding in quotes and regexes. |
188 | |
189 | =head1 EXAMPLE - Greekperl |
190 | |
191 | use encoding "iso 8859-7"; |
192 | |
193 | # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. |
194 | |
195 | $a = "\xDF"; |
196 | $b = "\x{100}"; |
197 | |
198 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf |
199 | |
200 | $c = $a . $b; |
201 | |
202 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". |
203 | |
204 | # chr() is affected, and ... |
205 | |
206 | print "mega\n" if ord(chr(0xdf)) == 0x3af; |
207 | |
208 | # ... ord() is affected by the encoding pragma ... |
209 | |
210 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; |
211 | |
212 | # ... as are eq and cmp ... |
213 | |
214 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); |
215 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; |
216 | |
217 | # ... but pack/unpack C are not affected, in case you still |
218 | # want back to your native encoding |
219 | |
220 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; |
221 | |
222 | =head1 KNOWN PROBLEMS |
223 | |
224 | For native multibyte encodings (either fixed or variable length) |
225 | the current implementation of the regular expressions may introduce |
226 | recoding errors for longer regular expression literals than 127 bytes. |
227 | |
228 | The encoding pragma is not supported on EBCDIC platforms. |
229 | (Porters wanted.) |
230 | |
231 | =head1 SEE ALSO |
232 | |
233 | L<perlunicode>, L<Encode>, L<open> |
234 | |
235 | =cut |