Commit | Line | Data |
0a378802 |
1 | package encoding; |
2 | |
8de1277c |
3 | our $VERSION = '1.00'; |
4 | |
0a378802 |
5 | use Encode; |
6 | |
# import - called for "use encoding NAME;" to select the script encoding.
#
# Arguments:
#   $class - the invoking package name (unused).
#   $name  - name of the encoding.  When no name argument is passed at
#            all, $ENV{PERL_ENCODING} is consulted; if the name is still
#            undefined it falls back to "latin1".
#
# The name is resolved with Encode's find_encoding() and the resulting
# encoding object is stored in ${^ENCODING}.
#
# Croaks with "Unknown encoding '...'" when find_encoding() returns undef.
sub import {
    my ($class, $name) = @_;
    $name = $ENV{PERL_ENCODING} if @_ < 2;
    $name = "latin1" unless defined $name;
    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        # Carp is only loaded at run time here, so call croak with
        # parentheses: the list-operator form would parse only if some
        # other module had already compiled Carp.
        Carp::croak("Unknown encoding '$name'");
    }
    ${^ENCODING} = $enc;
}
18 | |
19 | =pod |
20 | |
21 | =head1 NAME |
22 | |
23 | encoding - pragma to control the conversion of legacy data into Unicode |
24 | |
25 | =head1 SYNOPSIS |
26 | |
27 | use encoding "iso 8859-7"; |
28 | |
121910a4 |
29 | # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. |
4bdee82d |
30 | |
0a378802 |
31 | $a = "\xDF"; |
32 | $b = "\x{100}"; |
33 | |
4bdee82d |
34 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf |
35 | |
0a378802 |
36 | $c = $a . $b; |
37 | |
38 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". |
0a378802 |
39 | |
121910a4 |
40 | # chr() is affected, and ... |
41 | |
42 | print "mega\n" if ord(chr(0xdf)) == 0x3af; |
43 | |
44 | # ... ord() is affected by the encoding pragma ... |
45 | |
46 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; |
47 | |
3de8ed06 |
48 | # but pack/unpack are not affected, in case you still |
121910a4 |
49 | # want to get back to your native encoding |
50 | |
51 | print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; |
52 | |
0a378802 |
53 | =head1 DESCRIPTION |
54 | |
55 | Normally when legacy 8-bit data is converted to Unicode the data is |
56 | expected to be Latin-1 (or EBCDIC on EBCDIC platforms). With the |
57 | encoding pragma you can change this default. |
58 | |
59 | The pragma is a per script, not a per block lexical. Only the last |
9f4817db |
60 | C<use encoding> matters, and it affects B<the whole script>. |
0a378802 |
61 | |
a72c7584 |
62 | Notice that only literals (string or regular expression) having only |
63 | legacy code points are affected: if you mix data like this |
64 | |
65 | \xDF\x{100} |
66 | |
67 | the data is assumed to be in (Latin 1 and) Unicode, not in your native |
68 | encoding. In other words, this will match in "greek": |
69 | |
70 | "\xDF" =~ /\x{3af}/ |
71 | |
72 | but this will not |
73 | |
74 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ |
75 | |
76 | since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}> |
77 | because of the C<\x{100}> in the same string. You should not be mixing your |
78 | legacy data and Unicode in the same string. |
79 | |
4bdee82d |
80 | If no encoding is specified, the environment variable C<PERL_ENCODING> |
3de8ed06 |
81 | is consulted. If that is not set, "latin1" (ISO 8859-1) is assumed. If the |
82 | encoding cannot be found, an C<Unknown encoding '...'> error is thrown. |
4bdee82d |
83 | |
6ec9efec |
84 | =head1 KNOWN PROBLEMS |
0a378802 |
85 | |
a72c7584 |
86 | For native multibyte encodings (either fixed or variable length) |
87 | the current implementation of the regular expressions may introduce |
88 | recoding errors for regular expression literals longer than 127 bytes. |
d521382b |
89 | |
0a378802 |
90 | =head1 SEE ALSO |
91 | |
121910a4 |
92 | L<perlunicode>, L<Encode> |
0a378802 |
93 | |
94 | =cut |
95 | |
96 | 1; |