(retracted by #14846)
[p5sagit/p5-mst-13.2.git] / lib / encoding.pm
CommitLineData
0a378802 1package encoding;
2
8de1277c 3our $VERSION = '1.00';
4
0a378802 5use Encode;
6
# Called by "use encoding ...": looks up the requested encoding and
# installs it in ${^ENCODING}.
#
# Arguments:
#   $class - the invoking package name (not used in the body).
#   $name  - name of the encoding to install; optional.
#
# When no name argument is passed at all, $ENV{PERL_ENCODING} is used
# instead; if the name is still undefined after that, "latin1" is used.
#
# Croaks with "Unknown encoding '$name'" when find_encoding() returns
# an undefined value for the name.
sub import {
    my ($class, $name) = @_;

    # Consult the environment only when the caller gave no name argument.
    $name = $ENV{PERL_ENCODING} if @_ < 2;
    $name = "latin1" unless defined $name;

    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        # Parenthesised on purpose: Carp is only loaded here at run time,
        # so a parenless call would parse only if some other module had
        # already declared Carp::croak when this file was compiled.
        Carp::croak("Unknown encoding '$name'");
    }

    ${^ENCODING} = $enc;
}
18
19=pod
20
21=head1 NAME
22
23encoding - pragma to control the conversion of legacy data into Unicode
24
25=head1 SYNOPSIS
26
27 use encoding "iso 8859-7";
28
121910a4 29 # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
4bdee82d 30
0a378802 31 $a = "\xDF";
32 $b = "\x{100}";
33
4bdee82d 34 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
35
0a378802 36 $c = $a . $b;
37
38 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
0a378802 39
121910a4 40 # chr() is affected, and ...
41
42 print "mega\n" if ord(chr(0xdf)) == 0x3af;
43
44 # ... ord() is affected by the encoding pragma ...
45
46 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
47
3de8ed06 48 # but pack/unpack are not affected, in case you still
121910a4 49 # want to go back to your native encoding
50
51 print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
52
0a378802 53=head1 DESCRIPTION
54
55Normally when legacy 8-bit data is converted to Unicode the data is
56expected to be Latin-1 (or EBCDIC on EBCDIC platforms). With the
57encoding pragma you can change this default.
58
59The pragma is per script, not lexically scoped per block. Only the last
9f4817db 60C<use encoding> matters, and it affects B<the whole script>.
0a378802 61
a72c7584 62Notice that only literals (string or regular expression) having only
63legacy code points are affected: if you mix data like this
64
65 \xDF\x{100}
66
67the data is assumed to be in (Latin 1 and) Unicode, not in your native
68encoding. In other words, this will match in "greek":
69
70 "\xDF" =~ /\x{3af}/
71
72but this will not
73
74 "\xDF\x{100}" =~ /\x{3af}\x{100}/
75
76since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
77because of the C<\x{100}> on the left. You should not be mixing your
78legacy data and Unicode in the same string.
79
4ef28c72 80This pragma also affects encoding of the 0x80..0xFF code point range:
81normally characters in that range are left as eight-bit bytes (unless
82they are combined with characters with code points 0x100 or larger,
83in which case all characters need to become UTF-8 encoded), but if
84the C<encoding> pragma is present, even the 0x80..0xFF range always
85gets UTF-8 encoded.
86
4bdee82d 87If no encoding is specified, the environment variable C<PERL_ENCODING>
3de8ed06 88is consulted. If that is not set, "latin1" (ISO 8859-1) is assumed. If the
89encoding cannot be found, an C<Unknown encoding '...'> error will be thrown.
4bdee82d 90
6ec9efec 91=head1 KNOWN PROBLEMS
0a378802 92
a72c7584 93For native multibyte encodings (either fixed or variable length)
94the current implementation of the regular expressions may introduce
95recoding errors for regular expression literals longer than 127 bytes.
d521382b 96
0a378802 97=head1 SEE ALSO
98
121910a4 99L<perlunicode>, L<Encode>
0a378802 100
101=cut
102
1031;