lib/open.pm
[p5sagit/p5-mst-13.2.git] / lib / encoding.pm
CommitLineData
0a378802 1package encoding;
2
8de1277c 3our $VERSION = '1.00';
4
0a378802 5use Encode;
6
# import - invoked by "use encoding ..." to select the legacy encoding.
#
# Arguments:
#   $class - the package name supplied by the "use" mechanism (unused).
#   $name  - optional encoding name.  When no name argument is passed at
#            all, $ENV{PERL_ENCODING} is consulted; if the name is still
#            undefined after that, "latin1" is used.
#
# Side effect: stores the object returned by find_encoding() (imported
# from Encode) in ${^ENCODING}.
#
# Croaks with "Unknown encoding '<name>'" when find_encoding() returns
# undef for the requested name.
sub import {
    my ($class, $name) = @_;
    $name = $ENV{PERL_ENCODING} if @_ < 2;
    $name = "latin1" unless defined $name;
    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        # Call with parentheses: Carp is only loaded at run time here, so
        # a paren-less list-operator call would compile only if some other
        # module had already declared Carp::croak at compile time.
        Carp::croak("Unknown encoding '$name'");
    }
    ${^ENCODING} = $enc;
}
18
19=pod
20
21=head1 NAME
22
23encoding - pragma to control the conversion of legacy data into Unicode
24
25=head1 SYNOPSIS
26
27 use encoding "iso 8859-7";
28
121910a4 29 # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
4bdee82d 30
0a378802 31 $a = "\xDF";
32 $b = "\x{100}";
33
4bdee82d 34 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
35
0a378802 36 $c = $a . $b;
37
38 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
0a378802 39
121910a4 40 # chr() is affected, and ...
41
42 print "mega\n" if ord(chr(0xdf)) == 0x3af;
43
44 # ... ord() is affected by the encoding pragma ...
45
46 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
47
3de8ed06 48 # ... but pack/unpack are not affected, in case you still
121910a4 49 # want to get back to your native encoding
50
51 print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
52
0a378802 53=head1 DESCRIPTION
54
55Normally when legacy 8-bit data is converted to Unicode the data is
56expected to be Latin-1 (or EBCDIC on EBCDIC platforms). With the
57encoding pragma you can change this default.
58
59The pragma is per script, not a per-block lexical. Only the last
9f4817db 60C<use encoding> matters, and it affects B<the whole script>.
0a378802 61
a72c7584 62Notice that only literals (string or regular expression) having only
63legacy code points are affected: if you mix data like this
64
65 \xDF\x{100}
66
67the data is assumed to be in (Latin 1 and) Unicode, not in your native
68encoding. In other words, this will match in "greek":
69
70 "\xDF" =~ /\x{3af}/
71
72but this will not
73
74 "\xDF\x{100}" =~ /\x{3af}\x{100}/
75
76since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
77because of the C<\x{100}> in the same string. You should not be mixing
78your legacy data and Unicode in the same string.
79
4bdee82d 80If no encoding is specified, the environment variable C<PERL_ENCODING>
3de8ed06 81is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no
82encoding can be found, an C<Unknown encoding '...'> error will be thrown.
4bdee82d 83
6ec9efec 84=head1 KNOWN PROBLEMS
0a378802 85
a72c7584 86For native multibyte encodings (either fixed or variable length)
87the current implementation of the regular expressions may introduce
88recoding errors for regular expression literals longer than 127 bytes.
d521382b 89
0a378802 90=head1 SEE ALSO
91
121910a4 92L<perlunicode>, L<Encode>
0a378802 93
94=cut
95
961;