ccflags, not ldflags.
[p5sagit/p5-mst-13.2.git] / lib / encoding.pm
CommitLineData
0a378802 1package encoding;
2
3use Encode;
4
# import - invoked for "use encoding NAME;".  Looks up the named encoding
# and stores the resulting object in ${^ENCODING}.
#
# Arguments:
#   $class - the package name passed in by "use" (not used here).
#   $name  - optional encoding name.  When no name argument is passed at
#            all, $ENV{PERL_ENCODING} is used instead; if the name is
#            still undefined after that, "latin1" is used.
#
# Croaks with "Unknown encoding '<name>'" when find_encoding() returns an
# undefined value for the name.
sub import {
    my ($class, $name) = @_;
    $name = $ENV{PERL_ENCODING} if @_ < 2;
    $name = "latin1" unless defined $name;
    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        # Call with parentheses: Carp is only loaded at run time here, and
        # the paren-less form parses only if Carp::croak happens to be
        # declared already when this file is compiled.
        Carp::croak("Unknown encoding '$name'");
    }
    ${^ENCODING} = $enc;
}
16
17=pod
18
19=head1 NAME
20
21encoding - pragma to control the conversion of legacy data into Unicode
22
23=head1 SYNOPSIS
24
25 use encoding "iso 8859-7";
26
121910a4 27 # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
4bdee82d 28
0a378802 29 $a = "\xDF";
30 $b = "\x{100}";
31
4bdee82d 32 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
33
0a378802 34 $c = $a . $b;
35
36 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
0a378802 37
121910a4 38 # chr() is affected, and ...
39
40 print "mega\n" if ord(chr(0xdf)) == 0x3af;
41
42 # ... ord() is affected by the encoding pragma ...
43
44 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
45
3de8ed06 46 # but pack/unpack are not affected, in case you still
121910a4 47 # want to go back to your native encoding
48
49 print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
50
0a378802 51=head1 DESCRIPTION
52
53Normally when legacy 8-bit data is converted to Unicode the data is
54expected to be Latin-1 (or EBCDIC on EBCDIC platforms). With the
55encoding pragma you can change this default.
56
57The pragma is per-script, not a per-block lexical. Only the last
9f4817db 58C<use encoding> matters, and it affects B<the whole script>.
0a378802 59
a72c7584 60Notice that only literals (string or regular expression) having only
61legacy code points are affected: if you mix data like this
62
63 \xDF\x{100}
64
65the data is assumed to be in (Latin 1 and) Unicode, not in your native
66encoding. In other words, this will match in "greek":
67
68 "\xDF" =~ /\x{3af}/
69
70but this will not
71
72 "\xDF\x{100}" =~ /\x{3af}\x{100}/
73
74since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
75because of the C<\x{100}> on the left. You should not be mixing your
76legacy data and Unicode in the same string.
77
4bdee82d 78If no encoding is specified, the environment variable C<PERL_ENCODING>
3de8ed06 79is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no
80encoding can be found, an C<Unknown encoding '...'> error will be thrown.
4bdee82d 81
6ec9efec 82=head1 KNOWN PROBLEMS
0a378802 83
a72c7584 84For native multibyte encodings (either fixed or variable length)
85the current implementation of the regular expressions may introduce
86recoding errors for regular expression literals longer than 127 bytes.
d521382b 87
0a378802 88=head1 SEE ALSO
89
121910a4 90L<perlunicode>, L<Encode>
0a378802 91
92=cut
93
941;