X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2Fencoding.pm;h=642726da7e0f2d11676f48471544d4ccd3754e28;hb=5290524f8b52822096b01140005d681d126b507d;hp=7dacd7a5bdca24ad2d2e12011c9f0430c6f1d9d9;hpb=25f59a5e41d8ad4cca035052507fca0ebff1d7e6;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/encoding.pm b/lib/encoding.pm index 7dacd7a..642726d 100644 --- a/lib/encoding.pm +++ b/lib/encoding.pm @@ -1,10 +1,13 @@ package encoding; +our $VERSION = '1.00'; + use Encode; sub import { my ($class, $name) = @_; $name = $ENV{PERL_ENCODING} if @_ < 2; + $name = "latin1" unless defined $name; my $enc = find_encoding($name); unless (defined $enc) { require Carp; @@ -23,13 +26,29 @@ encoding - pragma to control the conversion of legacy data into Unicode use encoding "iso 8859-7"; + # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. + $a = "\xDF"; $b = "\x{100}"; + printf "%#x\n", ord($a); # will print 0x3af, not 0xdf + $c = $a . $b; # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". - # The \xDF of ISO 8859-7 is \x{3af} in Unicode. + + # chr() is affected, and ... + + print "mega\n" if ord(chr(0xdf)) == 0x3af; + + # ... ord() is affected by the encoding pragma ... + + print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; + + # but pack/unpack are not affected, in case you still + # want back to your native encoding + + print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; =head1 DESCRIPTION @@ -38,16 +57,46 @@ expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the encoding pragma you can change this default. The pragma is a per script, not a per block lexical. Only the last -'use encoding' seen matters. +C<use encoding> matters, and it affects B<the whole script>. + +Notice that only literals (string or regular expression) having only +legacy code points are affected: if you mix data like this + + \xDF\x{100} + +the data is assumed to be in (Latin 1 and) Unicode, not in your native +encoding. 
In other words, this will match in "greek": + + "\xDF" =~ /\x{3af}/ + +but this will not + + "\xDF\x{100}" =~ /\x{3af}\x{100}/ + +since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}> +because of the C<\x{100}> on the left. You should not be mixing your +legacy data and Unicode in the same string. + +This pragma also affects encoding of the 0x80..0xFF code point range: +normally characters in that range are left as eight-bit bytes (unless +they are combined with characters with code points 0x100 or larger, +in which case all characters need to become UTF-8 encoded), but if +the C<encoding> pragma is present, even the 0x80..0xFF range always +gets UTF-8 encoded. + +If no encoding is specified, the environment variable L<PERL_ENCODING> +is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no +encoding can be found, an C<Unknown encoding 'I<encoding>'> error will be thrown. -=head1 FUTURE POSSIBILITIES +=head1 KNOWN PROBLEMS -The C<\x..> and C<\0...> in literals and regular expressions are not -affected by this pragma. They probably should. Ditto C<\N{...}>. +For native multibyte encodings (either fixed or variable length) +the current implementation of the regular expressions may introduce +recoding errors for longer regular expression literals than 127 bytes. =head1 SEE ALSO -L<perlunicode> +L<perlunicode>, L<Encode> =cut