X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2Fencoding.pm;h=642726da7e0f2d11676f48471544d4ccd3754e28;hb=5290524f8b52822096b01140005d681d126b507d;hp=be0fd73a0042b98ad1fef311f1fa1ecaa3a09d4c;hpb=4bdee82d7bc2074b4348e7a8af9486f836481a0a;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/encoding.pm b/lib/encoding.pm index be0fd73..642726d 100644 --- a/lib/encoding.pm +++ b/lib/encoding.pm @@ -1,10 +1,13 @@ package encoding; +our $VERSION = '1.00'; + use Encode; sub import { my ($class, $name) = @_; $name = $ENV{PERL_ENCODING} if @_ < 2; + $name = "latin1" unless defined $name; my $enc = find_encoding($name); unless (defined $enc) { require Carp; @@ -23,7 +26,7 @@ encoding - pragma to control the conversion of legacy data into Unicode use encoding "iso 8859-7"; - # The \xDF of ISO 8859-7 is \x{3af} in Unicode. + # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. $a = "\xDF"; $b = "\x{100}"; @@ -34,6 +37,19 @@ encoding - pragma to control the conversion of legacy data into Unicode # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". + # chr() is affected, and ... + + print "mega\n" if ord(chr(0xdf)) == 0x3af; + + # ... ord() is affected by the encoding pragma ... + + print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; + + # but pack/unpack are not affected, in case you still + # want back to your native encoding + + print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; + =head1 DESCRIPTION Normally when legacy 8-bit data is converted to Unicode the data is @@ -43,27 +59,44 @@ encoding pragma you can change this default. The pragma is a per script, not a per block lexical. Only the last C<use encoding> matters, and it affects B<the whole script>. -If no encoding is specified, the environment variable L<PERL_ENCODING> -is consulted. If no encoding can be found, C<Unknown encoding> -error will be thrown. 
+Notice that only literals (string or regular expression) having only +legacy code points are affected: if you mix data like this + + \xDF\x{100} + +the data is assumed to be in (Latin 1 and) Unicode, not in your native +encoding. In other words, this will match in "greek": + + "\xDF" =~ /\x{3af}/ -=head1 FUTURE POSSIBILITIES +but this will not -The C<\x..> and C<\0...> in regular expressions are not -affected by this pragma. They probably should. + "\xDF\x{100}" =~ /\x{3af}\x{100}/ -Also chr(), ord(), and C<\N{...}> might become affected. +since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}> +because of the C<\x{100}> on the left. You should not be mixing your +legacy data and Unicode in the same string. + +This pragma also affects encoding of the 0x80..0xFF code point range: +normally characters in that range are left as eight-bit bytes (unless +they are combined with characters with code points 0x100 or larger, +in which case all characters need to become UTF-8 encoded), but if +the C<encoding> pragma is present, even the 0x80..0xFF range always +gets UTF-8 encoded. + +If no encoding is specified, the environment variable L<PERL_ENCODING> +is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no +encoding can be found, C<Unknown encoding> error will be thrown. =head1 KNOWN PROBLEMS -Cannot be combined with C<use utf8>. Note that this is a problem -B<only> if you would like to have Unicode identifiers in your scripts. -You should not need C<use utf8> for anything else these days -(since Perl 5.8.0) +For native multibyte encodings (either fixed or variable length) +the current implementation of the regular expressions may introduce +recoding errors for longer regular expression literals than 127 bytes. =head1 SEE ALSO -L, L +L, L =cut