X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2Fencoding.pm;h=642726da7e0f2d11676f48471544d4ccd3754e28;hb=5290524f8b52822096b01140005d681d126b507d;hp=7dacd7a5bdca24ad2d2e12011c9f0430c6f1d9d9;hpb=25f59a5e41d8ad4cca035052507fca0ebff1d7e6;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/lib/encoding.pm b/lib/encoding.pm
index 7dacd7a..642726d 100644
--- a/lib/encoding.pm
+++ b/lib/encoding.pm
@@ -1,10 +1,13 @@
 package encoding;
 
+our $VERSION = '1.00';
+
 use Encode;
 
 sub import {
     my ($class, $name) = @_;
     $name = $ENV{PERL_ENCODING} if @_ < 2;
+    $name = "latin1" unless defined $name;
     my $enc = find_encoding($name);
     unless (defined $enc) {
 	require Carp;
@@ -23,13 +26,29 @@ encoding - pragma to control the conversion of legacy data into Unicode
 
     use encoding "iso 8859-7";
 
+    # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
+
     $a = "\xDF";
     $b = "\x{100}";
 
+    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
+
     $c = $a . $b;
 
     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
-    # The \xDF of ISO 8859-7 is \x{3af} in Unicode.
+
+    # chr() is affected, and ...
+
+    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
+
+    # ... ord() is affected by the encoding pragma ...
+
+    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
+
+    # but pack/unpack are not affected, in case you still
+    # want to get back to your native encoding
+
+    print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 
 =head1 DESCRIPTION
 
@@ -38,16 +57,46 @@ expected to be Latin-1 (or EBCDIC in EBCDIC platforms).  With the
 encoding pragma you can change this default.
 
 The pragma is a per script, not a per block lexical.  Only the last
-'use encoding' seen matters.
+C<use encoding> matters, and it affects B<the whole script>.
+
+Notice that only literals (string or regular expression) having only
+legacy code points are affected: if you mix data like this
+
+	\xDF\x{100}
+
+the data is assumed to be in (Latin 1 and) Unicode, not in your native
+encoding.  In other words, this will match in "greek":
+
+	"\xDF" =~ /\x{3af}/
+
+but this will not
+
+	"\xDF\x{100}" =~ /\x{3af}\x{100}/
+
+since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
+because of the C<\x{100}> that follows it in the same string.  You
+should not be mixing your legacy data and Unicode in the same string.
+
+This pragma also affects encoding of the 0x80..0xFF code point range:
+normally characters in that range are left as eight-bit bytes (unless
+they are combined with characters with code points 0x100 or larger,
+in which case all characters need to become UTF-8 encoded), but if
+the C<encoding> pragma is present, even the 0x80..0xFF range always
+gets UTF-8 encoded.
+
+If no encoding is specified, the environment variable C<PERL_ENCODING>
+is consulted.  If that fails, "latin1" (ISO 8859-1) is assumed.  If no
+encoding can be found, an C<Unknown encoding '...'> error will be thrown.
 
-=head1 FUTURE POSSIBILITIES
+=head1 KNOWN PROBLEMS
 
-The C<\x..> and C<\0...> in literals and regular expressions are not
-affected by this pragma.  They probably should.  Ditto C<\N{...}>.
+For native multibyte encodings (either fixed or variable length)
+the current implementation of regular expressions may introduce
+recoding errors for regular expression literals longer than 127 bytes.
 
 =head1 SEE ALSO
 
-L<perlunicode>
+L<perlunicode>, L<Encode>
 
 =cut