lib/encoding.pm

   1 package encoding;
   2
   3 use Encode;
   4
   5 sub import {
   6     my ($class, $name) = @_;
   7     $name = $ENV{PERL_ENCODING} if @_ < 2;
   8     $name = "latin1" unless defined $name;
   9     my $enc = find_encoding($name);
  10     unless (defined $enc) {
  11         require Carp;
  12         Carp::croak "Unknown encoding '$name'";
  13     }
  14     ${^ENCODING} = $enc;
  15 }
  16
  17 =pod
  18
  19 =head1 NAME
  20
  21 encoding - pragma to control the conversion of legacy data into Unicode
  22
  23 =head1 SYNOPSIS
  24
  25     use encoding "iso 8859-7";
  26
  27     # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
  28
  29     $a = "\xDF";
  30     $b = "\x{100}";
  31
  32     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
  33
  34     $c = $a . $b;
  35
  36     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
  37
  38     # chr() is affected, and ...
  39
  40     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
  41
  42     # ... ord() is affected by the encoding pragma ...
  43
  44     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
  45
  46     # but pack/unpack are not affected, in case you still
  47     # want to get back to your native encoding
  48
  49     print "peta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
  50
  51 =head1 DESCRIPTION
  52
  53 Normally when legacy 8-bit data is converted to Unicode the data is
  54 expected to be Latin-1 (or EBCDIC in EBCDIC platforms).  With the
  55 encoding pragma you can change this default.
  56
  57 The pragma is a per script, not a per block lexical.  Only the last
  58 C<use encoding> matters, and it affects B<the whole script>.
  59
  60 Notice that only literals (string or regular expression) having only
  61 legacy code points are affected: if you mix data like this
  62
  63         \xDF\x{100}
  64
  65 the data is assumed to be in (Latin 1 and) Unicode, not in your native
  66 encoding.  In other words, this will match in "greek":
  67
  68         "\xDF" =~ /\x{3af}/
  69
  70 but this will not
  71
  72         "\xDF\x{100}" =~ /\x{3af}\x{100}/
  73
  74 since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
  75 because of the C<\x{100}> on the left.  You should not be mixing your
  76 legacy data and Unicode in the same string.
  77
  78 If no encoding is specified, the environment variable C<PERL_ENCODING>
  79 is consulted.  If that is not set, "latin1" (ISO 8859-1) is assumed.  If
  80 the encoding cannot be found, an C<Unknown encoding '...'> error is thrown.
  81
  82 =head1 KNOWN PROBLEMS
  83
  84 For native multibyte encodings (either fixed or variable length)
  85 the current implementation of the regular expressions may introduce
  86 recoding errors for regular expression literals longer than 127 bytes.
  87
  88 =head1 SEE ALSO
  89
  90 L<perlunicode>, L<Encode>
  91
  92 =cut
  93
  94 1;