ext/Encode/encoding.pm

   1 package encoding;
   2 our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
   3
   4 use Encode;
   5
   6 BEGIN {
   7     if (ord("A") == 193) {
   8         require Carp;
   9         Carp::croak "encoding pragma does not support EBCDIC platforms";
  10     }
  11 }
  12
  13 sub import {
  14     my $class = shift;
  15     my $name  = shift;
  16     my %arg = @_;
  17     $name ||= $ENV{PERL_ENCODING};
  18
  19     my $enc = find_encoding($name);
  20     unless (defined $enc) {
  21         require Carp;
  22         Carp::croak "Unknown encoding '$name'";
  23     }
  24     ${^ENCODING} = $enc; # this is all you need, actually.
  25
  26     # $_OPEN_ORIG = ${^OPEN};
  27     for my $h (qw(STDIN STDOUT STDERR)){
  28         if ($arg{$h}){
  29             unless (defined find_encoding($name)) {
  30                 require Carp;
  31                 Carp::croak "Unknown encoding for $fhname, '$arg{$h}'";
  32             }
  33             eval qq{ binmode($h, ":encoding($arg{h})") };
  34         }else{
  35             eval qq{ binmode($h, ":encoding($name)") };
  36         }
  37         if ($@){
  38             require Carp;
  39             Carp::croak($@);
  40         }
  41     }
  42     return 1; # I doubt if we need it, though
  43 }
  44
  45 sub unimport{
  46     no warnings;
  47     undef ${^ENCODING};
  48     binmode(STDIN,  ":raw");
  49     binmode(STDOUT, ":raw");
  50     binmode(STDERR, ":raw");
  51 }
  52
  53 1;
  54 __END__
  55 =pod
  56
  57 =head1 NAME
  58
  59 encoding -  allows you to write your script in non-asii or non-utf8
  60
  61 =head1 SYNOPSIS
  62
  63   use encoding "euc-jp"; # Jperl!
  64
  65   # or you can even do this if your shell supports euc-jp
  66
  67   > perl -Mencoding=euc-jp -e '...'
  68
  69   # or from the shebang line
  70
  71   #!/your/path/to/perl -Mencoding=euc-jp
  72
  73   # more control
  74
  75   # A simple euc-jp => utf-8 converter
  76   use encoding "euc-jp", STDOUT => "utf8";  while(<>){print};
  77
  78   # "no encoding;" supported (but not scoped!)
  79   no encoding;
  80
  81 =head1 ABSTRACT
  82
  83 Perl 5.6.0 has introduced Unicode support.  You could apply
  84 C<substr()> and regexes even to complex CJK characters -- so long as
  85 the script was written in UTF-8.  But back then text editors that
  86 support UTF-8 was still rare and many users rather chose to writer
  87 scripts in legacy encodings, given up whole new feature of Perl 5.6.
  88
  89 With B<encoding> pragma, you can write your script in any encoding you like
  90 (so long as the C<Encode> module supports it) and still enjoy Unicode
  91 support.  You can write a code in EUC-JP as follows;
  92
  93   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  94                #<-char-><-char->   # 4 octets
  95   s/\bCamel\b/$Rakuda/;
  96
  97 And with C<use encoding "euc-jp"> in effect, it is the same thing as
  98 the code in UTF-8 as follow.
  99
 100   my $Rakuda = "\x{99F1}\x{99DD}"; # who Unicode Characters
 101   s/\bCamel\b/$Rakuda/;
 102
 103 The B<encoding> pragma also modifies the file handle disciplines of
 104 STDIN, STDOUT, and STDERR to the specified encoding.  Therefore,
 105
 106   use encoding "euc-jp";
 107   my $message = "Camel is the symbol of perl.\n";
 108   my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
 109   $message =~ s/\bCamel\b/$Rakuda/;
 110   print $message;
 111
 112 Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not
 113 "\x{99F1}\x{99DD} is the symbol of perl.\n".
 114
 115 You can override this by giving extra arguments.  See below.
 116
 117 =head1 USAGE
 118
 119 =over 4
 120
 121 =item use encoding [I<ENCNAME>] ;
 122
 123 Sets the script encoding to I<ENCNAME> and file handle disciplines of
 124 STDIN, STDOUT, and STDERR are set to ":encoding(I<ENCNAME>)".
 125
 126 If no encoding is specified, the environment variable L<PERL_ENCODING>
 127 is consulted. If no  encoding can be found, C<Unknown encoding 'I<ENCNAME>'>
 128 error will be thrown.
 129
 130 Note that non-STD file handles remain unaffected.  Use C<use open> or
 131 C<binmode> to change disciplines of those.
 132
 133 =item use encoding I<ENCNAME> [ STDIN => I<ENCNAME_IN> ...] ;
 134
 135 You can also individually set encodings of STDIN, STDOUT, and STDERR
 136 via STDI<FH> => I<ENCNAME_FH> form.  In this case, you cannot omit the
 137 first I<ENCNAME>.
 138
 139 =item no encoding;
 140
 141 Unsets the script encoding and the disciplines of STDIN, STDOUT, and
 142 STDERR are reset to ":raw".
 143
 144 =back
 145
 146 =head1 CAVEATS
 147
 148 =head2 NOT SCOPED
 149
 150 The pragma is a per script, not a per block lexical.  Only the last
 151 C<use encoding> or C<matters, and it affects B<the whole script>.
 152 Though <no encoding> pragma is supported and C<use encoding> can
 153 appear as many times as you want in a given script, the multiple use
 154 of this pragma is discouraged.
 155
 156 =head2 DO NOT MIX MULTIPLE ENCODINGS
 157
 158 Notice that only literals (string or regular expression) having only
 159 legacy code points are affected: if you mix data like this
 160
 161         \xDF\x{100}
 162
 163 the data is assumed to be in (Latin 1 and) Unicode, not in your native
 164 encoding.  In other words, this will match in "greek":
 165
 166         "\xDF" =~ /\x{3af}/
 167
 168 but this will not
 169
 170         "\xDF\x{100}" =~ /\x{3af}\x{100}/
 171
 172 since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
 173 because of the C<\x{100}> on the left.  You should not be mixing your
 174 legacy data and Unicode in the same string.
 175
 176 This pragma also affects encoding of the 0x80..0xFF code point range:
 177 normally characters in that range are left as eight-bit bytes (unless
 178 they are combined with characters with code points 0x100 or larger,
 179 in which case all characters need to become UTF-8 encoded), but if
 180 the C<encoding> pragma is present, even the 0x80..0xFF range always
 181 gets UTF-8 encoded.
 182
 183 After all, the best thing about this pragma is that you don't have to
 184 resort to \x... just to spell your name in native encoding.  So feel
 185 free to put your strings in your encoding in quotes and regexes.
 186
 187 =head1 EXAMPLE - Greekperl
 188
 189     use encoding "iso 8859-7";
 190
 191     # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
 192
 193     $a = "\xDF";
 194     $b = "\x{100}";
 195
 196     printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
 197
 198     $c = $a . $b;
 199
 200     # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
 201
 202     # chr() is affected, and ...
 203
 204     print "mega\n"  if ord(chr(0xdf)) == 0x3af;
 205
 206     # ... ord() is affected by the encoding pragma ...
 207
 208     print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
 209
 210     # ... as are eq and cmp ...
 211
 212     print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
 213     print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
 214
 215     # ... but pack/unpack C are not affected, in case you still
 216     # want back to your native encoding
 217
 218     print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
 219
 220 =head1 KNOWN PROBLEMS
 221
 222 For native multibyte encodings (either fixed or variable length)
 223 the current implementation of the regular expressions may introduce
 224 recoding errors for longer regular expression literals than 127 bytes.
 225
 226 The encoding pragma is not supported on EBCDIC platforms.
 227 (Porters wanted.)
 228
 229 =head1 SEE ALSO
 230
 231 L<perlunicode>, L<Encode>, L<open>
 232
 233 =cut