[p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm

# $Id: encoding.pm,v 1.44 2003/03/09 20:07:37 dankogai Exp dankogai $
package encoding;
our $VERSION = do { my @r = (q$Revision: 1.44 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use Encode;
use strict;
our $DEBUG = 0;

BEGIN {
    if (ord("A") == 193) {
	require Carp;
	Carp::croak("encoding pragma does not support EBCDIC platforms");
    }
}

our $HAS_PERLIO = 0;
eval { require PerlIO::encoding };
unless ($@){
    $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
}

sub _exception{
    my $name = shift;
    $] > 5.008 and return 0;             # 5.8.1 then no
    my %utfs = map {$_=>1}
	qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
	   UTF-32 UTF-32BE UTF-32LE);
    $utfs{$name} or return 0;            # UTFs or no
    require Config; Config->import(); our %Config;
    return $Config{perl_patchlevel} == 0 # maintperl then no
}

sub import {
    my $class = shift;
    my $name  = shift;
    my %arg = @_;
    $name ||= $ENV{PERL_ENCODING};
    my $enc = find_encoding($name);
    unless (defined $enc) {
	require Carp;
	Carp::croak("Unknown encoding '$name'");
    }
    $name = $enc->name; # canonize
    unless ($arg{Filter}) {
	$DEBUG and warn "_exception($name) = ", _exception($name);
	_exception($name) or ${^ENCODING} = $enc;
	$HAS_PERLIO or return 1;
    }else{
	defined(${^ENCODING}) and undef ${^ENCODING};
	# implicitly 'use utf8'
	require utf8; # to fetch $utf8::hint_bits;
	$^H |= $utf8::hint_bits;
	eval {
	    require Filter::Util::Call ;
	    Filter::Util::Call->import ;
	    filter_add(sub{
			   my $status = filter_read();
                           if ($status > 0){
			       # $DEBUG and warn $_;
			       $_ = $enc->decode($_, 1);
			       $DEBUG and warn $_;
			   }
			   $status ;
		       });
	};
    }	$DEBUG and warn "Filter installed";
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
    for my $h (qw(STDIN STDOUT)){
	if ($arg{$h}){
	    unless (defined find_encoding($arg{$h})) {
		require Carp;
		Carp::croak("Unknown encoding for $h, '$arg{$h}'");
	    }
	    eval { binmode($h, ":raw :encoding($arg{$h})") };
	}else{
	    unless (exists $arg{$h}){
		eval { 
		    no warnings 'uninitialized';
		    binmode($h, ":raw :encoding($name)");
		};
	    }
	}
	if ($@){
	    require Carp;
	    Carp::croak($@);
	}
    }
    return 1; # I doubt if we need it, though
}

sub unimport{
    no warnings;
    undef ${^ENCODING};
    if ($HAS_PERLIO){
	binmode(STDIN,  ":raw");
	binmode(STDOUT, ":raw");
    }else{
	binmode(STDIN);
	binmode(STDOUT);
    }
    if ($INC{"Filter/Util/Call.pm"}){
	eval { filter_del() };
    }
}

1;
__END__

=pod

=head1 NAME

encoding - allows you to write your script in non-ascii or non-utf8

=head1 SYNOPSIS

  use encoding "greek";  # Perl like Greek to you?
  use encoding "euc-jp"; # Jperl!

  # or you can even do this if your shell supports your native encoding

  perl -Mencoding=latin2 -e '...' # Feeling centrally European?
  perl -Mencoding=euc-kr -e '...' # Or Korean?

  # more control

  # A simple euc-cn => utf-8 converter
  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};

  # "no encoding;" supported (but not scoped!)
  no encoding;

  # an alternate way, Filter
  use encoding "euc-jp", Filter=>1;
  # now you can use kanji identifiers -- in euc-jp!

=head1 ABSTRACT

Let's start with a bit of history: Perl 5.6.0 introduced Unicode
support.  You could apply C<substr()> and regexes even to complex CJK
characters -- so long as the script was written in UTF-8.  But back
then, text editors that supported UTF-8 were still rare and many users
instead chose to write scripts in legacy encodings, giving up a whole
new feature of Perl 5.6.

Rewind to the future: starting from perl 5.8.0 with the B<encoding>
pragma, you can write your script in any encoding you like (so long
as the C<Encode> module supports it) and still enjoy Unicode support.
This pragma achieves that by doing the following:

=over

=item *

Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
C<tr///> and C<DATA> pseudo-filehandle are also converted.

=item *

Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
 specified.

=back

=head2 Literal Conversions

You can write code in EUC-JP as follows:

  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
               #<-char-><-char->   # 4 octets
  s/\bCamel\b/$Rakuda/;

And with C<use encoding "euc-jp"> in effect, it is the same thing as
the code in UTF-8:

  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
  s/\bCamel\b/$Rakuda/;

=head2 PerlIO layers for C<STD(IN|OUT)>

The B<encoding> pragma also modifies the filehandle layers of
STDIN and STDOUT to the specified encoding.  Therefore,

  use encoding "euc-jp";
  my $message = "Camel is the symbol of perl.\n";
  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  $message =~ s/\bCamel\b/$Rakuda/;
  print $message;

Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
not "\x{99F1}\x{99DD} is the symbol of perl.\n".

You can override this by giving extra arguments; see below.

=head1 FEATURES THAT REQUIRE 5.8.1

Some of the features offered by this pragma requires perl 5.8.1.  Most
of these are done by Inaba Hiroto.  Any other features and changes
are good for 5.8.0.

=over

=item "NON-EUC" doublebyte encodings

Because perl needs to parse script before applying this pragma, such
encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
\x5c) in the second byte fails because the second byte may
accidentally escape the quoting character that follows.  Perl 5.8.1
or later fixes this problem.

=item tr// 

C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
See the section below for details.

=item DATA pseudo-filehandle

Another feature that was overlooked was C<DATA>. 

=back

=head1 USAGE

=over 4

=item use encoding [I<ENCNAME>] ;

Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE} 
exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
":encoding(I<ENCNAME>)".

Note that STDERR WILL NOT be changed.

Also note that non-STD file handles remain unaffected.  Use C<use
open> or C<binmode> to change layers of those.

If no encoding is specified, the environment variable L<PERL_ENCODING>
is consulted.  If no encoding can be found, the error C<Unknown encoding
'I<ENCNAME>'> will be thrown.

=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;

You can also individually set encodings of STDIN and STDOUT via the
C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
completely off.

When ${^UNICODE} exists and non-zero, these options will completely
ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
details (perl 5.8.1 and later).

=item use encoding I<ENCNAME> Filter=E<gt>1;

This turns the encoding pragma into a source filter.  While the
default approach just decodes interpolated literals (in qq() and
qr()), this will apply a source filter to the entire source code.  See
L</"The Filter Option"> below for details.

=item no encoding;

Unsets the script encoding. The layers of STDIN, STDOUT are
reset to ":raw" (the default unprocessed raw stream of bytes).

=back

=head1 The Filter Option

The magic of C<use encoding> is not applied to the names of
identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
is a single Han ideograph) work, you still need to write your script
in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.

What does this mean?  Your source code behaves as if it is written in
UTF-8 with 'use utf8' in effect.  So even if your editor only supports
Shift_JIS, for example, you can still try examples in Chapter 15 of
C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
identifiers.

This option is significantly slower and (as of this writing) non-ASCII
identifiers are not very stable WITHOUT this option and with the
source code written in UTF-8.

=head2 Filter-related changes at Encode version 1.87

=over

=item *

The Filter option now sets STDIN and STDOUT like non-filter options.
And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
non-filter version.

=item *

C<use utf8> is implicitly declared so you no longer have to C<use
utf8> to C<${"\x{4eba}"}++>.

=back

=head1 CAVEATS

=head2 NOT SCOPED

The pragma is a per script, not a per block lexical.  Only the last
C<use encoding> or C<no encoding> matters, and it affects 
B<the whole script>.  However, the <no encoding> pragma is supported and 
B<use encoding> can appear as many times as you want in a given script. 
The multiple use of this pragma is discouraged.

By the same reason, the use this pragma inside modules is also
discouraged (though not as strongly discouranged as the case above.  
See below).

If you still have to write a module with this pragma, be very careful
of the load order.  See the codes below;

  # called module
  package Module_IN_BAR;
  use encoding "bar";
  # stuff in "bar" encoding here
  1;

  # caller script
  use encoding "foo"
  use Module_IN_BAR;
  # surprise! use encoding "bar" is in effect.

The best way to avoid this oddity is to use this pragma RIGHT AFTER
other modules are loaded.  i.e.

  use Module_IN_BAR;
  use encoding "foo";

=head2 DO NOT MIX MULTIPLE ENCODINGS

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

	\xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

	"\xDF" =~ /\x{3af}/

but this will not

	"\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
should not be mixing your legacy data and Unicode in the same string.

This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
they are combined with characters with code points 0x100 or larger,
in which case all characters need to become UTF-8 encoded), but if
the C<encoding> pragma is present, even the 0x80..0xFF range always
gets UTF-8 encoded.

After all, the best thing about this pragma is that you don't have to
resort to \x{....} just to spell your name in a native encoding.
So feel free to put your strings in your encoding in quotes and
regexes.

=head2 tr/// with ranges

The B<encoding> pragma works by decoding string literals in
C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl 5.8.0, this
does not apply to C<tr///>.  Therefore,

  use encoding 'euc-jp';
  #....
  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
  #           -------- -------- -------- --------

Does not work as

  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;

=over

=item Legend of characters above

  utf8     euc-jp   charnames::viacode()
  -----------------------------------------
  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
  \x{3093} \xA4\xF3 HIRAGANA LETTER N
  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
  \x{30f3} \xA5\xF3 KATAKANA LETTER N

=back

This counterintuitive behavior has been fixed in perl 5.8.1.

=head3 workaround to tr///;

In perl 5.8.0, you can work aroud as follows;

  use encoding 'euc-jp';
  #  ....
  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };

Note the C<tr//> expression is surronded by C<qq{}>.  The idea behind
is the same as classic idiom that makes C<tr///> 'interpolate'.

   tr/$from/$to/;            # wrong!
   eval qq{ tr/$from/$to/ }; # workaround.

Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
C<tr///> not being decoded was obviously against the will of Perl5
Porters so it has been fixed in Perl 5.8.1 or later.

=head1 EXAMPLE - Greekperl

    use encoding "iso 8859-7";

    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # ... as are eq and cmp ...

    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;

    # ... but pack/unpack C are not affected, in case you still
    # want to go back to your native encoding

    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 KNOWN PROBLEMS

=over

=item literals in regex that are longer than 127 bytes

For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

=item EBCDIC

The encoding pragma is not supported on EBCDIC platforms.
(Porters who are willing and able to remove this limitation are
welcome.)

=item format

This pragma doesn't work well with format because PerlIO does not
get along very well with it.  When format contains non-ascii
characters it prints funny or gets "wide character warnings".
To understand it, try the code below.

  # Save this one in utf8
  # replace *non-ascii* with a non-ascii string
  my $camel;
  format STDOUT =
  *non-ascii*@>>>>>>>
  $camel
  .
  $camel = "*non-ascii*";
  binmode(STDOUT=>':encoding(utf8)'); # bang!
  write;              # funny 
  print $camel, "\n"; # fine

Without binmode this happens to work but without binmode, print()
fails instead of write().

At any rate, the very use of format is questionable when it comes to
unicode characters since you have to consider such things as character
width (i.e. double-width for ideographs) and directions (i.e. BIDI for
Arabic and Hebrew).

=back

=head1 HISTORY

This pragma first appeared in Perl 5.8.0.  For features that require 
5.8.1 and better, see above.

=head1 SEE ALSO

L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,

Ch. 15 of C<Programming Perl (3rd Edition)>
by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8

=cut
Commit	Line	Data
0f29a567	1	# $Id: encoding.pm,v 1.44 2003/03/09 20:07:37 dankogai Exp dankogai $
3ef515df	2	package encoding;
0f29a567	3	our $VERSION = do { my @r = (q$Revision: 1.44 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df	4
3ef515df	5	use Encode;
046f36bf	6	use strict;
151b5d36	7	our $DEBUG = 0;
3ef515df	8
	9	BEGIN {
	10	if (ord("A") == 193) {
	11	require Carp;
10c5ecbb	12	Carp::croak("encoding pragma does not support EBCDIC platforms");
3ef515df	13	}
	14	}
	15
0ab8f81e	16	our $HAS_PERLIO = 0;
	17	eval { require PerlIO::encoding };
	18	unless ($@){
	19	$HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
	20	}
b2704119	21
151b5d36	22	sub _exception{
	23	my $name = shift;
	24	$] > 5.008 and return 0; # 5.8.1 then no
	25	my %utfs = map {$_=>1}
	26	qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
	27	UTF-32 UTF-32BE UTF-32LE);
	28	$utfs{$name} or return 0; # UTFs or no
	29	require Config; Config->import(); our %Config;
	30	return $Config{perl_patchlevel} == 0 # maintperl then no
	31	}
fa6f41cf	32
3ef515df	33	sub import {
	34	my $class = shift;
	35	my $name = shift;
	36	my %arg = @_;
	37	$name \|\|= $ENV{PERL_ENCODING};
3ef515df	38	my $enc = find_encoding($name);
	39	unless (defined $enc) {
	40	require Carp;
10c5ecbb	41	Carp::croak("Unknown encoding '$name'");
3ef515df	42	}
fa6f41cf	43	$name = $enc->name; # canonize
0f7c507f	44	unless ($arg{Filter}) {
151b5d36	45	$DEBUG and warn "_exception($name) = ", _exception($name);
151b5d36	46	_exception($name) or ${^ENCODING} = $enc;
85982a32	47	$HAS_PERLIO or return 1;
aae85ceb	48	}else{
aae85ceb	49	defined(${^ENCODING}) and undef ${^ENCODING};
151b5d36	50	# implicitly 'use utf8'
	51	require utf8; # to fetch $utf8::hint_bits;
	52	$^H \|= $utf8::hint_bits;
aae85ceb	53	eval {
	54	require Filter::Util::Call ;
	55	Filter::Util::Call->import ;
aae85ceb	56	filter_add(sub{
151b5d36	57	my $status = filter_read();
	58	if ($status > 0){
	59	# $DEBUG and warn $_;
aae85ceb	60	$_ = $enc->decode($_, 1);
151b5d36	61	$DEBUG and warn $_;
aae85ceb	62	}
	63	$status ;
	64	});
	65	};
6be7c101	66	} $DEBUG and warn "Filter installed";
05ef2f67	67	defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
151b5d36	68	for my $h (qw(STDIN STDOUT)){
	69	if ($arg{$h}){
	70	unless (defined find_encoding($arg{$h})) {
	71	require Carp;
	72	Carp::croak("Unknown encoding for $h, '$arg{$h}'");
	73	}
6be7c101	74	eval { binmode($h, ":raw :encoding($arg{$h})") };
151b5d36	75	}else{
	76	unless (exists $arg{$h}){
	77	eval {
	78	no warnings 'uninitialized';
6be7c101	79	binmode($h, ":raw :encoding($name)");
151b5d36	80	};
	81	}
	82	}
	83	if ($@){
	84	require Carp;
	85	Carp::croak($@);
	86	}
3ef515df	87	}
	88	return 1; # I doubt if we need it, though
	89	}
	90
	91	sub unimport{
	92	no warnings;
	93	undef ${^ENCODING};
621b0f8d	94	if ($HAS_PERLIO){
	95	binmode(STDIN, ":raw");
	96	binmode(STDOUT, ":raw");
	97	}else{
6be7c101	98	binmode(STDIN);
6be7c101	99	binmode(STDOUT);
621b0f8d	100	}
aae85ceb	101	if ($INC{"Filter/Util/Call.pm"}){
	102	eval { filter_del() };
	103	}
3ef515df	104	}
	105
	106	1;
	107	__END__
85982a32	108
3ef515df	109	=pod
	110
	111	=head1 NAME
	112
0ab8f81e	113	encoding - allows you to write your script in non-ascii or non-utf8
3ef515df	114
	115	=head1 SYNOPSIS
	116
962111ca	117	use encoding "greek"; # Perl like Greek to you?
3ef515df	118	use encoding "euc-jp"; # Jperl!
3ef515df	119
962111ca	120	# or you can even do this if your shell supports your native encoding
3ef515df	121
962111ca	122	perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e	123	perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df	124
3ef515df	125	# more control
3ef515df	126
962111ca	127	# A simple euc-cn => utf-8 converter
6d1c0808	128	use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df	129
	130	# "no encoding;" supported (but not scoped!)
	131	no encoding;
	132
aae85ceb	133	# an alternate way, Filter
aae85ceb	134	use encoding "euc-jp", Filter=>1;
aae85ceb	135	# now you can use kanji identifiers -- in euc-jp!
aae85ceb	136
3ef515df	137	=head1 ABSTRACT
3ef515df	138
962111ca	139	Let's start with a bit of history: Perl 5.6.0 introduced Unicode
	140	support. You could apply C<substr()> and regexes even to complex CJK
	141	characters -- so long as the script was written in UTF-8. But back
0ab8f81e	142	then, text editors that supported UTF-8 were still rare and many users
	143	instead chose to write scripts in legacy encodings, giving up a whole
	144	new feature of Perl 5.6.
3ef515df	145
0ab8f81e	146	Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca	147	pragma, you can write your script in any encoding you like (so long
962111ca	148	as the C<Encode> module supports it) and still enjoy Unicode support.
0f29a567	149	This pragma achieves that by doing the following:
05ef2f67	150
	151	=over
	152
	153	=item *
	154
	155	Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
	156	the encoding specified to utf8. In Perl 5.8.1 and later, literals in
	157	C<tr///> and C<DATA> pseudo-filehandle are also converted.
	158
	159	=item *
	160
	161	Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
	162	specified.
	163
	164	=back
	165
	166	=head2 Literal Conversions
	167
0ab8f81e	168	You can write code in EUC-JP as follows:
3ef515df	169
	170	my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
	171	#<-char-><-char-> # 4 octets
	172	s/\bCamel\b/$Rakuda/;
	173
	174	And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca	175	the code in UTF-8:
3ef515df	176
32b9ed1f	177	my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df	178	s/\bCamel\b/$Rakuda/;
3ef515df	179
05ef2f67	180	=head2 PerlIO layers for C<STD(IN\|OUT)>
	181
	182	The B<encoding> pragma also modifies the filehandle layers of
4b291ae6	183	STDIN and STDOUT to the specified encoding. Therefore,
3ef515df	184
	185	use encoding "euc-jp";
	186	my $message = "Camel is the symbol of perl.\n";
	187	my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
	188	$message =~ s/\bCamel\b/$Rakuda/;
	189	print $message;
	190
962111ca	191	Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
962111ca	192	not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df	193
0ab8f81e	194	You can override this by giving extra arguments; see below.
3ef515df	195
05ef2f67	196	=head1 FEATURES THAT REQUIRE 5.8.1
	197
	198	Some of the features offered by this pragma requires perl 5.8.1. Most
0f29a567	199	of these are done by Inaba Hiroto. Any other features and changes
05ef2f67	200	are good for 5.8.0.
	201
	202	=over
	203
	204	=item "NON-EUC" doublebyte encodings
	205
0f29a567	206	Because perl needs to parse script before applying this pragma, such
05ef2f67	207	encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
05ef2f67	208	\x5c) in the second byte fails because the second byte may
0f29a567	209	accidentally escape the quoting character that follows. Perl 5.8.1
05ef2f67	210	or later fixes this problem.
	211
	212	=item tr//
	213
	214	C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
	215	See the section below for details.
	216
	217	=item DATA pseudo-filehandle
	218
	219	Another feature that was overlooked was C<DATA>.
	220
	221	=back
	222
3ef515df	223	=head1 USAGE
	224
	225	=over 4
	226
	227	=item use encoding [I<ENCNAME>] ;
	228
05ef2f67	229	Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
	230	exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
	231	":encoding(I<ENCNAME>)".
	232
	233	Note that STDERR WILL NOT be changed.
	234
	235	Also note that non-STD file handles remain unaffected. Use C<use
	236	open> or C<binmode> to change layers of those.
3ef515df	237
3ef515df	238	If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca	239	is consulted. If no encoding can be found, the error C<Unknown encoding
962111ca	240	'I<ENCNAME>'> will be thrown.
3ef515df	241
aae85ceb	242	=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df	243
0ab8f81e	244	You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f	245	C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
32b9ed1f	246	first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb	247	completely off.
3ef515df	248
05ef2f67	249	When ${^UNICODE} exists and non-zero, these options will completely
	250	ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See
	251	L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
	252	details (perl 5.8.1 and later).
	253
151b5d36	254	=item use encoding I<ENCNAME> Filter=E<gt>1;
	255
	256	This turns the encoding pragma into a source filter. While the
	257	default approach just decodes interpolated literals (in qq() and
	258	qr()), this will apply a source filter to the entire source code. See
05ef2f67	259	L</"The Filter Option"> below for details.
151b5d36	260
3ef515df	261	=item no encoding;
3ef515df	262
05ef2f67	263	Unsets the script encoding. The layers of STDIN, STDOUT are
962111ca	264	reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df	265
	266	=back
	267
151b5d36	268	=head1 The Filter Option
	269
	270	The magic of C<use encoding> is not applied to the names of
	271	identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
	272	is a single Han ideograph) work, you still need to write your script
	273	in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
	274
151b5d36	275	What does this mean? Your source code behaves as if it is written in
	276	UTF-8 with 'use utf8' in effect. So even if your editor only supports
	277	Shift_JIS, for example, you can still try examples in Chapter 15 of
	278	C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
	279	identifiers.
	280
	281	This option is significantly slower and (as of this writing) non-ASCII
	282	identifiers are not very stable WITHOUT this option and with the
	283	source code written in UTF-8.
	284
	285	=head2 Filter-related changes at Encode version 1.87
	286
	287	=over
	288
	289	=item *
	290
	291	The Filter option now sets STDIN and STDOUT like non-filter options.
	292	And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
	293	non-filter version.
	294
	295	=item *
	296
	297	C<use utf8> is implicitly declared so you no longer have to C<use
	298	utf8> to C<${"\x{4eba}"}++>.
	299
	300	=back
	301
3ef515df	302	=head1 CAVEATS
	303
	304	=head2 NOT SCOPED
	305
	306	The pragma is a per script, not a per block lexical. Only the last
621b0f8d	307	C<use encoding> or C<no encoding> matters, and it affects
	308	B<the whole script>. However, the <no encoding> pragma is supported and
	309	B<use encoding> can appear as many times as you want in a given script.
	310	The multiple use of this pragma is discouraged.
	311
0f29a567	312	By the same reason, the use this pragma inside modules is also
	313	discouraged (though not as strongly discouranged as the case above.
	314	See below).
05ef2f67	315
	316	If you still have to write a module with this pragma, be very careful
	317	of the load order. See the codes below;
	318
	319	# called module
	320	package Module_IN_BAR;
	321	use encoding "bar";
	322	# stuff in "bar" encoding here
	323	1;
	324
	325	# caller script
	326	use encoding "foo"
	327	use Module_IN_BAR;
	328	# surprise! use encoding "bar" is in effect.
	329
	330	The best way to avoid this oddity is to use this pragma RIGHT AFTER
	331	other modules are loaded. i.e.
	332
	333	use Module_IN_BAR;
	334	use encoding "foo";
3ef515df	335
	336	=head2 DO NOT MIX MULTIPLE ENCODINGS
	337
	338	Notice that only literals (string or regular expression) having only
	339	legacy code points are affected: if you mix data like this
	340
	341	\xDF\x{100}
	342
	343	the data is assumed to be in (Latin 1 and) Unicode, not in your native
	344	encoding. In other words, this will match in "greek":
	345
	346	"\xDF" =~ /\x{3af}/
	347
	348	but this will not
	349
	350	"\xDF\x{100}" =~ /\x{3af}\x{100}/
	351
962111ca	352	since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
	353	the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
	354	LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
	355	should not be mixing your legacy data and Unicode in the same string.
3ef515df	356
	357	This pragma also affects encoding of the 0x80..0xFF code point range:
	358	normally characters in that range are left as eight-bit bytes (unless
	359	they are combined with characters with code points 0x100 or larger,
	360	in which case all characters need to become UTF-8 encoded), but if
	361	the C<encoding> pragma is present, even the 0x80..0xFF range always
	362	gets UTF-8 encoded.
	363
	364	After all, the best thing about this pragma is that you don't have to
0ab8f81e	365	resort to \x{....} just to spell your name in a native encoding.
	366	So feel free to put your strings in your encoding in quotes and
	367	regexes.
3ef515df	368
151b5d36	369	=head2 tr/// with ranges
4b291ae6	370
4b291ae6	371	The B<encoding> pragma works by decoding string literals in
151b5d36	372	C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
4b291ae6	373	does not apply to C<tr///>. Therefore,
	374
	375	use encoding 'euc-jp';
	376	#....
	377	$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
	378	# -------- -------- -------- --------
	379
	380	Does not work as
	381
	382	$kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
	383
	384	=over
	385
	386	=item Legend of characters above
	387
	388	utf8 euc-jp charnames::viacode()
	389	-----------------------------------------
	390	\x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
	391	\x{3093} \xA4\xF3 HIRAGANA LETTER N
	392	\x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
	393	\x{30f3} \xA5\xF3 KATAKANA LETTER N
	394
	395	=back
	396
05ef2f67	397	This counterintuitive behavior has been fixed in perl 5.8.1.
151b5d36	398
4b291ae6	399	=head3 workaround to tr///;
4b291ae6	400
151b5d36	401	In perl 5.8.0, you can work aroud as follows;
4b291ae6	402
4b291ae6	403	use encoding 'euc-jp';
151b5d36	404	# ....
4b291ae6	405	eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
	406
	407	Note the C<tr//> expression is surronded by C<qq{}>. The idea behind
	408	is the same as classic idiom that makes C<tr///> 'interpolate'.
	409
	410	tr/$from/$to/; # wrong!
	411	eval qq{ tr/$from/$to/ }; # workaround.
	412
	413	Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
	414	C<tr///> not being decoded was obviously against the will of Perl5
05ef2f67	415	Porters so it has been fixed in Perl 5.8.1 or later.
aae85ceb	416
3ef515df	417	=head1 EXAMPLE - Greekperl
	418
	419	use encoding "iso 8859-7";
	420
0ab8f81e	421	# \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df	422
	423	$a = "\xDF";
	424	$b = "\x{100}";
	425
	426	printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
	427
	428	$c = $a . $b;
	429
	430	# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
	431
	432	# chr() is affected, and ...
	433
	434	print "mega\n" if ord(chr(0xdf)) == 0x3af;
	435
	436	# ... ord() is affected by the encoding pragma ...
	437
	438	print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
	439
	440	# ... as are eq and cmp ...
	441
	442	print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
	443	print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
	444
	445	# ... but pack/unpack C are not affected, in case you still
0ab8f81e	446	# want to go back to your native encoding
3ef515df	447
	448	print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
	449
	450	=head1 KNOWN PROBLEMS
	451
151b5d36	452	=over
151b5d36	453
0f29a567	454	=item literals in regex that are longer than 127 bytes
151b5d36	455
0ab8f81e	456	For native multibyte encodings (either fixed or variable length),
3ef515df	457	the current implementation of the regular expressions may introduce
0ab8f81e	458	recoding errors for regular expression literals longer than 127 bytes.
3ef515df	459
05ef2f67	460	=item EBCDIC
151b5d36	461
3ef515df	462	The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e	463	(Porters who are willing and able to remove this limitation are
0ab8f81e	464	welcome.)
3ef515df	465
05ef2f67	466	=item format
	467
	468	This pragma doesn't work well with format because PerlIO does not
	469	get along very well with it. When format contains non-ascii
	470	characters it prints funny or gets "wide character warnings".
	471	To understand it, try the code below.
	472
	473	# Save this one in utf8
	474	# replace non-ascii with a non-ascii string
	475	my $camel;
	476	format STDOUT =
	477	non-ascii@>>>>>>>
	478	$camel
	479	.
	480	$camel = "non-ascii";
	481	binmode(STDOUT=>':encoding(utf8)'); # bang!
	482	write; # funny
	483	print $camel, "\n"; # fine
	484
	485	Without binmode this happens to work but without binmode, print()
	486	fails instead of write().
	487
	488	At any rate, the very use of format is questionable when it comes to
	489	unicode characters since you have to consider such things as character
	490	width (i.e. double-width for ideographs) and directions (i.e. BIDI for
	491	Arabic and Hebrew).
	492
151b5d36	493	=back
151b5d36	494
05ef2f67	495	=head1 HISTORY
	496
	497	This pragma first appeared in Perl 5.8.0. For features that require
	498	5.8.1 and better, see above.
	499
3ef515df	500	=head1 SEE ALSO
3ef515df	501
aae85ceb	502	L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
	503
	504	Ch. 15 of C<Programming Perl (3rd Edition)>
	505	by Larry Wall, Tom Christiansen, Jon Orwant;
	506	O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df	507
3ef515df	508	=cut