# [p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm

# $Id: encoding.pm,v 1.46 2003/07/08 21:52:14 dankogai Exp $
package encoding;
# Module version, derived from the Revision keyword string below: the digit
# groups are extracted and formatted as "major.minor" with the minor part
# zero-padded to two digits (yielding "1.46" for revision 1.46).
our $VERSION = do { my @r = (q$Revision: 1.46 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use Encode;
use strict;
# Debug switch used as "DEBUG and warn ..." throughout this file.  It is a
# constant sub (empty prototype, literal body); change the 0 to a true value
# to enable the diagnostic warn() calls.
sub DEBUG () { 0 }

BEGIN {
    # Compile-time platform guard: ord("A") is 193 on EBCDIC platforms,
    # which this pragma does not support, so refuse to load there.
    my $on_ebcdic = (ord("A") == 193);
    if ($on_ebcdic) {
        require Carp;
        Carp::croak("encoding pragma does not support EBCDIC platforms");
    }
}

# $HAS_PERLIO records whether PerlIO::encoding could be loaded and reports
# a version of at least 0.02.  It is 0 when the module cannot be loaded.
# import() and unimport() consult it before applying PerlIO layers.
our $HAS_PERLIO = do {
    eval { require PerlIO::encoding };
    $@ ? 0 : (PerlIO::encoding->VERSION >= 0.02);
};

# _exception($name)
#
# Tells import() whether ${^ENCODING} must be left unset for the encoding
# called $name.  Returns 1 only when all of the following hold, 0 otherwise:
#   - the running perl is not newer than 5.008 ($] > 5.008 is false),
#   - $name is one of the UTF/UCS names listed below,
#   - $Config{perl_patchlevel} is false.
sub _exception{
    my $encname = shift;

    # 5.8.1 or higher: never an exception.
    return 0 if $] > 5.008;

    # Only the UTF/UCS family can be an exception.
    my @utf_family = qw(
        utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
        UTF-32 UTF-32BE UTF-32LE
    );
    return 0 unless grep { $_ eq $encname } @utf_family;

    # A perl carrying a patchlevel (maintperl) is not an exception either.
    require Config; Config->import(); our %Config;
    return 0 if $Config{perl_patchlevel};
    return 1;
}

# import - implements "use encoding NAME, OPTIONS".
#
# Arguments (after the class name):
#   $name - script encoding name; when false, $ENV{PERL_ENCODING} is used.
#   %arg  - options:
#             Filter => TRUE    install a source filter that decodes the
#                               source text, instead of setting ${^ENCODING}
#             STDIN  => ENCNAME encoding for STDIN  (default: $name)
#             STDOUT => ENCNAME encoding for STDOUT (default: $name)
#           A STDIN/STDOUT option that exists but is false (e.g. undef)
#           leaves that handle untouched.
#
# Croaks when the script encoding or a per-handle encoding is unknown to
# find_encoding(), or when binmode() throws.  Returns 1 otherwise.
#
# Without Filter, the handles are left alone when $HAS_PERLIO is false.
# In both modes they are left alone when ${^UNICODE} is defined and non-zero.
sub import {
    my $class = shift;
    my $name  = shift;
    my %arg = @_;
    $name ||= $ENV{PERL_ENCODING};
    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    $name = $enc->name; # canonize
    unless ($arg{Filter}) {
        DEBUG and warn "_exception($name) = ", _exception($name);
        _exception($name) or ${^ENCODING} = $enc;
        $HAS_PERLIO or return 1;
    }else{
        defined(${^ENCODING}) and undef ${^ENCODING};
        # implicitly 'use utf8'
        require utf8; # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;
        # Best effort: a failure to load or install the filter is not fatal.
        eval {
            require Filter::Util::Call ;
            Filter::Util::Call->import ;
            filter_add(sub{
                           my $status = filter_read();
                           if ($status > 0){
                               $_ = $enc->decode($_, 1);
                               DEBUG and warn $_;
                           }
                           $status ;
                       });
        };
        # Report success only for the filter path, and only when the eval
        # above actually succeeded.
        $@ eq '' and DEBUG and warn "Filter installed";
    }
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
    for my $h (qw(STDIN STDOUT)){
        my $layer_enc;
        if ($arg{$h}){
            unless (defined find_encoding($arg{$h})) {
                require Carp;
                Carp::croak("Unknown encoding for $h, '$arg{$h}'");
            }
            $layer_enc = $arg{$h};
        }elsif (!exists $arg{$h}){
            $layer_enc = $name;
        }
        # Option present but false: transcoding for this handle is switched
        # off.  Skip it entirely so that a stale $@ left by some earlier,
        # unrelated eval is never mistaken for a binmode failure.
        defined $layer_enc or next;
        eval { binmode($h, ":raw :encoding($layer_enc)") };
        if ($@){
            require Carp;
            Carp::croak($@);
        }
    }
    return 1; # I doubt if we need it, though
}

# unimport - implements "no encoding".
#
# Undefines ${^ENCODING}, then resets STDIN and STDOUT: with ":raw" when
# $HAS_PERLIO is true, with a plain binmode() otherwise.  If
# Filter::Util::Call has been loaded, filter_del() is attempted and any
# error it raises is ignored.
sub unimport{
    no warnings;
    undef ${^ENCODING};
    for my $std (\*STDIN, \*STDOUT){
        if ($HAS_PERLIO){
            binmode($std, ":raw");
        }else{
            binmode($std);
        }
    }
    $INC{"Filter/Util/Call.pm"} and eval { filter_del() };
}

1;
__END__

=pod

=head1 NAME

encoding - allows you to write your script in non-ascii or non-utf8

=head1 SYNOPSIS

  use encoding "greek";  # Perl like Greek to you?
  use encoding "euc-jp"; # Jperl!

  # or you can even do this if your shell supports your native encoding

  perl -Mencoding=latin2 -e '...' # Feeling centrally European?
  perl -Mencoding=euc-kr -e '...' # Or Korean?

  # more control

  # A simple euc-cn => utf-8 converter
  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};

  # "no encoding;" supported (but not scoped!)
  no encoding;

  # an alternate way, Filter
  use encoding "euc-jp", Filter=>1;
  # now you can use kanji identifiers -- in euc-jp!

=head1 ABSTRACT

Let's start with a bit of history: Perl 5.6.0 introduced Unicode
support.  You could apply C<substr()> and regexes even to complex CJK
characters -- so long as the script was written in UTF-8.  But back
then, text editors that supported UTF-8 were still rare and many users
instead chose to write scripts in legacy encodings, giving up a whole
new feature of Perl 5.6.

Rewind to the future: starting from perl 5.8.0 with the B<encoding>
pragma, you can write your script in any encoding you like (so long
as the C<Encode> module supports it) and still enjoy Unicode support.
This pragma achieves that by doing the following:

=over

=item *

Internally converts all literals (C<q//, qq//, qr//, qw//, qx//>) from
the specified encoding to utf8.  In Perl 5.8.1 and later, literals in
C<tr///> and the C<DATA> pseudo-filehandle are also converted.

=item *

Changes the PerlIO layers of C<STDIN> and C<STDOUT> to the specified
encoding.

=back

=head2 Literal Conversions

You can write code in EUC-JP as follows:

  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
               #<-char-><-char->   # 4 octets
  s/\bCamel\b/$Rakuda/;

And with C<use encoding "euc-jp"> in effect, it is the same thing as
the code in UTF-8:

  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
  s/\bCamel\b/$Rakuda/;

=head2 PerlIO layers for C<STD(IN|OUT)>

The B<encoding> pragma also modifies the filehandle layers of
STDIN and STDOUT to the specified encoding.  Therefore,

  use encoding "euc-jp";
  my $message = "Camel is the symbol of perl.\n";
  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  $message =~ s/\bCamel\b/$Rakuda/;
  print $message;

Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
not "\x{99F1}\x{99DD} is the symbol of perl.\n".

You can override this by giving extra arguments; see below.

=head1 FEATURES THAT REQUIRE 5.8.1

Some of the features offered by this pragma require perl 5.8.1.  Most
of these were implemented by Inaba Hiroto.  All other features and
changes work with 5.8.0.

=over

=item "NON-EUC" doublebyte encodings

Because perl needs to parse the script before applying this pragma,
encodings such as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
\x5c) in the second byte fail, because that second byte may
accidentally escape the quoting character that follows.  Perl 5.8.1
or later fixes this problem.

=item tr// 

C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0.
See the section below for details.

=item DATA pseudo-filehandle

Another feature that was overlooked was C<DATA>. 

=back

=head1 USAGE

=over 4

=item use encoding [I<ENCNAME>] ;

Sets the script encoding to I<ENCNAME>.  Unless ${^UNICODE}
exists and is non-zero, the PerlIO layers of STDIN and STDOUT are also
set to ":encoding(I<ENCNAME>)".

Note that STDERR WILL NOT be changed.

Also note that non-STD file handles remain unaffected.  Use C<use
open> or C<binmode> to change layers of those.

If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If no encoding can be found, the error C<Unknown encoding
'I<ENCNAME>'> will be thrown.

=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;

You can also individually set encodings of STDIN and STDOUT via the
C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
completely off.

When ${^UNICODE} exists and is non-zero, these options will be completely
ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for details (perl 5.8.1
and later).

=item use encoding I<ENCNAME> Filter=E<gt>1;

This turns the encoding pragma into a source filter.  While the
default approach just decodes interpolated literals (in qq() and
qr()), this will apply a source filter to the entire source code.  See
L</"The Filter Option"> below for details.

=item no encoding;

Unsets the script encoding. The layers of STDIN, STDOUT are
reset to ":raw" (the default unprocessed raw stream of bytes).

=back

=head1 The Filter Option

The magic of C<use encoding> is not applied to the names of
identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
is a single Han ideograph) work, you still need to write your script
in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.

What does this mean?  Your source code behaves as if it is written in
UTF-8 with 'use utf8' in effect.  So even if your editor only supports
Shift_JIS, for example, you can still try examples in Chapter 15 of
C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
identifiers.

This option is significantly slower and (as of this writing) non-ASCII
identifiers are not very stable WITHOUT this option and with the
source code written in UTF-8.

=head2 Filter-related changes at Encode version 1.87

=over

=item *

The Filter option now sets STDIN and STDOUT like the non-filter version
does.  C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> also work
as they do in the non-filter version.

=item *

C<use utf8> is implicitly declared so you no longer have to C<use
utf8> to C<${"\x{4eba}"}++>.

=back

=head1 CAVEATS

=head2 NOT SCOPED

The pragma is per script, not a per-block lexical.  Only the last
C<use encoding> or C<no encoding> matters, and it affects
B<the whole script>.  However, the C<no encoding> pragma is supported and
B<use encoding> can appear as many times as you want in a given script.
Using this pragma multiple times is discouraged.

For the same reason, the use of this pragma inside modules is also
discouraged (though not as strongly discouraged as the case above;
see below).

If you still have to write a module with this pragma, be very careful
about the load order.  See the code below:

  # called module
  package Module_IN_BAR;
  use encoding "bar";
  # stuff in "bar" encoding here
  1;

  # caller script
  use encoding "foo";
  use Module_IN_BAR;
  # surprise! use encoding "bar" is in effect.

The best way to avoid this oddity is to use this pragma RIGHT AFTER
other modules are loaded.  i.e.

  use Module_IN_BAR;
  use encoding "foo";

=head2 DO NOT MIX MULTIPLE ENCODINGS

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

	\xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

	"\xDF" =~ /\x{3af}/

but this will not

	"\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
should not be mixing your legacy data and Unicode in the same string.

This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
they are combined with characters with code points 0x100 or larger,
in which case all characters need to become UTF-8 encoded), but if
the C<encoding> pragma is present, even the 0x80..0xFF range always
gets UTF-8 encoded.

After all, the best thing about this pragma is that you don't have to
resort to \x{....} just to spell your name in a native encoding.
So feel free to put your strings in your encoding in quotes and
regexes.

=head2 tr/// with ranges

The B<encoding> pragma works by decoding string literals in
C<q//, qq//, qr//, qw//, qx//> and so forth.  In perl 5.8.0, this
does not apply to C<tr///>.  Therefore,

  use encoding 'euc-jp';
  #....
  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
  #           -------- -------- -------- --------

Does not work as

  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;

=over

=item Legend of characters above

  utf8     euc-jp   charnames::viacode()
  -----------------------------------------
  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
  \x{3093} \xA4\xF3 HIRAGANA LETTER N
  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
  \x{30f3} \xA5\xF3 KATAKANA LETTER N

=back

This counterintuitive behavior has been fixed in perl 5.8.1.

=head3 workaround to tr///;

In perl 5.8.0, you can work around this as follows:

  use encoding 'euc-jp';
  #  ....
  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };

Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
this is the same as the classic idiom that makes C<tr///> 'interpolate'.

   tr/$from/$to/;            # wrong!
   eval qq{ tr/$from/$to/ }; # workaround.

Nevertheless, with the B<encoding> pragma even C<q//> is affected, so
C<tr///> not being decoded was obviously against the will of the Perl5
Porters; it has been fixed in Perl 5.8.1 and later.

=head1 EXAMPLE - Greekperl

    use encoding "iso 8859-7";

    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # ... as are eq and cmp ...

    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if ("\x{3af}" cmp pack("C", 0xdf)) == 0;

    # ... but pack/unpack C are not affected, in case you still
    # want to go back to your native encoding

    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 KNOWN PROBLEMS

=over

=item literals in regex that are longer than 127 bytes

For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

=item EBCDIC

The encoding pragma is not supported on EBCDIC platforms.
(Porters who are willing and able to remove this limitation are
welcome.)

=item format

This pragma doesn't work well with format because PerlIO does not
get along very well with it.  When a format contains non-ascii
characters, it prints funny output or triggers "wide character" warnings.
To understand it, try the code below.

  # Save this one in utf8
  # replace *non-ascii* with a non-ascii string
  my $camel;
  format STDOUT =
  *non-ascii*@>>>>>>>
  $camel
  .
  $camel = "*non-ascii*";
  binmode(STDOUT=>':encoding(utf8)'); # bang!
  write;              # funny 
  print $camel, "\n"; # fine

Without binmode, write() happens to work, but then print() fails
instead of write().

At any rate, the very use of format is questionable when it comes to
unicode characters since you have to consider such things as character
width (i.e. double-width for ideographs) and directions (i.e. BIDI for
Arabic and Hebrew).

=back

=head1 HISTORY

This pragma first appeared in Perl 5.8.0.  For features that require
5.8.1 or better, see above.

=head1 SEE ALSO

L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,

Ch. 15 of C<Programming Perl (3rd Edition)>
by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8

=cut
Commit	Line	Data
8f139f4c	1	# $Id: encoding.pm,v 1.46 2003/07/08 21:52:14 dankogai Exp $
3ef515df	2	package encoding;
8f139f4c	3	our $VERSION = do { my @r = (q$Revision: 1.46 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df	4
3ef515df	5	use Encode;
046f36bf	6	use strict;
8f139f4c	7	sub DEBUG () { 0 }
3ef515df	8
	9	BEGIN {
	10	if (ord("A") == 193) {
	11	require Carp;
10c5ecbb	12	Carp::croak("encoding pragma does not support EBCDIC platforms");
3ef515df	13	}
	14	}
	15
0ab8f81e	16	our $HAS_PERLIO = 0;
	17	eval { require PerlIO::encoding };
	18	unless ($@){
	19	$HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
	20	}
b2704119	21
151b5d36	22	sub _exception{
151b5d36	23	my $name = shift;
b5ab1f6f	24	$] > 5.008 and return 0; # 5.8.1 or higher then no
151b5d36	25	my %utfs = map {$_=>1}
	26	qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
	27	UTF-32 UTF-32BE UTF-32LE);
b5ab1f6f	28	$utfs{$name} or return 0; # UTFs or no
151b5d36	29	require Config; Config->import(); our %Config;
2b6a28d4	30	return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
151b5d36	31	}
fa6f41cf	32
3ef515df	33	sub import {
	34	my $class = shift;
	35	my $name = shift;
	36	my %arg = @_;
	37	$name \|\|= $ENV{PERL_ENCODING};
3ef515df	38	my $enc = find_encoding($name);
	39	unless (defined $enc) {
	40	require Carp;
10c5ecbb	41	Carp::croak("Unknown encoding '$name'");
3ef515df	42	}
fa6f41cf	43	$name = $enc->name; # canonize
0f7c507f	44	unless ($arg{Filter}) {
8f139f4c	45	DEBUG and warn "_exception($name) = ", _exception($name);
151b5d36	46	_exception($name) or ${^ENCODING} = $enc;
85982a32	47	$HAS_PERLIO or return 1;
aae85ceb	48	}else{
aae85ceb	49	defined(${^ENCODING}) and undef ${^ENCODING};
151b5d36	50	# implicitly 'use utf8'
	51	require utf8; # to fetch $utf8::hint_bits;
	52	$^H \|= $utf8::hint_bits;
aae85ceb	53	eval {
	54	require Filter::Util::Call ;
	55	Filter::Util::Call->import ;
aae85ceb	56	filter_add(sub{
151b5d36	57	my $status = filter_read();
151b5d36	58	if ($status > 0){
aae85ceb	59	$_ = $enc->decode($_, 1);
8f139f4c	60	DEBUG and warn $_;
aae85ceb	61	}
	62	$status ;
	63	});
	64	};
8f139f4c	65	} DEBUG and warn "Filter installed";
05ef2f67	66	defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
151b5d36	67	for my $h (qw(STDIN STDOUT)){
	68	if ($arg{$h}){
	69	unless (defined find_encoding($arg{$h})) {
	70	require Carp;
	71	Carp::croak("Unknown encoding for $h, '$arg{$h}'");
	72	}
6be7c101	73	eval { binmode($h, ":raw :encoding($arg{$h})") };
151b5d36	74	}else{
	75	unless (exists $arg{$h}){
	76	eval {
	77	no warnings 'uninitialized';
6be7c101	78	binmode($h, ":raw :encoding($name)");
151b5d36	79	};
	80	}
	81	}
	82	if ($@){
	83	require Carp;
	84	Carp::croak($@);
	85	}
3ef515df	86	}
	87	return 1; # I doubt if we need it, though
	88	}
	89
	90	sub unimport{
	91	no warnings;
	92	undef ${^ENCODING};
621b0f8d	93	if ($HAS_PERLIO){
	94	binmode(STDIN, ":raw");
	95	binmode(STDOUT, ":raw");
	96	}else{
6be7c101	97	binmode(STDIN);
6be7c101	98	binmode(STDOUT);
621b0f8d	99	}
aae85ceb	100	if ($INC{"Filter/Util/Call.pm"}){
	101	eval { filter_del() };
	102	}
3ef515df	103	}
	104
	105	1;
	106	__END__
85982a32	107
3ef515df	108	=pod
	109
	110	=head1 NAME
	111
0ab8f81e	112	encoding - allows you to write your script in non-ascii or non-utf8
3ef515df	113
	114	=head1 SYNOPSIS
	115
962111ca	116	use encoding "greek"; # Perl like Greek to you?
3ef515df	117	use encoding "euc-jp"; # Jperl!
3ef515df	118
962111ca	119	# or you can even do this if your shell supports your native encoding
3ef515df	120
962111ca	121	perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e	122	perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df	123
3ef515df	124	# more control
3ef515df	125
962111ca	126	# A simple euc-cn => utf-8 converter
6d1c0808	127	use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df	128
	129	# "no encoding;" supported (but not scoped!)
	130	no encoding;
	131
aae85ceb	132	# an alternate way, Filter
aae85ceb	133	use encoding "euc-jp", Filter=>1;
aae85ceb	134	# now you can use kanji identifiers -- in euc-jp!
aae85ceb	135
3ef515df	136	=head1 ABSTRACT
3ef515df	137
962111ca	138	Let's start with a bit of history: Perl 5.6.0 introduced Unicode
	139	support. You could apply C<substr()> and regexes even to complex CJK
	140	characters -- so long as the script was written in UTF-8. But back
0ab8f81e	141	then, text editors that supported UTF-8 were still rare and many users
	142	instead chose to write scripts in legacy encodings, giving up a whole
	143	new feature of Perl 5.6.
3ef515df	144
0ab8f81e	145	Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca	146	pragma, you can write your script in any encoding you like (so long
962111ca	147	as the C<Encode> module supports it) and still enjoy Unicode support.
0f29a567	148	This pragma achieves that by doing the following:
05ef2f67	149
	150	=over
	151
	152	=item *
	153
	154	Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
	155	the encoding specified to utf8. In Perl 5.8.1 and later, literals in
	156	C<tr///> and C<DATA> pseudo-filehandle are also converted.
	157
	158	=item *
	159
	160	Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
	161	specified.
	162
	163	=back
	164
	165	=head2 Literal Conversions
	166
0ab8f81e	167	You can write code in EUC-JP as follows:
3ef515df	168
	169	my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
	170	#<-char-><-char-> # 4 octets
	171	s/\bCamel\b/$Rakuda/;
	172
	173	And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca	174	the code in UTF-8:
3ef515df	175
32b9ed1f	176	my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df	177	s/\bCamel\b/$Rakuda/;
3ef515df	178
05ef2f67	179	=head2 PerlIO layers for C<STD(IN\|OUT)>
	180
	181	The B<encoding> pragma also modifies the filehandle layers of
4b291ae6	182	STDIN and STDOUT to the specified encoding. Therefore,
3ef515df	183
	184	use encoding "euc-jp";
	185	my $message = "Camel is the symbol of perl.\n";
	186	my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
	187	$message =~ s/\bCamel\b/$Rakuda/;
	188	print $message;
	189
962111ca	190	Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
962111ca	191	not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df	192
0ab8f81e	193	You can override this by giving extra arguments; see below.
3ef515df	194
05ef2f67	195	=head1 FEATURES THAT REQUIRE 5.8.1
	196
	197	Some of the features offered by this pragma requires perl 5.8.1. Most
0f29a567	198	of these are done by Inaba Hiroto. Any other features and changes
05ef2f67	199	are good for 5.8.0.
	200
	201	=over
	202
	203	=item "NON-EUC" doublebyte encodings
	204
0f29a567	205	Because perl needs to parse script before applying this pragma, such
05ef2f67	206	encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
05ef2f67	207	\x5c) in the second byte fails because the second byte may
0f29a567	208	accidentally escape the quoting character that follows. Perl 5.8.1
05ef2f67	209	or later fixes this problem.
	210
	211	=item tr//
	212
	213	C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
	214	See the section below for details.
	215
	216	=item DATA pseudo-filehandle
	217
	218	Another feature that was overlooked was C<DATA>.
	219
	220	=back
	221
3ef515df	222	=head1 USAGE
	223
	224	=over 4
	225
	226	=item use encoding [I<ENCNAME>] ;
	227
05ef2f67	228	Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
	229	exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
	230	":encoding(I<ENCNAME>)".
	231
	232	Note that STDERR WILL NOT be changed.
	233
	234	Also note that non-STD file handles remain unaffected. Use C<use
	235	open> or C<binmode> to change layers of those.
3ef515df	236
3ef515df	237	If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca	238	is consulted. If no encoding can be found, the error C<Unknown encoding
962111ca	239	'I<ENCNAME>'> will be thrown.
3ef515df	240
aae85ceb	241	=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df	242
0ab8f81e	243	You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f	244	C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
32b9ed1f	245	first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb	246	completely off.
3ef515df	247
05ef2f67	248	When ${^UNICODE} exists and non-zero, these options will completely
	249	ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See
	250	L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
	251	details (perl 5.8.1 and later).
	252
151b5d36	253	=item use encoding I<ENCNAME> Filter=E<gt>1;
	254
	255	This turns the encoding pragma into a source filter. While the
	256	default approach just decodes interpolated literals (in qq() and
	257	qr()), this will apply a source filter to the entire source code. See
05ef2f67	258	L</"The Filter Option"> below for details.
151b5d36	259
3ef515df	260	=item no encoding;
3ef515df	261
05ef2f67	262	Unsets the script encoding. The layers of STDIN, STDOUT are
962111ca	263	reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df	264
	265	=back
	266
151b5d36	267	=head1 The Filter Option
	268
	269	The magic of C<use encoding> is not applied to the names of
	270	identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
	271	is a single Han ideograph) work, you still need to write your script
	272	in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
	273
151b5d36	274	What does this mean? Your source code behaves as if it is written in
	275	UTF-8 with 'use utf8' in effect. So even if your editor only supports
	276	Shift_JIS, for example, you can still try examples in Chapter 15 of
	277	C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
	278	identifiers.
	279
	280	This option is significantly slower and (as of this writing) non-ASCII
	281	identifiers are not very stable WITHOUT this option and with the
	282	source code written in UTF-8.
	283
	284	=head2 Filter-related changes at Encode version 1.87
	285
	286	=over
	287
	288	=item *
	289
	290	The Filter option now sets STDIN and STDOUT like non-filter options.
	291	And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
	292	non-filter version.
	293
	294	=item *
	295
	296	C<use utf8> is implicitly declared so you no longer have to C<use
	297	utf8> to C<${"\x{4eba}"}++>.
	298
	299	=back
	300
3ef515df	301	=head1 CAVEATS
	302
	303	=head2 NOT SCOPED
	304
	305	The pragma is a per script, not a per block lexical. Only the last
621b0f8d	306	C<use encoding> or C<no encoding> matters, and it affects
	307	B<the whole script>. However, the <no encoding> pragma is supported and
	308	B<use encoding> can appear as many times as you want in a given script.
	309	The multiple use of this pragma is discouraged.
	310
0f29a567	311	By the same reason, the use this pragma inside modules is also
	312	discouraged (though not as strongly discouranged as the case above.
	313	See below).
05ef2f67	314
	315	If you still have to write a module with this pragma, be very careful
	316	of the load order. See the codes below;
	317
	318	# called module
	319	package Module_IN_BAR;
	320	use encoding "bar";
	321	# stuff in "bar" encoding here
	322	1;
	323
	324	# caller script
	325	use encoding "foo"
	326	use Module_IN_BAR;
	327	# surprise! use encoding "bar" is in effect.
	328
	329	The best way to avoid this oddity is to use this pragma RIGHT AFTER
	330	other modules are loaded. i.e.
	331
	332	use Module_IN_BAR;
	333	use encoding "foo";
3ef515df	334
	335	=head2 DO NOT MIX MULTIPLE ENCODINGS
	336
	337	Notice that only literals (string or regular expression) having only
	338	legacy code points are affected: if you mix data like this
	339
	340	\xDF\x{100}
	341
	342	the data is assumed to be in (Latin 1 and) Unicode, not in your native
	343	encoding. In other words, this will match in "greek":
	344
	345	"\xDF" =~ /\x{3af}/
	346
	347	but this will not
	348
	349	"\xDF\x{100}" =~ /\x{3af}\x{100}/
	350
962111ca	351	since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
	352	the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
	353	LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
	354	should not be mixing your legacy data and Unicode in the same string.
3ef515df	355
	356	This pragma also affects encoding of the 0x80..0xFF code point range:
	357	normally characters in that range are left as eight-bit bytes (unless
	358	they are combined with characters with code points 0x100 or larger,
	359	in which case all characters need to become UTF-8 encoded), but if
	360	the C<encoding> pragma is present, even the 0x80..0xFF range always
	361	gets UTF-8 encoded.
	362
	363	After all, the best thing about this pragma is that you don't have to
0ab8f81e	364	resort to \x{....} just to spell your name in a native encoding.
	365	So feel free to put your strings in your encoding in quotes and
	366	regexes.
3ef515df	367
151b5d36	368	=head2 tr/// with ranges
4b291ae6	369
4b291ae6	370	The B<encoding> pragma works by decoding string literals in
151b5d36	371	C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
4b291ae6	372	does not apply to C<tr///>. Therefore,
	373
	374	use encoding 'euc-jp';
	375	#....
	376	$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
	377	# -------- -------- -------- --------
	378
	379	Does not work as
	380
	381	$kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
	382
	383	=over
	384
	385	=item Legend of characters above
	386
	387	utf8 euc-jp charnames::viacode()
	388	-----------------------------------------
	389	\x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
	390	\x{3093} \xA4\xF3 HIRAGANA LETTER N
	391	\x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
	392	\x{30f3} \xA5\xF3 KATAKANA LETTER N
	393
	394	=back
	395
05ef2f67	396	This counterintuitive behavior has been fixed in perl 5.8.1.
151b5d36	397
4b291ae6	398	=head3 workaround to tr///;
4b291ae6	399
ce16148b	400	In perl 5.8.0, you can work around as follows;
4b291ae6	401
4b291ae6	402	use encoding 'euc-jp';
151b5d36	403	# ....
4b291ae6	404	eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
4b291ae6	405
ce16148b	406	Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind
4b291ae6	407	is the same as classic idiom that makes C<tr///> 'interpolate'.
	408
	409	tr/$from/$to/; # wrong!
	410	eval qq{ tr/$from/$to/ }; # workaround.
	411
	412	Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
	413	C<tr///> not being decoded was obviously against the will of Perl5
05ef2f67	414	Porters so it has been fixed in Perl 5.8.1 or later.
aae85ceb	415
3ef515df	416	=head1 EXAMPLE - Greekperl
	417
	418	use encoding "iso 8859-7";
	419
0ab8f81e	420	# \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df	421
	422	$a = "\xDF";
	423	$b = "\x{100}";
	424
	425	printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
	426
	427	$c = $a . $b;
	428
	429	# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
	430
	431	# chr() is affected, and ...
	432
	433	print "mega\n" if ord(chr(0xdf)) == 0x3af;
	434
	435	# ... ord() is affected by the encoding pragma ...
	436
	437	print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
	438
	439	# ... as are eq and cmp ...
	440
	441	print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
	442	print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
	443
	444	# ... but pack/unpack C are not affected, in case you still
0ab8f81e	445	# want to go back to your native encoding
3ef515df	446
	447	print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
	448
	449	=head1 KNOWN PROBLEMS
	450
151b5d36	451	=over
151b5d36	452
0f29a567	453	=item literals in regex that are longer than 127 bytes
151b5d36	454
0ab8f81e	455	For native multibyte encodings (either fixed or variable length),
3ef515df	456	the current implementation of the regular expressions may introduce
0ab8f81e	457	recoding errors for regular expression literals longer than 127 bytes.
3ef515df	458
05ef2f67	459	=item EBCDIC
151b5d36	460
3ef515df	461	The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e	462	(Porters who are willing and able to remove this limitation are
0ab8f81e	463	welcome.)
3ef515df	464
05ef2f67	465	=item format
	466
	467	This pragma doesn't work well with format because PerlIO does not
	468	get along very well with it. When format contains non-ascii
	469	characters it prints funny or gets "wide character warnings".
	470	To understand it, try the code below.
	471
	472	# Save this one in utf8
	473	# replace non-ascii with a non-ascii string
	474	my $camel;
	475	format STDOUT =
	476	non-ascii@>>>>>>>
	477	$camel
	478	.
	479	$camel = "non-ascii";
	480	binmode(STDOUT=>':encoding(utf8)'); # bang!
	481	write; # funny
	482	print $camel, "\n"; # fine
	483
	484	Without binmode this happens to work but without binmode, print()
	485	fails instead of write().
	486
	487	At any rate, the very use of format is questionable when it comes to
	488	unicode characters since you have to consider such things as character
	489	width (i.e. double-width for ideographs) and directions (i.e. BIDI for
	490	Arabic and Hebrew).
	491
151b5d36	492	=back
151b5d36	493
05ef2f67	494	=head1 HISTORY
	495
	496	This pragma first appeared in Perl 5.8.0. For features that require
	497	5.8.1 and better, see above.
	498
3ef515df	499	=head1 SEE ALSO
3ef515df	500
aae85ceb	501	L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
	502
	503	Ch. 15 of C<Programming Perl (3rd Edition)>
	504	by Larry Wall, Tom Christiansen, Jon Orwant;
	505	O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df	506
3ef515df	507	=cut