[p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm

# $Id: encoding.pm,v 1.47 2003/08/20 11:15:31 dankogai Exp dankogai $
package encoding;
# Derive $VERSION from the RCS "Revision" keyword: every run of digits in the
# keyword string is collected into @r, then formatted as "%d." followed by one
# "%02d" per remaining component ("x" binds tighter than "."), so revision
# 1.47 yields "1.47".
our $VERSION = do { my @r = (q$Revision: 1.47 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use Encode;
use strict;
# Debug switch used as "DEBUG and warn ..." throughout this file.  It is
# written as a sub with an empty prototype and a constant body so that perl
# can treat it as an inlinable constant; change the 0 to 1 to get the traces.
sub DEBUG () { 0 }

# Refuse to load at compile time on EBCDIC platforms, detected by "A" having
# ordinal 193 there.
BEGIN {
    my $is_ebcdic = ( ord("A") == 193 );
    if ($is_ebcdic) {
        require Carp;
        Carp::croak("encoding pragma does not support EBCDIC platforms");
    }
}

# $HAS_PERLIO is true only when PerlIO::encoding can be loaded and reports a
# version of at least 0.02; import() and unimport() consult it before
# touching the layers of STDIN and STDOUT.
our $HAS_PERLIO = 0;
eval { require PerlIO::encoding };
if ( !$@ ) {
    my $perlio_version = PerlIO::encoding->VERSION;
    $HAS_PERLIO = ( $perlio_version >= 0.02 );
}

# _exception($name) - decide whether ${^ENCODING} must NOT be set for the
# encoding whose canonical name is $name.
#
# Returns 1 only when all of the following hold, and 0 otherwise:
#   * the running perl is not newer than 5.8.0 ($] <= 5.008),
#   * $name is one of the UTF/UCS encodings listed below, and
#   * $Config{perl_patchlevel} is false (i.e. not a maintperl build).
sub _exception {
    my ($enc_name) = @_;

    # 5.8.1 or higher: never an exception
    return 0 if $] > 5.008;

    my %is_utf;
    $is_utf{$_} = 1
        for qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
               UTF-32 UTF-32BE UTF-32LE);

    # only the UTFs can be exceptions
    return 0 unless $is_utf{$enc_name};

    require Config;
    Config->import();
    our %Config;

    # maintperl: not an exception either
    return 0 if $Config{perl_patchlevel};
    return 1;
}

# import - implements "use encoding NAME, OPTIONS".
#
# Arguments (after the class name):
#   NAME             encoding name; when false, $ENV{PERL_ENCODING} is used.
#   Filter => BOOL   when true, install a source filter that decodes the
#                    source through the encoding (and turn on the utf8 hint)
#                    instead of setting ${^ENCODING}.
#   STDIN  => ENC    encoding for that handle's ":encoding" layer.  When the
#   STDOUT => ENC    key is absent NAME is used; when the key is present with
#                    a false value (e.g. undef) the handle is left untouched.
#
# The handle layers are not changed when ${^UNICODE} is defined and non-zero,
# nor, in the non-filter case, when $HAS_PERLIO is false.
#
# Croaks when NAME or a per-handle encoding is unknown, or when applying a
# layer raises an exception.  Returns 1 otherwise.
sub import {
    my $class = shift;
    my $name  = shift;
    my %arg   = @_;
    $name ||= $ENV{PERL_ENCODING};

    # Never hand undef to find_encoding(); a missing name is simply unknown.
    my $enc = defined $name ? find_encoding($name) : undef;
    unless ( defined $enc ) {
        my $shown = defined $name ? $name : '';
        require Carp;
        Carp::croak("Unknown encoding '$shown'");
    }
    $name = $enc->name;    # canonize

    if ( $arg{Filter} ) {
        defined( ${^ENCODING} ) and undef ${^ENCODING};

        # implicitly 'use utf8'
        require utf8;      # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;

        # Installing the filter is best effort: a failure here is not fatal.
        eval {
            require Filter::Util::Call;
            Filter::Util::Call->import;
            filter_add(
                sub {
                    my $status = filter_read();
                    if ( $status > 0 ) {
                        $_ = $enc->decode( $_, 1 );
                        DEBUG and warn $_;
                    }
                    $status;
                }
            );
        };

        # Report what really happened, and only on the filter path.
        DEBUG and warn $@ ? "Filter not installed: $@" : "Filter installed";
    }
    else {
        DEBUG and warn "_exception($name) = ", _exception($name);
        _exception($name) or ${^ENCODING} = $enc;
        $HAS_PERLIO or return 1;
    }

    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;

    for my $h (qw(STDIN STDOUT)) {
        my $layer_enc;
        if ( $arg{$h} ) {
            unless ( defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak("Unknown encoding for $h, '$arg{$h}'");
            }
            $layer_enc = $arg{$h};
        }
        elsif ( exists $arg{$h} ) {

            # Present but false: the caller turned transcoding off for this
            # handle.  Skip it entirely so that no stale $@ left over from an
            # earlier eval is mistaken for a binmode failure.
            next;
        }
        else {
            $layer_enc = $name;
        }

        eval {
            no warnings 'uninitialized';
            binmode( $h, ":raw :encoding($layer_enc)" );
        };

        # Copy the error before loading Carp: code run while a module is
        # being loaded may itself use eval and reset $@.
        if ( my $err = $@ ) {
            require Carp;
            Carp::croak($err);
        }
    }
    return 1;    # I doubt if we need it, though
}

# unimport - implements "no encoding".
#
# Clears ${^ENCODING}, resets STDIN and STDOUT with binmode (using the ":raw"
# layer when $HAS_PERLIO is true, plain binmode otherwise) and, if
# Filter::Util::Call has been loaded, calls filter_del() inside an eval so
# that a failure there is ignored.
sub unimport {
    no warnings;
    undef ${^ENCODING};

    for my $fh ( \*STDIN, \*STDOUT ) {
        if ($HAS_PERLIO) {
            binmode( $fh, ":raw" );
        }
        else {
            binmode($fh);
        }
    }

    eval { filter_del() } if $INC{"Filter/Util/Call.pm"};
}

1;
__END__

=pod

=head1 NAME

encoding - allows you to write your script in non-ascii or non-utf8

=head1 SYNOPSIS

  use encoding "greek";  # Perl like Greek to you?
  use encoding "euc-jp"; # Jperl!

  # or you can even do this if your shell supports your native encoding

  perl -Mencoding=latin2 -e '...' # Feeling centrally European?
  perl -Mencoding=euc-kr -e '...' # Or Korean?

  # more control

  # A simple euc-cn => utf-8 converter
  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};

  # "no encoding;" supported (but not scoped!)
  no encoding;

  # an alternate way, Filter
  use encoding "euc-jp", Filter=>1;
  # now you can use kanji identifiers -- in euc-jp!

=head1 ABSTRACT

Let's start with a bit of history: Perl 5.6.0 introduced Unicode
support.  You could apply C<substr()> and regexes even to complex CJK
characters -- so long as the script was written in UTF-8.  But back
then, text editors that supported UTF-8 were still rare and many users
instead chose to write scripts in legacy encodings, giving up a whole
new feature of Perl 5.6.

Rewind to the future: starting from perl 5.8.0 with the B<encoding>
pragma, you can write your script in any encoding you like (so long
as the C<Encode> module supports it) and still enjoy Unicode support.
This pragma achieves that by doing the following:

=over

=item *

Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
C<tr///> and C<DATA> pseudo-filehandle are also converted.

=item *

Changes the PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
specified.

=back

=head2 Literal Conversions

You can write code in EUC-JP as follows:

  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
               #<-char-><-char->   # 4 octets
  s/\bCamel\b/$Rakuda/;

And with C<use encoding "euc-jp"> in effect, it is the same thing as
the code in UTF-8:

  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
  s/\bCamel\b/$Rakuda/;

=head2 PerlIO layers for C<STD(IN|OUT)>

The B<encoding> pragma also modifies the filehandle layers of
STDIN and STDOUT to the specified encoding.  Therefore,

  use encoding "euc-jp";
  my $message = "Camel is the symbol of perl.\n";
  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  $message =~ s/\bCamel\b/$Rakuda/;
  print $message;

Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
not "\x{99F1}\x{99DD} is the symbol of perl.\n".

You can override this by giving extra arguments; see below.

=head2 Implicit upgrading for byte strings

By default, if strings operating under byte semantics and strings
with Unicode character data are concatenated, the new string will
be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.

The B<encoding> pragma changes this to use the specified encoding
instead.  For example:

    use encoding 'utf8';
    my $string = chr(20000); # a Unicode string
    utf8::encode($string);   # now it's a UTF-8 encoded byte string
    # concatenate with another Unicode string
    print length($string . chr(20000));

Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
is three octets when interpreted as Latin-1.

=head1 FEATURES THAT REQUIRE 5.8.1

Some of the features offered by this pragma require perl 5.8.1.  Most
of these were done by Inaba Hiroto.  Any other features and changes
are good for 5.8.0.

=over

=item "NON-EUC" doublebyte encodings

Because perl needs to parse the script before applying this pragma, such
encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
\x5c) in the second byte fail because the second byte may
accidentally escape the quoting character that follows.  Perl 5.8.1
or later fixes this problem.

=item tr// 

C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0.
See the section below for details.

=item DATA pseudo-filehandle

Another feature that was overlooked was C<DATA>. 

=back

=head1 USAGE

=over 4

=item use encoding [I<ENCNAME>] ;

Sets the script encoding to I<ENCNAME>.  Unless ${^UNICODE}
exists and is non-zero, the PerlIO layers of STDIN and STDOUT are also
set to ":encoding(I<ENCNAME>)".

Note that STDERR WILL NOT be changed.

Also note that non-STD file handles remain unaffected.  Use C<use
open> or C<binmode> to change layers of those.

If no encoding is specified, the environment variable C<PERL_ENCODING>
is consulted.  If no encoding can be found, the error C<Unknown encoding
'I<ENCNAME>'> will be thrown.

=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;

You can also individually set encodings of STDIN and STDOUT via the
C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
completely off.

When ${^UNICODE} exists and is non-zero, these options will be completely
ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for details (perl 5.8.1
and later).

=item use encoding I<ENCNAME> Filter=E<gt>1;

This turns the encoding pragma into a source filter.  While the
default approach just decodes interpolated literals (in qq() and
qr()), this will apply a source filter to the entire source code.  See
L</"The Filter Option"> below for details.

=item no encoding;

Unsets the script encoding. The layers of STDIN, STDOUT are
reset to ":raw" (the default unprocessed raw stream of bytes).

=back

=head1 The Filter Option

The magic of C<use encoding> is not applied to the names of
identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
is a single Han ideograph) work, you still need to write your script
in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.

What does this mean?  Your source code behaves as if it is written in
UTF-8 with 'use utf8' in effect.  So even if your editor only supports
Shift_JIS, for example, you can still try examples in Chapter 15 of
C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
identifiers.

This option is significantly slower and (as of this writing) non-ASCII
identifiers are not very stable WITHOUT this option and with the
source code written in UTF-8.

=head2 Filter-related changes at Encode version 1.87

=over

=item *

The Filter option now sets STDIN and STDOUT like non-filter options.
And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
non-filter version.

=item *

C<use utf8> is implicitly declared so you no longer have to C<use
utf8> to C<${"\x{4eba}"}++>.

=back

=head1 CAVEATS

=head2 NOT SCOPED

The pragma is a per script, not a per block lexical.  Only the last
C<use encoding> or C<no encoding> matters, and it affects
B<the whole script>.  However, the C<no encoding> pragma is supported and
B<use encoding> can appear as many times as you want in a given script.
Multiple use of this pragma is discouraged.

For the same reason, the use of this pragma inside modules is also
discouraged (though not as strongly discouraged as the case above;
see below).

If you still have to write a module with this pragma, be very careful
of the load order.  See the code below:

  # called module
  package Module_IN_BAR;
  use encoding "bar";
  # stuff in "bar" encoding here
  1;

  # caller script
  use encoding "foo";
  use Module_IN_BAR;
  # surprise! use encoding "bar" is in effect.

The best way to avoid this oddity is to use this pragma RIGHT AFTER
other modules are loaded.  i.e.

  use Module_IN_BAR;
  use encoding "foo";

=head2 DO NOT MIX MULTIPLE ENCODINGS

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

	\xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

	"\xDF" =~ /\x{3af}/

but this will not

	"\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
should not be mixing your legacy data and Unicode in the same string.

This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
they are combined with characters with code points 0x100 or larger,
in which case all characters need to become UTF-8 encoded), but if
the C<encoding> pragma is present, even the 0x80..0xFF range always
gets UTF-8 encoded.

After all, the best thing about this pragma is that you don't have to
resort to \x{....} just to spell your name in a native encoding.
So feel free to put your strings in your encoding in quotes and
regexes.

=head2 tr/// with ranges

The B<encoding> pragma works by decoding string literals in
C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl 5.8.0, this
does not apply to C<tr///>.  Therefore,

  use encoding 'euc-jp';
  #....
  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
  #           -------- -------- -------- --------

Does not work as

  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;

=over

=item Legend of characters above

  utf8     euc-jp   charnames::viacode()
  -----------------------------------------
  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
  \x{3093} \xA4\xF3 HIRAGANA LETTER N
  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
  \x{30f3} \xA5\xF3 KATAKANA LETTER N

=back

This counterintuitive behavior has been fixed in perl 5.8.1.

=head3 workaround to tr///;

In perl 5.8.0, you can work around this as follows:

  use encoding 'euc-jp';
  #  ....
  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };

Note the C<tr//> expression is surrounded by C<qq{}>.  The idea
is the same as the classic idiom that makes C<tr///> 'interpolate'.

   tr/$from/$to/;            # wrong!
   eval qq{ tr/$from/$to/ }; # workaround.

Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
C<tr///> not being decoded was obviously against the will of Perl5
Porters so it has been fixed in Perl 5.8.1 or later.

=head1 EXAMPLE - Greekperl

    use encoding "iso 8859-7";

    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # ... as are eq and cmp ...

    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;

    # ... but pack/unpack C are not affected, in case you still
    # want to go back to your native encoding

    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 KNOWN PROBLEMS

=over

=item literals in regex that are longer than 127 bytes

For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

=item EBCDIC

The encoding pragma is not supported on EBCDIC platforms.
(Porters who are willing and able to remove this limitation are
welcome.)

=item format

This pragma doesn't work well with format because PerlIO does not
get along very well with it.  When format contains non-ascii
characters it prints funny or gets "wide character warnings".
To understand it, try the code below.

  # Save this one in utf8
  # replace *non-ascii* with a non-ascii string
  my $camel;
  format STDOUT =
  *non-ascii*@>>>>>>>
  $camel
  .
  $camel = "*non-ascii*";
  binmode(STDOUT=>':encoding(utf8)'); # bang!
  write;              # funny 
  print $camel, "\n"; # fine

Without binmode this happens to work but without binmode, print()
fails instead of write().

At any rate, the very use of format is questionable when it comes to
unicode characters since you have to consider such things as character
width (i.e. double-width for ideographs) and directions (i.e. BIDI for
Arabic and Hebrew).

=back

=head1 HISTORY

This pragma first appeared in Perl 5.8.0.  For features that require 
5.8.1 and better, see above.

=head1 SEE ALSO

L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,

Ch. 15 of C<Programming Perl (3rd Edition)>
by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8

=cut
Commit	Line	Data
b786ee6f	1	# $Id: encoding.pm,v 1.47 2003/08/20 11:15:31 dankogai Exp dankogai $
3ef515df	2	package encoding;
b786ee6f	3	our $VERSION = do { my @r = (q$Revision: 1.47 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df	4
3ef515df	5	use Encode;
046f36bf	6	use strict;
8f139f4c	7	sub DEBUG () { 0 }
3ef515df	8
	9	BEGIN {
	10	if (ord("A") == 193) {
	11	require Carp;
10c5ecbb	12	Carp::croak("encoding pragma does not support EBCDIC platforms");
3ef515df	13	}
	14	}
	15
0ab8f81e	16	our $HAS_PERLIO = 0;
	17	eval { require PerlIO::encoding };
	18	unless ($@){
	19	$HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
	20	}
b2704119	21
151b5d36	22	sub _exception{
151b5d36	23	my $name = shift;
b5ab1f6f	24	$] > 5.008 and return 0; # 5.8.1 or higher then no
151b5d36	25	my %utfs = map {$_=>1}
	26	qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
	27	UTF-32 UTF-32BE UTF-32LE);
b5ab1f6f	28	$utfs{$name} or return 0; # UTFs or no
151b5d36	29	require Config; Config->import(); our %Config;
2b6a28d4	30	return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
151b5d36	31	}
fa6f41cf	32
3ef515df	33	sub import {
	34	my $class = shift;
	35	my $name = shift;
	36	my %arg = @_;
	37	$name \|\|= $ENV{PERL_ENCODING};
3ef515df	38	my $enc = find_encoding($name);
	39	unless (defined $enc) {
	40	require Carp;
10c5ecbb	41	Carp::croak("Unknown encoding '$name'");
3ef515df	42	}
fa6f41cf	43	$name = $enc->name; # canonize
0f7c507f	44	unless ($arg{Filter}) {
8f139f4c	45	DEBUG and warn "_exception($name) = ", _exception($name);
151b5d36	46	_exception($name) or ${^ENCODING} = $enc;
85982a32	47	$HAS_PERLIO or return 1;
aae85ceb	48	}else{
aae85ceb	49	defined(${^ENCODING}) and undef ${^ENCODING};
151b5d36	50	# implicitly 'use utf8'
	51	require utf8; # to fetch $utf8::hint_bits;
	52	$^H \|= $utf8::hint_bits;
aae85ceb	53	eval {
	54	require Filter::Util::Call ;
	55	Filter::Util::Call->import ;
aae85ceb	56	filter_add(sub{
151b5d36	57	my $status = filter_read();
151b5d36	58	if ($status > 0){
aae85ceb	59	$_ = $enc->decode($_, 1);
8f139f4c	60	DEBUG and warn $_;
aae85ceb	61	}
	62	$status ;
	63	});
	64	};
8f139f4c	65	} DEBUG and warn "Filter installed";
05ef2f67	66	defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
151b5d36	67	for my $h (qw(STDIN STDOUT)){
	68	if ($arg{$h}){
	69	unless (defined find_encoding($arg{$h})) {
	70	require Carp;
	71	Carp::croak("Unknown encoding for $h, '$arg{$h}'");
	72	}
6be7c101	73	eval { binmode($h, ":raw :encoding($arg{$h})") };
151b5d36	74	}else{
	75	unless (exists $arg{$h}){
	76	eval {
	77	no warnings 'uninitialized';
6be7c101	78	binmode($h, ":raw :encoding($name)");
151b5d36	79	};
	80	}
	81	}
	82	if ($@){
	83	require Carp;
	84	Carp::croak($@);
	85	}
3ef515df	86	}
	87	return 1; # I doubt if we need it, though
	88	}
	89
	90	sub unimport{
	91	no warnings;
	92	undef ${^ENCODING};
621b0f8d	93	if ($HAS_PERLIO){
	94	binmode(STDIN, ":raw");
	95	binmode(STDOUT, ":raw");
	96	}else{
6be7c101	97	binmode(STDIN);
6be7c101	98	binmode(STDOUT);
621b0f8d	99	}
aae85ceb	100	if ($INC{"Filter/Util/Call.pm"}){
	101	eval { filter_del() };
	102	}
3ef515df	103	}
	104
	105	1;
	106	__END__
85982a32	107
3ef515df	108	=pod
	109
	110	=head1 NAME
	111
0ab8f81e	112	encoding - allows you to write your script in non-ascii or non-utf8
3ef515df	113
	114	=head1 SYNOPSIS
	115
962111ca	116	use encoding "greek"; # Perl like Greek to you?
3ef515df	117	use encoding "euc-jp"; # Jperl!
3ef515df	118
962111ca	119	# or you can even do this if your shell supports your native encoding
3ef515df	120
962111ca	121	perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e	122	perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df	123
3ef515df	124	# more control
3ef515df	125
962111ca	126	# A simple euc-cn => utf-8 converter
6d1c0808	127	use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df	128
	129	# "no encoding;" supported (but not scoped!)
	130	no encoding;
	131
aae85ceb	132	# an alternate way, Filter
aae85ceb	133	use encoding "euc-jp", Filter=>1;
aae85ceb	134	# now you can use kanji identifiers -- in euc-jp!
aae85ceb	135
3ef515df	136	=head1 ABSTRACT
3ef515df	137
962111ca	138	Let's start with a bit of history: Perl 5.6.0 introduced Unicode
	139	support. You could apply C<substr()> and regexes even to complex CJK
	140	characters -- so long as the script was written in UTF-8. But back
0ab8f81e	141	then, text editors that supported UTF-8 were still rare and many users
	142	instead chose to write scripts in legacy encodings, giving up a whole
	143	new feature of Perl 5.6.
3ef515df	144
0ab8f81e	145	Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca	146	pragma, you can write your script in any encoding you like (so long
962111ca	147	as the C<Encode> module supports it) and still enjoy Unicode support.
0f29a567	148	This pragma achieves that by doing the following:
05ef2f67	149
	150	=over
	151
	152	=item *
	153
	154	Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
	155	the encoding specified to utf8. In Perl 5.8.1 and later, literals in
	156	C<tr///> and C<DATA> pseudo-filehandle are also converted.
	157
	158	=item *
	159
	160	Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
	161	specified.
	162
	163	=back
	164
	165	=head2 Literal Conversions
	166
0ab8f81e	167	You can write code in EUC-JP as follows:
3ef515df	168
	169	my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
	170	#<-char-><-char-> # 4 octets
	171	s/\bCamel\b/$Rakuda/;
	172
	173	And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca	174	the code in UTF-8:
3ef515df	175
32b9ed1f	176	my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df	177	s/\bCamel\b/$Rakuda/;
3ef515df	178
05ef2f67	179	=head2 PerlIO layers for C<STD(IN\|OUT)>
	180
	181	The B<encoding> pragma also modifies the filehandle layers of
4b291ae6	182	STDIN and STDOUT to the specified encoding. Therefore,
3ef515df	183
	184	use encoding "euc-jp";
	185	my $message = "Camel is the symbol of perl.\n";
	186	my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
	187	$message =~ s/\bCamel\b/$Rakuda/;
	188	print $message;
	189
962111ca	190	Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
962111ca	191	not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df	192
0ab8f81e	193	You can override this by giving extra arguments; see below.
3ef515df	194
990e18f7	195	=head2 Implicit upgrading for byte strings
	196
	197	By default, if strings operating under byte semantics and strings
	198	with Unicode character data are concatenated, the new string will
	199	be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
	200
	201	The B<encoding> pragma changes this to use the specified encoding
	202	instead. For example:
	203
	204	use encoding 'utf8';
	205	my $string = chr(20000); # a Unicode string
	206	utf8::encode($string); # now it's a UTF-8 encoded byte string
	207	# concatenate with another Unicode string
	208	print length($string . chr(20000));
	209
	210	Will print C<2>, because C<$string> is upgraded as UTF-8. Without
	211	C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
	212	is three octets when interpreted as Latin-1.
	213
05ef2f67	214	=head1 FEATURES THAT REQUIRE 5.8.1
	215
	216	Some of the features offered by this pragma requires perl 5.8.1. Most
0f29a567	217	of these are done by Inaba Hiroto. Any other features and changes
05ef2f67	218	are good for 5.8.0.
	219
	220	=over
	221
	222	=item "NON-EUC" doublebyte encodings
	223
0f29a567	224	Because perl needs to parse script before applying this pragma, such
05ef2f67	225	encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
05ef2f67	226	\x5c) in the second byte fails because the second byte may
0f29a567	227	accidentally escape the quoting character that follows. Perl 5.8.1
05ef2f67	228	or later fixes this problem.
	229
	230	=item tr//
	231
	232	C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
	233	See the section below for details.
	234
	235	=item DATA pseudo-filehandle
	236
	237	Another feature that was overlooked was C<DATA>.
	238
	239	=back
	240
3ef515df	241	=head1 USAGE
	242
	243	=over 4
	244
	245	=item use encoding [I<ENCNAME>] ;
	246
05ef2f67	247	Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
	248	exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
	249	":encoding(I<ENCNAME>)".
	250
	251	Note that STDERR WILL NOT be changed.
	252
	253	Also note that non-STD file handles remain unaffected. Use C<use
	254	open> or C<binmode> to change layers of those.
3ef515df	255
3ef515df	256	If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca	257	is consulted. If no encoding can be found, the error C<Unknown encoding
962111ca	258	'I<ENCNAME>'> will be thrown.
3ef515df	259
aae85ceb	260	=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df	261
0ab8f81e	262	You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f	263	C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
32b9ed1f	264	first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb	265	completely off.
3ef515df	266
05ef2f67	267	When ${^UNICODE} exists and non-zero, these options will completely
	268	ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See
	269	L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
	270	details (perl 5.8.1 and later).
	271
151b5d36	272	=item use encoding I<ENCNAME> Filter=E<gt>1;
	273
	274	This turns the encoding pragma into a source filter. While the
	275	default approach just decodes interpolated literals (in qq() and
	276	qr()), this will apply a source filter to the entire source code. See
05ef2f67	277	L</"The Filter Option"> below for details.
151b5d36	278
3ef515df	279	=item no encoding;
3ef515df	280
05ef2f67	281	Unsets the script encoding. The layers of STDIN, STDOUT are
962111ca	282	reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df	283
	284	=back
	285
151b5d36	286	=head1 The Filter Option
	287
	288	The magic of C<use encoding> is not applied to the names of
	289	identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
	290	is a single Han ideograph) work, you still need to write your script
	291	in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
	292
151b5d36	293	What does this mean? Your source code behaves as if it is written in
	294	UTF-8 with 'use utf8' in effect. So even if your editor only supports
	295	Shift_JIS, for example, you can still try examples in Chapter 15 of
	296	C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
	297	identifiers.
	298
	299	This option is significantly slower and (as of this writing) non-ASCII
	300	identifiers are not very stable WITHOUT this option and with the
	301	source code written in UTF-8.
	302
	303	=head2 Filter-related changes at Encode version 1.87
	304
	305	=over
	306
	307	=item *
	308
	309	The Filter option now sets STDIN and STDOUT like non-filter options.
	310	And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
	311	non-filter version.
	312
	313	=item *
	314
	315	C<use utf8> is implicitly declared so you no longer have to C<use
	316	utf8> to C<${"\x{4eba}"}++>.
	317
	318	=back
	319
3ef515df	320	=head1 CAVEATS
	321
	322	=head2 NOT SCOPED
	323
	324	The pragma is a per script, not a per block lexical. Only the last
621b0f8d	325	C<use encoding> or C<no encoding> matters, and it affects
	326	B<the whole script>. However, the <no encoding> pragma is supported and
	327	B<use encoding> can appear as many times as you want in a given script.
	328	The multiple use of this pragma is discouraged.
	329
0f29a567	330	By the same reason, the use this pragma inside modules is also
	331	discouraged (though not as strongly discouranged as the case above.
	332	See below).
05ef2f67	333
	334	If you still have to write a module with this pragma, be very careful
	335	of the load order. See the codes below;
	336
	337	# called module
	338	package Module_IN_BAR;
	339	use encoding "bar";
	340	# stuff in "bar" encoding here
	341	1;
	342
	343	# caller script
	344	use encoding "foo"
	345	use Module_IN_BAR;
	346	# surprise! use encoding "bar" is in effect.
	347
	348	The best way to avoid this oddity is to use this pragma RIGHT AFTER
	349	other modules are loaded. i.e.
	350
	351	use Module_IN_BAR;
	352	use encoding "foo";
3ef515df	353
	354	=head2 DO NOT MIX MULTIPLE ENCODINGS
	355
	356	Notice that only literals (string or regular expression) having only
	357	legacy code points are affected: if you mix data like this
	358
	359	\xDF\x{100}
	360
	361	the data is assumed to be in (Latin 1 and) Unicode, not in your native
	362	encoding. In other words, this will match in "greek":
	363
	364	"\xDF" =~ /\x{3af}/
	365
	366	but this will not
	367
	368	"\xDF\x{100}" =~ /\x{3af}\x{100}/
	369
962111ca	370	since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
	371	the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
	372	LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
	373	should not be mixing your legacy data and Unicode in the same string.
3ef515df	374
	375	This pragma also affects encoding of the 0x80..0xFF code point range:
	376	normally characters in that range are left as eight-bit bytes (unless
	377	they are combined with characters with code points 0x100 or larger,
	378	in which case all characters need to become UTF-8 encoded), but if
	379	the C<encoding> pragma is present, even the 0x80..0xFF range always
	380	gets UTF-8 encoded.
	381
	382	After all, the best thing about this pragma is that you don't have to
0ab8f81e	383	resort to \x{....} just to spell your name in a native encoding.
	384	So feel free to put your strings in your encoding in quotes and
	385	regexes.
3ef515df	386
151b5d36	387	=head2 tr/// with ranges
4b291ae6	388
4b291ae6	389	The B<encoding> pragma works by decoding string literals in
151b5d36	390	C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
4b291ae6	391	does not apply to C<tr///>. Therefore,
	392
	393	use encoding 'euc-jp';
	394	#....
	395	$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
	396	# -------- -------- -------- --------
	397
	398	Does not work as
	399
	400	$kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
	401
	402	=over
	403
	404	=item Legend of characters above
	405
	406	utf8 euc-jp charnames::viacode()
	407	-----------------------------------------
	408	\x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
	409	\x{3093} \xA4\xF3 HIRAGANA LETTER N
	410	\x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
	411	\x{30f3} \xA5\xF3 KATAKANA LETTER N
	412
	413	=back
	414
05ef2f67	415	This counterintuitive behavior has been fixed in perl 5.8.1.
151b5d36	416
4b291ae6	417	=head3 workaround to tr///;
4b291ae6	418
ce16148b	419	In perl 5.8.0, you can work around as follows;
4b291ae6	420
4b291ae6	421	use encoding 'euc-jp';
151b5d36	422	# ....
4b291ae6	423	eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
4b291ae6	424
ce16148b	425	Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind
4b291ae6	426	is the same as classic idiom that makes C<tr///> 'interpolate'.
	427
	428	tr/$from/$to/; # wrong!
	429	eval qq{ tr/$from/$to/ }; # workaround.
	430
	431	Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
	432	C<tr///> not being decoded was obviously against the will of Perl5
05ef2f67	433	Porters so it has been fixed in Perl 5.8.1 or later.
aae85ceb	434
3ef515df	435	=head1 EXAMPLE - Greekperl
	436
	437	use encoding "iso 8859-7";
	438
0ab8f81e	439	# \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df	440
	441	$a = "\xDF";
	442	$b = "\x{100}";
	443
	444	printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
	445
	446	$c = $a . $b;
	447
	448	# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
	449
	450	# chr() is affected, and ...
	451
	452	print "mega\n" if ord(chr(0xdf)) == 0x3af;
	453
	454	# ... ord() is affected by the encoding pragma ...
	455
	456	print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
	457
	458	# ... as are eq and cmp ...
	459
	460	print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
	461	print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
	462
	463	# ... but pack/unpack C are not affected, in case you still
0ab8f81e	464	# want to go back to your native encoding
3ef515df	465
	466	print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
	467
	468	=head1 KNOWN PROBLEMS
	469
151b5d36	470	=over
151b5d36	471
0f29a567	472	=item literals in regex that are longer than 127 bytes
151b5d36	473
0ab8f81e	474	For native multibyte encodings (either fixed or variable length),
3ef515df	475	the current implementation of the regular expressions may introduce
0ab8f81e	476	recoding errors for regular expression literals longer than 127 bytes.
3ef515df	477
05ef2f67	478	=item EBCDIC
151b5d36	479
3ef515df	480	The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e	481	(Porters who are willing and able to remove this limitation are
0ab8f81e	482	welcome.)
3ef515df	483
05ef2f67	484	=item format
	485
	486	This pragma doesn't work well with format because PerlIO does not
	487	get along very well with it. When format contains non-ascii
	488	characters it prints funny or gets "wide character warnings".
	489	To understand it, try the code below.
	490
	491	# Save this one in utf8
	492	# replace non-ascii with a non-ascii string
	493	my $camel;
	494	format STDOUT =
	495	non-ascii@>>>>>>>
	496	$camel
	497	.
	498	$camel = "non-ascii";
	499	binmode(STDOUT=>':encoding(utf8)'); # bang!
	500	write; # funny
	501	print $camel, "\n"; # fine
	502
	503	Without binmode this happens to work but without binmode, print()
	504	fails instead of write().
	505
	506	At any rate, the very use of format is questionable when it comes to
	507	unicode characters since you have to consider such things as character
	508	width (i.e. double-width for ideographs) and directions (i.e. BIDI for
	509	Arabic and Hebrew).
	510
151b5d36	511	=back
151b5d36	512
05ef2f67	513	=head1 HISTORY
	514
	515	This pragma first appeared in Perl 5.8.0. For features that require
	516	5.8.1 and better, see above.
	517
3ef515df	518	=head1 SEE ALSO
3ef515df	519
aae85ceb	520	L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
	521
	522	Ch. 15 of C<Programming Perl (3rd Edition)>
	523	by Larry Wall, Tom Christiansen, Jon Orwant;
	524	O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df	525
3ef515df	526	=cut