[p5sagit/p5-mst-13.2.git] / ext / Encode / encoding.pm

package encoding;
our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use Encode;
use strict;

BEGIN {
    if (ord("A") == 193) {
	require Carp;
	Carp::croak("encoding pragma does not support EBCDIC platforms");
    }
}

our $HAS_PERLIO = 0;
eval { require PerlIO::encoding };
unless ($@){
    $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
}

my %utfs = map {$_=>1}
    qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE);

sub import {
    my $class = shift;
    my $name  = shift;
    my %arg = @_;
    $name ||= $ENV{PERL_ENCODING};

    my $enc = find_encoding($name);
    unless (defined $enc) {
	require Carp;
	Carp::croak("Unknown encoding '$name'");
    }
    $name = $enc->name; # canonize
    unless ($arg{Filter}) {
	${^ENCODING} = $enc unless $] <= 5.008 and $utfs{$name};
	$HAS_PERLIO or return 1;
	for my $h (qw(STDIN STDOUT)){
	    if ($arg{$h}){
		unless (defined find_encoding($arg{$h})) {
		    require Carp;
		    Carp::croak("Unknown encoding for $h, '$arg{$h}'");
		}
		eval { binmode($h, ":encoding($arg{$h})") };
	    }else{
		unless (exists $arg{$h}){
		    eval { 
			no warnings 'uninitialized';
			binmode($h, ":encoding($name)");
		    };
		}
	    }
	    if ($@){
		require Carp;
		Carp::croak($@);
	    }
	}
    }else{
	defined(${^ENCODING}) and undef ${^ENCODING};
	eval {
	    require Filter::Util::Call ;
	    Filter::Util::Call->import ;
	    binmode(STDIN);
	    binmode(STDOUT);
	    filter_add(sub{
			   my $status;
                           if (($status = filter_read()) > 0){
			       $_ = $enc->decode($_, 1);
			       # warn $_;
			   }
			   $status ;
		       });
	};
	# warn "Filter installed";
    }
    return 1; # I doubt if we need it, though
}

sub unimport{
    no warnings;
    undef ${^ENCODING};
    if ($HAS_PERLIO){
	binmode(STDIN,  ":raw");
	binmode(STDOUT, ":raw");
    }else{
    binmode(STDIN);
    binmode(STDOUT);
    }
    if ($INC{"Filter/Util/Call.pm"}){
	eval { filter_del() };
    }
}

1;
__END__

=pod

=head1 NAME

encoding - allows you to write your script in non-ascii or non-utf8

=head1 SYNOPSIS

  use encoding "greek";  # Perl like Greek to you?
  use encoding "euc-jp"; # Jperl!

  # or you can even do this if your shell supports your native encoding

  perl -Mencoding=latin2 -e '...' # Feeling centrally European?
  perl -Mencoding=euc-kr -e '...' # Or Korean?

  # more control

  # A simple euc-cn => utf-8 converter
  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};

  # "no encoding;" supported (but not scoped!)
  no encoding;

  # an alternate way, Filter
  use encoding "euc-jp", Filter=>1;
  use utf8;
  # now you can use kanji identifiers -- in euc-jp!

=head1 ABSTRACT

Let's start with a bit of history: Perl 5.6.0 introduced Unicode
support.  You could apply C<substr()> and regexes even to complex CJK
characters -- so long as the script was written in UTF-8.  But back
then, text editors that supported UTF-8 were still rare and many users
instead chose to write scripts in legacy encodings, giving up a whole
new feature of Perl 5.6.

Rewind to the future: starting from perl 5.8.0 with the B<encoding>
pragma, you can write your script in any encoding you like (so long
as the C<Encode> module supports it) and still enjoy Unicode support.
You can write code in EUC-JP as follows:

  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
               #<-char-><-char->   # 4 octets
  s/\bCamel\b/$Rakuda/;

And with C<use encoding "euc-jp"> in effect, it is the same thing as
the code in UTF-8:

  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
  s/\bCamel\b/$Rakuda/;

The B<encoding> pragma also modifies the filehandle disciplines of
STDIN and STDOUT to the specified encoding.  Therefore,

  use encoding "euc-jp";
  my $message = "Camel is the symbol of perl.\n";
  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
  $message =~ s/\bCamel\b/$Rakuda/;
  print $message;

Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
not "\x{99F1}\x{99DD} is the symbol of perl.\n".

You can override this by giving extra arguments; see below.

=head1 USAGE

=over 4

=item use encoding [I<ENCNAME>] ;

Sets the script encoding to I<ENCNAME>. Filehandle disciplines of
STDIN and STDOUT are set to ":encoding(I<ENCNAME>)".  Note that STDERR
will not be changed.

If no encoding is specified, the environment variable L<PERL_ENCODING>
is consulted.  If no encoding can be found, the error C<Unknown encoding
'I<ENCNAME>'> will be thrown.

Note that non-STD file handles remain unaffected.  Use C<use open> or
C<binmode> to change disciplines of those.

=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;

You can also individually set encodings of STDIN and STDOUT via the
C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
completely off.

=item no encoding;

Unsets the script encoding. The disciplines of STDIN, STDOUT are
reset to ":raw" (the default unprocessed raw stream of bytes).

=back

=head1 CAVEATS

=head2 NOT SCOPED

The pragma is a per script, not a per block lexical.  Only the last
C<use encoding> or C<no encoding> matters, and it affects 
B<the whole script>.  However, the <no encoding> pragma is supported and 
B<use encoding> can appear as many times as you want in a given script. 
The multiple use of this pragma is discouraged.

Because of this nature, the use of this pragma inside the module is
strongly discouraged (because the influence of this pragma lasts not
only for the module but the script that uses).  But if you have to,
make sure you say C<no encoding> at the end of the module so you
contain the influence of the pragma within the module.

=head2 DO NOT MIX MULTIPLE ENCODINGS

Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this

	\xDF\x{100}

the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding.  In other words, this will match in "greek":

	"\xDF" =~ /\x{3af}/

but this will not

	"\xDF\x{100}" =~ /\x{3af}\x{100}/

since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
should not be mixing your legacy data and Unicode in the same string.

This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
they are combined with characters with code points 0x100 or larger,
in which case all characters need to become UTF-8 encoded), but if
the C<encoding> pragma is present, even the 0x80..0xFF range always
gets UTF-8 encoded.

After all, the best thing about this pragma is that you don't have to
resort to \x{....} just to spell your name in a native encoding.
So feel free to put your strings in your encoding in quotes and
regexes.

=head2 tr/// with ranges remain unaffected

The B<encoding> pragma works by decoding string literals in
C<q//,qq//,qr//,qw///, qx//> and so forth.  As of perl 5.8.0, this
does not apply to C<tr///>.  Therefore,

  use encoding 'euc-jp';
  #....
  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
  #           -------- -------- -------- --------

Does not work as

  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;

=over

=item Legend of characters above

  utf8     euc-jp   charnames::viacode()
  -----------------------------------------
  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
  \x{3093} \xA4\xF3 HIRAGANA LETTER N
  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
  \x{30f3} \xA5\xF3 KATAKANA LETTER N

=back

=head3 workaround to tr///;

You can, however, achieve the same as simply as follows;

  use encoding 'euc-jp';
  # ....
  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };

Note the C<tr//> expression is surronded by C<qq{}>.  The idea behind
is the same as classic idiom that makes C<tr///> 'interpolate'.

   tr/$from/$to/;            # wrong!
   eval qq{ tr/$from/$to/ }; # workaround.

Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
C<tr///> not being decoded was obviously against the will of Perl5
Porters.  In future version of perl, this counter-intuitive behaviour
of C<tr///> will be fixed so C<eval qq{}> trick will be unneccesary.

=head1 Non-ASCII Identifiers and Filter option

The magic of C<use encoding> is not applied to the names of
identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
is a single Han ideograph) work, you still need to write your script
in UTF-8 or use a source filter.

In other words, the same restriction as with Jperl applies.

If you dare to experiment, however, you can try the Filter option.

=over 4

=item use encoding I<ENCNAME> Filter=E<gt>1;

This turns the encoding pragma into a source filter.  While the default
approach just decodes interpolated literals (in qq() and qr()), this
will apply a source filter to the entire source code.  In this case,
STDIN and STDOUT remain untouched.

=back

What does this mean?  Your source code behaves as if it is written in
UTF-8.  So even if your editor only supports Shift_JIS, for example,
you can still try examples in Chapter 15 of C<Programming Perl, 3rd
Ed.>.  For instance, you can use UTF-8 identifiers.

This option is significantly slower and (as of this writing) non-ASCII
identifiers are not very stable WITHOUT this option and with the
source code written in UTF-8.

To make your script in legacy encoding work with minimum effort,
do not use Filter=E<gt>1.

=head1 EXAMPLE - Greekperl

    use encoding "iso 8859-7";

    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.

    $a = "\xDF";
    $b = "\x{100}";

    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf

    $c = $a . $b;

    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".

    # chr() is affected, and ...

    print "mega\n"  if ord(chr(0xdf)) == 0x3af;

    # ... ord() is affected by the encoding pragma ...

    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;

    # ... as are eq and cmp ...

    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;

    # ... but pack/unpack C are not affected, in case you still
    # want to go back to your native encoding

    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;

=head1 KNOWN PROBLEMS

For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.

The encoding pragma is not supported on EBCDIC platforms.
(Porters who are willing and able to remove this limitation are
welcome.)

=head1 SEE ALSO

L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,

Ch. 15 of C<Programming Perl (3rd Edition)>
by Larry Wall, Tom Christiansen, Jon Orwant;
O'Reilly & Associates; ISBN 0-596-00027-8

=cut
Commit	Line	Data
3ef515df	1	package encoding;
fa6f41cf	2	our $VERSION = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
3ef515df	3
3ef515df	4	use Encode;
046f36bf	5	use strict;
3ef515df	6
	7	BEGIN {
	8	if (ord("A") == 193) {
	9	require Carp;
10c5ecbb	10	Carp::croak("encoding pragma does not support EBCDIC platforms");
3ef515df	11	}
	12	}
	13
0ab8f81e	14	our $HAS_PERLIO = 0;
	15	eval { require PerlIO::encoding };
	16	unless ($@){
	17	$HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
	18	}
b2704119	19
fa6f41cf	20	my %utfs = map {$_=>1}
	21	qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE);
	22
3ef515df	23	sub import {
	24	my $class = shift;
	25	my $name = shift;
	26	my %arg = @_;
	27	$name \|\|= $ENV{PERL_ENCODING};
	28
	29	my $enc = find_encoding($name);
	30	unless (defined $enc) {
	31	require Carp;
10c5ecbb	32	Carp::croak("Unknown encoding '$name'");
3ef515df	33	}
fa6f41cf	34	$name = $enc->name; # canonize
0f7c507f	35	unless ($arg{Filter}) {
fa6f41cf	36	${^ENCODING} = $enc unless $] <= 5.008 and $utfs{$name};
85982a32	37	$HAS_PERLIO or return 1;
aae85ceb	38	for my $h (qw(STDIN STDOUT)){
aae85ceb	39	if ($arg{$h}){
b2704119	40	unless (defined find_encoding($arg{$h})) {
aae85ceb	41	require Carp;
10c5ecbb	42	Carp::croak("Unknown encoding for $h, '$arg{$h}'");
aae85ceb	43	}
0ab8f81e	44	eval { binmode($h, ":encoding($arg{$h})") };
aae85ceb	45	}else{
aae85ceb	46	unless (exists $arg{$h}){
0ab8f81e	47	eval {
	48	no warnings 'uninitialized';
	49	binmode($h, ":encoding($name)");
	50	};
aae85ceb	51	}
	52	}
	53	if ($@){
3ef515df	54	require Carp;
aae85ceb	55	Carp::croak($@);
3ef515df	56	}
3ef515df	57	}
aae85ceb	58	}else{
	59	defined(${^ENCODING}) and undef ${^ENCODING};
	60	eval {
	61	require Filter::Util::Call ;
	62	Filter::Util::Call->import ;
b2704119	63	binmode(STDIN);
b2704119	64	binmode(STDOUT);
aae85ceb	65	filter_add(sub{
	66	my $status;
	67	if (($status = filter_read()) > 0){
	68	$_ = $enc->decode($_, 1);
	69	# warn $_;
	70	}
	71	$status ;
	72	});
	73	};
	74	# warn "Filter installed";
3ef515df	75	}
	76	return 1; # I doubt if we need it, though
	77	}
	78
	79	sub unimport{
	80	no warnings;
	81	undef ${^ENCODING};
621b0f8d	82	if ($HAS_PERLIO){
	83	binmode(STDIN, ":raw");
	84	binmode(STDOUT, ":raw");
	85	}else{
b2704119	86	binmode(STDIN);
b2704119	87	binmode(STDOUT);
621b0f8d	88	}
aae85ceb	89	if ($INC{"Filter/Util/Call.pm"}){
	90	eval { filter_del() };
	91	}
3ef515df	92	}
	93
	94	1;
	95	__END__
85982a32	96
3ef515df	97	=pod
	98
	99	=head1 NAME
	100
0ab8f81e	101	encoding - allows you to write your script in non-ascii or non-utf8
3ef515df	102
	103	=head1 SYNOPSIS
	104
962111ca	105	use encoding "greek"; # Perl like Greek to you?
3ef515df	106	use encoding "euc-jp"; # Jperl!
3ef515df	107
962111ca	108	# or you can even do this if your shell supports your native encoding
3ef515df	109
962111ca	110	perl -Mencoding=latin2 -e '...' # Feeling centrally European?
0ab8f81e	111	perl -Mencoding=euc-kr -e '...' # Or Korean?
3ef515df	112
3ef515df	113	# more control
3ef515df	114
962111ca	115	# A simple euc-cn => utf-8 converter
6d1c0808	116	use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
3ef515df	117
	118	# "no encoding;" supported (but not scoped!)
	119	no encoding;
	120
aae85ceb	121	# an alternate way, Filter
	122	use encoding "euc-jp", Filter=>1;
	123	use utf8;
	124	# now you can use kanji identifiers -- in euc-jp!
	125
3ef515df	126	=head1 ABSTRACT
3ef515df	127
962111ca	128	Let's start with a bit of history: Perl 5.6.0 introduced Unicode
	129	support. You could apply C<substr()> and regexes even to complex CJK
	130	characters -- so long as the script was written in UTF-8. But back
0ab8f81e	131	then, text editors that supported UTF-8 were still rare and many users
	132	instead chose to write scripts in legacy encodings, giving up a whole
	133	new feature of Perl 5.6.
3ef515df	134
0ab8f81e	135	Rewind to the future: starting from perl 5.8.0 with the B<encoding>
962111ca	136	pragma, you can write your script in any encoding you like (so long
962111ca	137	as the C<Encode> module supports it) and still enjoy Unicode support.
0ab8f81e	138	You can write code in EUC-JP as follows:
3ef515df	139
	140	my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
	141	#<-char-><-char-> # 4 octets
	142	s/\bCamel\b/$Rakuda/;
	143
	144	And with C<use encoding "euc-jp"> in effect, it is the same thing as
962111ca	145	the code in UTF-8:
3ef515df	146
32b9ed1f	147	my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
3ef515df	148	s/\bCamel\b/$Rakuda/;
3ef515df	149
962111ca	150	The B<encoding> pragma also modifies the filehandle disciplines of
4b291ae6	151	STDIN and STDOUT to the specified encoding. Therefore,
3ef515df	152
	153	use encoding "euc-jp";
	154	my $message = "Camel is the symbol of perl.\n";
	155	my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
	156	$message =~ s/\bCamel\b/$Rakuda/;
	157	print $message;
	158
962111ca	159	Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
962111ca	160	not "\x{99F1}\x{99DD} is the symbol of perl.\n".
3ef515df	161
0ab8f81e	162	You can override this by giving extra arguments; see below.
3ef515df	163
	164	=head1 USAGE
	165
	166	=over 4
	167
	168	=item use encoding [I<ENCNAME>] ;
	169
0ab8f81e	170	Sets the script encoding to I<ENCNAME>. Filehandle disciplines of
	171	STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR
	172	will not be changed.
3ef515df	173
3ef515df	174	If no encoding is specified, the environment variable L<PERL_ENCODING>
962111ca	175	is consulted. If no encoding can be found, the error C<Unknown encoding
962111ca	176	'I<ENCNAME>'> will be thrown.
3ef515df	177
	178	Note that non-STD file handles remain unaffected. Use C<use open> or
	179	C<binmode> to change disciplines of those.
	180
aae85ceb	181	=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
3ef515df	182
0ab8f81e	183	You can also individually set encodings of STDIN and STDOUT via the
32b9ed1f	184	C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
32b9ed1f	185	first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
aae85ceb	186	completely off.
3ef515df	187
	188	=item no encoding;
	189
0ab8f81e	190	Unsets the script encoding. The disciplines of STDIN, STDOUT are
962111ca	191	reset to ":raw" (the default unprocessed raw stream of bytes).
3ef515df	192
	193	=back
	194
	195	=head1 CAVEATS
	196
	197	=head2 NOT SCOPED
	198
	199	The pragma is a per script, not a per block lexical. Only the last
621b0f8d	200	C<use encoding> or C<no encoding> matters, and it affects
	201	B<the whole script>. However, the <no encoding> pragma is supported and
	202	B<use encoding> can appear as many times as you want in a given script.
	203	The multiple use of this pragma is discouraged.
	204
	205	Because of this nature, the use of this pragma inside the module is
	206	strongly discouraged (because the influence of this pragma lasts not
	207	only for the module but the script that uses). But if you have to,
	208	make sure you say C<no encoding> at the end of the module so you
	209	contain the influence of the pragma within the module.
3ef515df	210
	211	=head2 DO NOT MIX MULTIPLE ENCODINGS
	212
	213	Notice that only literals (string or regular expression) having only
	214	legacy code points are affected: if you mix data like this
	215
	216	\xDF\x{100}
	217
	218	the data is assumed to be in (Latin 1 and) Unicode, not in your native
	219	encoding. In other words, this will match in "greek":
	220
	221	"\xDF" =~ /\x{3af}/
	222
	223	but this will not
	224
	225	"\xDF\x{100}" =~ /\x{3af}\x{100}/
	226
962111ca	227	since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
	228	the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
	229	LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
	230	should not be mixing your legacy data and Unicode in the same string.
3ef515df	231
	232	This pragma also affects encoding of the 0x80..0xFF code point range:
	233	normally characters in that range are left as eight-bit bytes (unless
	234	they are combined with characters with code points 0x100 or larger,
	235	in which case all characters need to become UTF-8 encoded), but if
	236	the C<encoding> pragma is present, even the 0x80..0xFF range always
	237	gets UTF-8 encoded.
	238
	239	After all, the best thing about this pragma is that you don't have to
0ab8f81e	240	resort to \x{....} just to spell your name in a native encoding.
	241	So feel free to put your strings in your encoding in quotes and
	242	regexes.
3ef515df	243
4b291ae6	244	=head2 tr/// with ranges remain unaffected
	245
	246	The B<encoding> pragma works by decoding string literals in
	247	C<q//,qq//,qr//,qw///, qx//> and so forth. As of perl 5.8.0, this
	248	does not apply to C<tr///>. Therefore,
	249
	250	use encoding 'euc-jp';
	251	#....
	252	$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
	253	# -------- -------- -------- --------
	254
	255	Does not work as
	256
	257	$kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
	258
	259	=over
	260
	261	=item Legend of characters above
	262
	263	utf8 euc-jp charnames::viacode()
	264	-----------------------------------------
	265	\x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
	266	\x{3093} \xA4\xF3 HIRAGANA LETTER N
	267	\x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
	268	\x{30f3} \xA5\xF3 KATAKANA LETTER N
	269
	270	=back
	271
	272	=head3 workaround to tr///;
	273
	274	You can, however, achieve the same as simply as follows;
	275
	276	use encoding 'euc-jp';
	277	# ....
	278	eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
	279
	280	Note the C<tr//> expression is surronded by C<qq{}>. The idea behind
	281	is the same as classic idiom that makes C<tr///> 'interpolate'.
	282
	283	tr/$from/$to/; # wrong!
	284	eval qq{ tr/$from/$to/ }; # workaround.
	285
	286	Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
	287	C<tr///> not being decoded was obviously against the will of Perl5
	288	Porters. In future version of perl, this counter-intuitive behaviour
	289	of C<tr///> will be fixed so C<eval qq{}> trick will be unneccesary.
	290
962111ca	291	=head1 Non-ASCII Identifiers and Filter option
aae85ceb	292
962111ca	293	The magic of C<use encoding> is not applied to the names of
32b9ed1f	294	identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
962111ca	295	is a single Han ideograph) work, you still need to write your script
962111ca	296	in UTF-8 or use a source filter.
aae85ceb	297
0ab8f81e	298	In other words, the same restriction as with Jperl applies.
aae85ceb	299
0ab8f81e	300	If you dare to experiment, however, you can try the Filter option.
aae85ceb	301
	302	=over 4
	303
	304	=item use encoding I<ENCNAME> Filter=E<gt>1;
	305
0ab8f81e	306	This turns the encoding pragma into a source filter. While the default
aae85ceb	307	approach just decodes interpolated literals (in qq() and qr()), this
0ab8f81e	308	will apply a source filter to the entire source code. In this case,
0ab8f81e	309	STDIN and STDOUT remain untouched.
aae85ceb	310
	311	=back
	312
962111ca	313	What does this mean? Your source code behaves as if it is written in
0ab8f81e	314	UTF-8. So even if your editor only supports Shift_JIS, for example,
	315	you can still try examples in Chapter 15 of C<Programming Perl, 3rd
	316	Ed.>. For instance, you can use UTF-8 identifiers.
aae85ceb	317
	318	This option is significantly slower and (as of this writing) non-ASCII
	319	identifiers are not very stable WITHOUT this option and with the
	320	source code written in UTF-8.
	321
962111ca	322	To make your script in legacy encoding work with minimum effort,
962111ca	323	do not use Filter=E<gt>1.
aae85ceb	324
3ef515df	325	=head1 EXAMPLE - Greekperl
	326
	327	use encoding "iso 8859-7";
	328
0ab8f81e	329	# \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
3ef515df	330
	331	$a = "\xDF";
	332	$b = "\x{100}";
	333
	334	printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
	335
	336	$c = $a . $b;
	337
	338	# $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
	339
	340	# chr() is affected, and ...
	341
	342	print "mega\n" if ord(chr(0xdf)) == 0x3af;
	343
	344	# ... ord() is affected by the encoding pragma ...
	345
	346	print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
	347
	348	# ... as are eq and cmp ...
	349
	350	print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
	351	print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
	352
	353	# ... but pack/unpack C are not affected, in case you still
0ab8f81e	354	# want to go back to your native encoding
3ef515df	355
	356	print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
	357
	358	=head1 KNOWN PROBLEMS
	359
0ab8f81e	360	For native multibyte encodings (either fixed or variable length),
3ef515df	361	the current implementation of the regular expressions may introduce
0ab8f81e	362	recoding errors for regular expression literals longer than 127 bytes.
3ef515df	363
3ef515df	364	The encoding pragma is not supported on EBCDIC platforms.
0ab8f81e	365	(Porters who are willing and able to remove this limitation are
0ab8f81e	366	welcome.)
3ef515df	367
	368	=head1 SEE ALSO
	369
aae85ceb	370	L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
	371
	372	Ch. 15 of C<Programming Perl (3rd Edition)>
	373	by Larry Wall, Tom Christiansen, Jon Orwant;
	374	O'Reilly & Associates; ISBN 0-596-00027-8
3ef515df	375
3ef515df	376	=cut