package encoding;
-our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.41 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use strict;
+our $DEBUG = 0;
BEGIN {
if (ord("A") == 193) {
require Carp;
- Carp::croak "encoding pragma does not support EBCDIC platforms";
+ Carp::croak("encoding pragma does not support EBCDIC platforms");
}
}
+our $HAS_PERLIO = 0;
+eval { require PerlIO::encoding };
+unless ($@){
+ $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
+}
+
+sub _exception{
+ my $name = shift;
+ $] > 5.008 and return 0; # 5.8.1 then no
+ my %utfs = map {$_=>1}
+ qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
+ UTF-32 UTF-32BE UTF-32LE);
+ $utfs{$name} or return 0; # UTFs or no
+ require Config; Config->import(); our %Config;
+ return $Config{perl_patchlevel} == 0 # maintperl then no
+}
+
sub import {
my $class = shift;
my $name = shift;
my %arg = @_;
$name ||= $ENV{PERL_ENCODING};
-
my $enc = find_encoding($name);
unless (defined $enc) {
require Carp;
- Carp::croak "Unknown encoding '$name'";
+ Carp::croak("Unknown encoding '$name'");
}
- unless ($arg{Filter}){
- ${^ENCODING} = $enc; # this is all you need, actually.
- for my $h (qw(STDIN STDOUT)){
- if ($arg{$h}){
- unless (defined find_encoding($arg{h})) {
- require Carp;
- Carp::croak "Unknown encoding for $h, '$arg{$h}'";
- }
- eval qq{ binmode($h, ":encoding($arg{$h})") };
- }else{
- unless (exists $arg{$h}){
- eval qq{ binmode($h, ":encoding($name)") };
- }
- }
- if ($@){
- require Carp;
- Carp::croak($@);
- }
- }
+ $name = $enc->name; # canonize
+ unless ($arg{Filter}) {
+ $DEBUG and warn "_exception($name) = ", _exception($name);
+ _exception($name) or ${^ENCODING} = $enc;
+ $HAS_PERLIO or return 1;
}else{
defined(${^ENCODING}) and undef ${^ENCODING};
+ # implicitly 'use utf8'
+ require utf8; # to fetch $utf8::hint_bits;
+ $^H |= $utf8::hint_bits;
eval {
require Filter::Util::Call ;
Filter::Util::Call->import ;
- binmode(STDIN, ":raw");
- binmode(STDOUT, ":raw");
filter_add(sub{
- my $status;
- if (($status = filter_read()) > 0){
+ my $status = filter_read();
+ if ($status > 0){
+ # $DEBUG and warn $_;
$_ = $enc->decode($_, 1);
- # warn $_;
+ $DEBUG and warn $_;
}
$status ;
});
};
- # warn "Filter installed";
+ } $DEBUG and warn "Filter installed";
+ for my $h (qw(STDIN STDOUT)){
+ if ($arg{$h}){
+ unless (defined find_encoding($arg{$h})) {
+ require Carp;
+ Carp::croak("Unknown encoding for $h, '$arg{$h}'");
+ }
+ eval { binmode($h, ":encoding($arg{$h})") };
+ }else{
+ unless (exists $arg{$h}){
+ eval {
+ no warnings 'uninitialized';
+ binmode($h, ":encoding($name)");
+ };
+ }
+ }
+ if ($@){
+ require Carp;
+ Carp::croak($@);
+ }
}
return 1; # I doubt if we need it, though
}
sub unimport{
no warnings;
undef ${^ENCODING};
- binmode(STDIN, ":raw");
- binmode(STDOUT, ":raw");
+ if ($HAS_PERLIO){
+ binmode(STDIN, ":raw");
+ binmode(STDOUT, ":raw");
+ }else{
+ binmode(STDIN);
+ binmode(STDOUT);
+ }
if ($INC{"Filter/Util/Call.pm"}){
eval { filter_del() };
}
1;
__END__
+
=pod
=head1 NAME
-encoding - allows you to write your script in non-asii or non-utf8
+encoding - allows you to write your script in non-ascii or non-utf8
=head1 SYNOPSIS
+ use encoding "greek"; # Perl like Greek to you?
use encoding "euc-jp"; # Jperl!
- # or you can even do this if your shell supports euc-jp
-
- > perl -Mencoding=euc-jp -e '...'
-
- # or from the shebang line
+ # or you can even do this if your shell supports your native encoding
- #!/your/path/to/perl -Mencoding=euc-jp
+ perl -Mencoding=latin2 -e '...' # Feeling centrally European?
+ perl -Mencoding=euc-kr -e '...' # Or Korean?
# more control
- # A simple euc-jp => utf-8 converter
- use encoding "euc-jp", STDOUT => "utf8"; while(<>){print};
+ # A simple euc-cn => utf-8 converter
+ use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
# "no encoding;" supported (but not scoped!)
no encoding;
# an alternate way, Filter
use encoding "euc-jp", Filter=>1;
- use utf8;
# now you can use kanji identifiers -- in euc-jp!
=head1 ABSTRACT
-Perl 5.6.0 has introduced Unicode support. You could apply
-C<substr()> and regexes even to complex CJK characters -- so long as
-the script was written in UTF-8. But back then text editors that
-support UTF-8 was still rare and many users rather chose to writer
-scripts in legacy encodings, given up whole new feature of Perl 5.6.
+Let's start with a bit of history: Perl 5.6.0 introduced Unicode
+support. You could apply C<substr()> and regexes even to complex CJK
+characters -- so long as the script was written in UTF-8. But back
+then, text editors that supported UTF-8 were still rare and many users
+instead chose to write scripts in legacy encodings, giving up a whole
+new feature of Perl 5.6.
-With B<encoding> pragma, you can write your script in any encoding you like
-(so long as the C<Encode> module supports it) and still enjoy Unicode
-support. You can write a code in EUC-JP as follows;
+Rewind to the future: starting from perl 5.8.0 with the B<encoding>
+pragma, you can write your script in any encoding you like (so long
+as the C<Encode> module supports it) and still enjoy Unicode support.
+You can write code in EUC-JP as follows:
my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
#<-char-><-char-> # 4 octets
s/\bCamel\b/$Rakuda/;
And with C<use encoding "euc-jp"> in effect, it is the same thing as
-the code in UTF-8 as follow.
+the code in UTF-8:
- my $Rakuda = "\x{99F1}\x{99DD}"; # who Unicode Characters
+ my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
s/\bCamel\b/$Rakuda/;
-The B<encoding> pragma also modifies the file handle disciplines of
-STDIN, STDOUT, and STDERR to the specified encoding. Therefore,
+The B<encoding> pragma also modifies the filehandle disciplines of
+STDIN and STDOUT to the specified encoding. Therefore,
use encoding "euc-jp";
my $message = "Camel is the symbol of perl.\n";
$message =~ s/\bCamel\b/$Rakuda/;
print $message;
-Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not
-"\x{99F1}\x{99DD} is the symbol of perl.\n".
+Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
+not "\x{99F1}\x{99DD} is the symbol of perl.\n".
-You can override this by giving extra arguments. See below.
+You can override this by giving extra arguments; see below.
=head1 USAGE
=item use encoding [I<ENCNAME>] ;
-Sets the script encoding to I<ENCNAME> and file handle disciplines of
-STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will not
-be changed.
+Sets the script encoding to I<ENCNAME>. Filehandle disciplines of
+STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR
+will not be changed.
If no encoding is specified, the environment variable L<PERL_ENCODING>
-is consulted. If no encoding can be found, C<Unknown encoding 'I<ENCNAME>'>
-error will be thrown.
+is consulted. If no encoding can be found, the error C<Unknown encoding
+'I<ENCNAME>'> will be thrown.
Note that non-STD file handles remain unaffected. Use C<use open> or
C<binmode> to change disciplines of those.
=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
-You can also individually set encodings of STDIN and STDOUT via
-STDI<FH> =E<gt> I<ENCNAME_FH> form. In this case, you cannot omit the
-first I<ENCNAME>. C<STDI<FH> =E<gt> undef> turns IO transcoding
+You can also individually set encodings of STDIN and STDOUT via the
+C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
+first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
completely off.
+=item use encoding I<ENCNAME> Filter=E<gt>1;
+
+This turns the encoding pragma into a source filter. While the
+default approach just decodes interpolated literals (in qq() and
+qr()), this will apply a source filter to the entire source code. See
+L</"The Filter Option"> below for details
+
=item no encoding;
-Unsets the script encoding and the disciplines of STDIN, STDOUT are
-reset to ":raw".
+Unsets the script encoding. The disciplines of STDIN, STDOUT are
+reset to ":raw" (the default unprocessed raw stream of bytes).
+
+=back
+
+=head1 The Filter Option
+
+The magic of C<use encoding> is not applied to the names of
+identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
+is a single Han ideograph) work, you still need to write your script
+in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
+
+
+What does this mean? Your source code behaves as if it is written in
+UTF-8 with 'use utf8' in effect. So even if your editor only supports
+Shift_JIS, for example, you can still try examples in Chapter 15 of
+C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
+identifiers.
+
+This option is significantly slower and (as of this writing) non-ASCII
+identifiers are not very stable WITHOUT this option and with the
+source code written in UTF-8.
+
+=head2 Filter-related changes at Encode version 1.87
+
+=over
+
+=item *
+
+The Filter option now sets STDIN and STDOUT like non-filter options.
+And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
+non-filter version.
+
+=item *
+
+C<use utf8> is implicitly declared so you no longer have to C<use
+utf8> to C<${"\x{4eba}"}++>.
=back
=head2 NOT SCOPED
The pragma is a per script, not a per block lexical. Only the last
-C<use encoding> or C<matters, and it affects B<the whole script>.
-Though <no encoding> pragma is supported and C<use encoding> can
-appear as many times as you want in a given script, the multiple use
-of this pragma is discouraged.
+C<use encoding> or C<no encoding> matters, and it affects
+B<the whole script>. However, the <no encoding> pragma is supported and
+B<use encoding> can appear as many times as you want in a given script.
+The multiple use of this pragma is discouraged.
+
+Because of this nature, the use of this pragma inside the module is
+strongly discouraged (because the influence of this pragma lasts not
+only for the module but the script that uses). But if you have to,
+make sure you say C<no encoding> at the end of the module so you
+contain the influence of the pragma within the module.
=head2 DO NOT MIX MULTIPLE ENCODINGS
"\xDF\x{100}" =~ /\x{3af}\x{100}/
-since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}>
-because of the C<\x{100}> on the left. You should not be mixing your
-legacy data and Unicode in the same string.
+since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
+the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
+LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
+should not be mixing your legacy data and Unicode in the same string.
This pragma also affects encoding of the 0x80..0xFF code point range:
normally characters in that range are left as eight-bit bytes (unless
gets UTF-8 encoded.
After all, the best thing about this pragma is that you don't have to
-resort to \x... just to spell your name in native encoding. So feel
-free to put your strings in your encoding in quotes and regexes.
+resort to \x{....} just to spell your name in a native encoding.
+So feel free to put your strings in your encoding in quotes and
+regexes.
-=head1 NON-ASCII Identifiers and Filter option
+=head2 format doesn't work well
-The magic of C<use encoding> is not applied to the names of identifiers.
-In order to make C<${"4eba"}++> ($man++, where man is a single ideograph)
-work, you still need to write your script in UTF-8 or use a source filter.
+This pragma doesn't work well with format because PerlIO does not
+get along very well with it. When format contains non-ascii
+characters it prints funny or gets "wide character warnings".
+To understand it, try the code below.
-In other words, the same restriction as Jperl applies.
+ # Save this one in utf8
+ # replace *non-ascii* with a non-ascii string
+ my $camel;
+ format STDOUT =
+ *non-ascii*@>>>>>>>
+ $camel
+ .
+ $camel = "*non-ascii*";
+ binmode(STDOUT=>':encoding(utf8)'); # bang!
+ write; # funny
+ print $camel, "\n"; # fine
-If you dare experiment, however, you can try Fitlter option.
+Without binmode this happens to work but without binmode, print()
+fails instead of write().
-=over 4
+At any rate, the very use of format is questionable when it comes to
+unicode characters since you have to consider such things as character
+width (i.e. double-width for ideographs) and directions (i.e. BIDI for
+Arabic and Hebrew).
-=item use encoding I<ENCNAME> Filter=E<gt>1;
+=head2 tr/// with ranges
+
+The B<encoding> pragma works by decoding string literals in
+C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
+does not apply to C<tr///>. Therefore,
+
+ use encoding 'euc-jp';
+ #....
+ $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
+ # -------- -------- -------- --------
+
+Does not work as
-This turns encoding pragma into source filter. While the default
-approach just decodes interpolated literals (in qq() and qr()), this
-will apply source filter to entire source code. In this case, STDIN
-and STDOUT remain untouched.
+ $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
+
+=over
+
+=item Legend of characters above
+
+ utf8 euc-jp charnames::viacode()
+ -----------------------------------------
+ \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
+ \x{3093} \xA4\xF3 HIRAGANA LETTER N
+ \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
+ \x{30f3} \xA5\xF3 KATAKANA LETTER N
=back
-What does this mean? Your source code behaves as if it is written
-in UTF-8. So even if your editor only supports Shift_JIS, for
-example. You can still try examples in Chapter 15 of
-C<Programming Perl, 3rd Ed.> For instance, you can use UTF-8
-identifiers.
+This counterintuitive behavior has been fixed in perl 5.8.1 and up
+by INABA Hirohito.
-This option is significantly slower and (as of this writing) non-ASCII
-identifiers are not very stable WITHOUT this option and with the
-source code written in UTF-8.
+=head3 workaround to tr///;
+
+In perl 5.8.0, you can work aroud as follows;
-To make your script in legacy encoding work with minimum effort, do
-not use Filter=E<gt>1
+ use encoding 'euc-jp';
+ # ....
+ eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
+Note the C<tr//> expression is surronded by C<qq{}>. The idea behind
+is the same as classic idiom that makes C<tr///> 'interpolate'.
+
+ tr/$from/$to/; # wrong!
+ eval qq{ tr/$from/$to/ }; # workaround.
+
+Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
+C<tr///> not being decoded was obviously against the will of Perl5
+Porters so it has been fixed.
=head1 EXAMPLE - Greekperl
use encoding "iso 8859-7";
- # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
+ # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
$a = "\xDF";
$b = "\x{100}";
print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
# ... but pack/unpack C are not affected, in case you still
- # want back to your native encoding
+ # want to go back to your native encoding
print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
=head1 KNOWN PROBLEMS
-For native multibyte encodings (either fixed or variable length)
+=over
+
+=item *
+
+For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
-recoding errors for longer regular expression literals than 127 bytes.
+recoding errors for regular expression literals longer than 127 bytes.
+
+=item *
The encoding pragma is not supported on EBCDIC platforms.
-(Porters wanted.)
+(Porters who are willing and able to remove this limitation are
+welcome.)
+
+=back
=head1 SEE ALSO