+# $Id: encoding.pm,v 2.2 2005/09/08 14:17:17 dankogai Exp $
package encoding;
-our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use strict;
+sub DEBUG () { 0 }
+
BEGIN {
if (ord("A") == 193) {
require Carp;
- Carp::croak("encoding pragma does not support EBCDIC platforms");
+ Carp::croak("encoding: pragma does not support EBCDIC platforms");
}
}
$HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
}
+sub _exception{
+ my $name = shift;
+ $] > 5.008 and return 0; # 5.8.1 or higher then no
+ my %utfs = map {$_=>1}
+ qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
+ UTF-32 UTF-32BE UTF-32LE);
+ $utfs{$name} or return 0; # UTFs or no
+ require Config; Config->import(); our %Config;
+ return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
+}
+
+sub in_locale { $^H & ($locale::hint_bits || 0)}
+
+sub _get_locale_encoding {
+ my $locale_encoding;
+
+ # I18N::Langinfo isn't available everywhere
+ eval {
+ require I18N::Langinfo;
+ I18N::Langinfo->import(qw(langinfo CODESET));
+ $locale_encoding = langinfo(CODESET());
+ };
+
+ my $country_language;
+
+ no warnings 'uninitialized';
+
+ if (not $locale_encoding && in_locale()) {
+ if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) {
+ ($country_language, $locale_encoding) = ($1, $2);
+ } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) {
+ ($country_language, $locale_encoding) = ($1, $2);
+ }
+ # LANGUAGE affects only LC_MESSAGES only on glibc
+ } elsif (not $locale_encoding) {
+ if ($ENV{LC_ALL} =~ /\butf-?8\b/i ||
+ $ENV{LANG} =~ /\butf-?8\b/i) {
+ $locale_encoding = 'utf8';
+ }
+ # Could do more heuristics based on the country and language
+ # parts of LC_ALL and LANG (the parts before the dot (if any)),
+ # since we have Locale::Country and Locale::Language available.
+ # TODO: get a database of Language -> Encoding mappings
+ # (the Estonian database at http://www.eki.ee/letter/
+ # would be excellent!) --jhi
+ }
+ if (defined $locale_encoding &&
+ lc($locale_encoding) eq 'euc' &&
+ defined $country_language) {
+ if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) {
+ $locale_encoding = 'euc-jp';
+ } elsif ($country_language =~ /^ko_KR|korean?$/i) {
+ $locale_encoding = 'euc-kr';
+ } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) {
+ $locale_encoding = 'euc-cn';
+ } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) {
+ $locale_encoding = 'euc-tw';
+ } else {
+ require Carp;
+ Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous");
+ }
+ }
+
+ return $locale_encoding;
+}
+
sub import {
my $class = shift;
my $name = shift;
+ if ($name eq ':_get_locale_encoding') { # used by lib/open.pm
+ my $caller = caller();
+ {
+ no strict 'refs';
+ *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
+ }
+ return;
+ }
+ $name = _get_locale_encoding() if $name eq ':locale';
my %arg = @_;
- $name ||= $ENV{PERL_ENCODING};
-
+ $name = $ENV{PERL_ENCODING} unless defined $name;
my $enc = find_encoding($name);
unless (defined $enc) {
require Carp;
- Carp::croak("Unknown encoding '$name'");
+ Carp::croak("encoding: Unknown encoding '$name'");
}
- unless ($arg{Filter}){
- ${^ENCODING} = $enc; # this is all you need, actually.
+ $name = $enc->name; # canonize
+ unless ($arg{Filter}) {
+ DEBUG and warn "_exception($name) = ", _exception($name);
+ _exception($name) or ${^ENCODING} = $enc;
$HAS_PERLIO or return 1;
- for my $h (qw(STDIN STDOUT)){
- if ($arg{$h}){
- unless (defined find_encoding($arg{$h})) {
- require Carp;
- Carp::croak("Unknown encoding for $h, '$arg{$h}'");
- }
- eval { binmode($h, ":encoding($arg{$h})") };
- }else{
- unless (exists $arg{$h}){
- eval {
- no warnings 'uninitialized';
- binmode($h, ":encoding($name)");
- };
- }
- }
- if ($@){
- require Carp;
- Carp::croak($@);
- }
- }
}else{
defined(${^ENCODING}) and undef ${^ENCODING};
+ # implicitly 'use utf8'
+ require utf8; # to fetch $utf8::hint_bits;
+ $^H |= $utf8::hint_bits;
eval {
require Filter::Util::Call ;
Filter::Util::Call->import ;
- binmode(STDIN);
- binmode(STDOUT);
filter_add(sub{
- my $status;
- if (($status = filter_read()) > 0){
+ my $status = filter_read();
+ if ($status > 0){
$_ = $enc->decode($_, 1);
- # warn $_;
+ DEBUG and warn $_;
}
$status ;
});
};
- # warn "Filter installed";
+ $@ eq '' and DEBUG and warn "Filter installed";
+ }
+ defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
+ for my $h (qw(STDIN STDOUT)){
+ if ($arg{$h}){
+ unless (defined find_encoding($arg{$h})) {
+ require Carp;
+ Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'");
+ }
+ eval { binmode($h, ":raw :encoding($arg{$h})") };
+ }else{
+ unless (exists $arg{$h}){
+ eval {
+ no warnings 'uninitialized';
+ binmode($h, ":raw :encoding($name)");
+ };
+ }
+ }
+ if ($@){
+ require Carp;
+ Carp::croak($@);
+ }
}
return 1; # I doubt if we need it, though
}
binmode(STDIN, ":raw");
binmode(STDOUT, ":raw");
}else{
- binmode(STDIN);
- binmode(STDOUT);
+ binmode(STDIN);
+ binmode(STDOUT);
}
if ($INC{"Filter/Util/Call.pm"}){
eval { filter_del() };
# an alternate way, Filter
use encoding "euc-jp", Filter=>1;
- use utf8;
# now you can use kanji identifiers -- in euc-jp!
+ # switch on locale -
+ # note that this probably means that unless you have a complete control
+ # over the environments the application is ever going to be run, you should
+ # NOT use the feature of encoding pragma allowing you to write your script
+ # in any recognized encoding because changing locale settings will wreck
+ # the script; you can of course still use the other features of the pragma.
+ use encoding ':locale';
+
=head1 ABSTRACT
Let's start with a bit of history: Perl 5.6.0 introduced Unicode
Rewind to the future: starting from perl 5.8.0 with the B<encoding>
pragma, you can write your script in any encoding you like (so long
as the C<Encode> module supports it) and still enjoy Unicode support.
+This pragma achieves that by doing the following:
+
+=over
+
+=item *
+
+Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
+the encoding specified to utf8. In Perl 5.8.1 and later, literals in
+C<tr///> and C<DATA> pseudo-filehandle are also converted.
+
+=item *
+
+Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
+ specified.
+
+=back
+
+=head2 Literal Conversions
+
You can write code in EUC-JP as follows:
my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
s/\bCamel\b/$Rakuda/;
-The B<encoding> pragma also modifies the filehandle disciplines of
-STDIN, STDOUT, and STDERR to the specified encoding. Therefore,
+=head2 PerlIO layers for C<STD(IN|OUT)>
+
+The B<encoding> pragma also modifies the filehandle layers of
+STDIN and STDOUT to the specified encoding. Therefore,
use encoding "euc-jp";
my $message = "Camel is the symbol of perl.\n";
You can override this by giving extra arguments; see below.
+=head2 Implicit upgrading for byte strings
+
+By default, if strings operating under byte semantics and strings
+with Unicode character data are concatenated, the new string will
+be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
+
+The B<encoding> pragma changes this to use the specified encoding
+instead. For example:
+
+ use encoding 'utf8';
+ my $string = chr(20000); # a Unicode string
+ utf8::encode($string); # now it's a UTF-8 encoded byte string
+ # concatenate with another Unicode string
+ print length($string . chr(20000));
+
+Will print C<2>, because C<$string> is upgraded as UTF-8. Without
+C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
+is three octets when interpreted as Latin-1.
+
+=head1 FEATURES THAT REQUIRE 5.8.1
+
+Some of the features offered by this pragma requires perl 5.8.1. Most
+of these are done by Inaba Hiroto. Any other features and changes
+are good for 5.8.0.
+
+=over
+
+=item "NON-EUC" doublebyte encodings
+
+Because perl needs to parse script before applying this pragma, such
+encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
+\x5c) in the second byte fails because the second byte may
+accidentally escape the quoting character that follows. Perl 5.8.1
+or later fixes this problem.
+
+=item tr//
+
+C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
+See the section below for details.
+
+=item DATA pseudo-filehandle
+
+Another feature that was overlooked was C<DATA>.
+
+=back
+
=head1 USAGE
=over 4
=item use encoding [I<ENCNAME>] ;
-Sets the script encoding to I<ENCNAME>. Filehandle disciplines of
-STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR
-will not be changed.
+Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
+exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
+":encoding(I<ENCNAME>)".
+
+Note that STDERR WILL NOT be changed.
+
+Also note that non-STD file handles remain unaffected. Use C<use
+open> or C<binmode> to change layers of those.
If no encoding is specified, the environment variable L<PERL_ENCODING>
is consulted. If no encoding can be found, the error C<Unknown encoding
'I<ENCNAME>'> will be thrown.
-Note that non-STD file handles remain unaffected. Use C<use open> or
-C<binmode> to change disciplines of those.
-
=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
You can also individually set encodings of STDIN and STDOUT via the
first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
completely off.
+When ${^UNICODE} exists and non-zero, these options will completely
+ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See
+L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
+details (perl 5.8.1 and later).
+
+=item use encoding I<ENCNAME> Filter=E<gt>1;
+
+This turns the encoding pragma into a source filter. While the
+default approach just decodes interpolated literals (in qq() and
+qr()), this will apply a source filter to the entire source code. See
+L</"The Filter Option"> below for details.
+
=item no encoding;
-Unsets the script encoding. The disciplines of STDIN, STDOUT are
+Unsets the script encoding. The layers of STDIN, STDOUT are
reset to ":raw" (the default unprocessed raw stream of bytes).
=back
+=head1 The Filter Option
+
+The magic of C<use encoding> is not applied to the names of
+identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
+is a single Han ideograph) work, you still need to write your script
+in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
+
+What does this mean? Your source code behaves as if it is written in
+UTF-8 with 'use utf8' in effect. So even if your editor only supports
+Shift_JIS, for example, you can still try examples in Chapter 15 of
+C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
+identifiers.
+
+This option is significantly slower and (as of this writing) non-ASCII
+identifiers are not very stable WITHOUT this option and with the
+source code written in UTF-8.
+
+=head2 Filter-related changes at Encode version 1.87
+
+=over
+
+=item *
+
+The Filter option now sets STDIN and STDOUT like non-filter options.
+And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
+non-filter version.
+
+=item *
+
+C<use utf8> is implicitly declared so you no longer have to C<use
+utf8> to C<${"\x{4eba}"}++>.
+
+=back
+
=head1 CAVEATS
=head2 NOT SCOPED
B<use encoding> can appear as many times as you want in a given script.
The multiple use of this pragma is discouraged.
-Because of this nature, the use of this pragma inside the module is
-strongly discouraged (because the influence of this pragma lasts not
-only for the module but the script that uses). But if you have to,
-make sure you say C<no encoding> at the end of the module so you
-contain the influence of the pragma within the module.
+By the same reason, the use this pragma inside modules is also
+discouraged (though not as strongly discouraged as the case above.
+See below).
+
+If you still have to write a module with this pragma, be very careful
+of the load order. See the codes below;
+
+ # called module
+ package Module_IN_BAR;
+ use encoding "bar";
+ # stuff in "bar" encoding here
+ 1;
+
+ # caller script
+ use encoding "foo"
+ use Module_IN_BAR;
+ # surprise! use encoding "bar" is in effect.
+
+The best way to avoid this oddity is to use this pragma RIGHT AFTER
+other modules are loaded. i.e.
+
+ use Module_IN_BAR;
+ use encoding "foo";
=head2 DO NOT MIX MULTIPLE ENCODINGS
So feel free to put your strings in your encoding in quotes and
regexes.
-=head1 Non-ASCII Identifiers and Filter option
+=head2 tr/// with ranges
-The magic of C<use encoding> is not applied to the names of
-identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
-is a single Han ideograph) work, you still need to write your script
-in UTF-8 or use a source filter.
+The B<encoding> pragma works by decoding string literals in
+C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
+does not apply to C<tr///>. Therefore,
-In other words, the same restriction as with Jperl applies.
+ use encoding 'euc-jp';
+ #....
+ $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
+ # -------- -------- -------- --------
-If you dare to experiment, however, you can try the Filter option.
+Does not work as
-=over 4
+ $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
-=item use encoding I<ENCNAME> Filter=E<gt>1;
+=over
+
+=item Legend of characters above
-This turns the encoding pragma into a source filter. While the default
-approach just decodes interpolated literals (in qq() and qr()), this
-will apply a source filter to the entire source code. In this case,
-STDIN and STDOUT remain untouched.
+ utf8 euc-jp charnames::viacode()
+ -----------------------------------------
+ \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
+ \x{3093} \xA4\xF3 HIRAGANA LETTER N
+ \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
+ \x{30f3} \xA5\xF3 KATAKANA LETTER N
=back
-What does this mean? Your source code behaves as if it is written in
-UTF-8. So even if your editor only supports Shift_JIS, for example,
-you can still try examples in Chapter 15 of C<Programming Perl, 3rd
-Ed.>. For instance, you can use UTF-8 identifiers.
+This counterintuitive behavior has been fixed in perl 5.8.1.
-This option is significantly slower and (as of this writing) non-ASCII
-identifiers are not very stable WITHOUT this option and with the
-source code written in UTF-8.
+=head3 workaround to tr///;
+
+In perl 5.8.0, you can work around as follows;
+
+ use encoding 'euc-jp';
+ # ....
+ eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
+
+Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind
+is the same as classic idiom that makes C<tr///> 'interpolate'.
+
+ tr/$from/$to/; # wrong!
+ eval qq{ tr/$from/$to/ }; # workaround.
-To make your script in legacy encoding work with minimum effort,
-do not use Filter=E<gt>1.
+Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
+C<tr///> not being decoded was obviously against the will of Perl5
+Porters so it has been fixed in Perl 5.8.1 or later.
=head1 EXAMPLE - Greekperl
=head1 KNOWN PROBLEMS
+=over
+
+=item literals in regex that are longer than 127 bytes
+
For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
recoding errors for regular expression literals longer than 127 bytes.
+=item EBCDIC
+
The encoding pragma is not supported on EBCDIC platforms.
(Porters who are willing and able to remove this limitation are
welcome.)
+=item format
+
+This pragma doesn't work well with format because PerlIO does not
+get along very well with it. When format contains non-ascii
+characters it prints funny or gets "wide character warnings".
+To understand it, try the code below.
+
+ # Save this one in utf8
+ # replace *non-ascii* with a non-ascii string
+ my $camel;
+ format STDOUT =
+ *non-ascii*@>>>>>>>
+ $camel
+ .
+ $camel = "*non-ascii*";
+ binmode(STDOUT=>':encoding(utf8)'); # bang!
+ write; # funny
+ print $camel, "\n"; # fine
+
+Without binmode this happens to work but without binmode, print()
+fails instead of write().
+
+At any rate, the very use of format is questionable when it comes to
+unicode characters since you have to consider such things as character
+width (i.e. double-width for ideographs) and directions (i.e. BIDI for
+Arabic and Hebrew).
+
+=back
+
+=head2 The Logic of :locale
+
+The logic of C<:locale> is as follows:
+
+=over 4
+
+=item 1.
+
+If the platform supports the langinfo(CODESET) interface, the codeset
+returned is used as the default encoding for the open pragma.
+
+=item 2.
+
+If 1. didn't work but we are under the locale pragma, the environment
+variables LC_ALL and LANG (in that order) are matched for encodings
+(the part after C<.>, if any), and if any found, that is used
+as the default encoding for the open pragma.
+
+=item 3.
+
+If 1. and 2. didn't work, the environment variables LC_ALL and LANG
+(in that order) are matched for anything looking like UTF-8, and if
+any found, C<:utf8> is used as the default encoding for the open
+pragma.
+
+=back
+
+If your locale environment variables (LC_ALL, LC_CTYPE, LANG)
+contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
+the default encoding of your STDIN, STDOUT, and STDERR, and of
+B<any subsequent file open>, is UTF-8.
+
+=head1 HISTORY
+
+This pragma first appeared in Perl 5.8.0. For features that require
+5.8.1 and better, see above.
+
+The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
+
=head1 SEE ALSO
L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,