X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2Fencoding.pm;h=77ba4472a23a3cfcbc3bb692953a3dff9e4c1a78;hb=4ac71550d23cca4632a2bcdfcb1d83a6bf705e45;hp=b39830151385a8fdd2f89872e355f213b0b3ee08;hpb=b1aeb384e13509b880aa9ad8303863293c7f9b87;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index b398301..77ba447 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,86 +1,101 @@ -# $Id: encoding.pm,v 2.01 2004/05/16 20:55:16 dankogai Exp $ +# $Id: encoding.pm,v 2.7 2008/03/12 09:51:11 dankogai Exp $ package encoding; -our $VERSION = do { my @r = (q$Revision: 2.01 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = '2.6_01'; use Encode; use strict; +use warnings; sub DEBUG () { 0 } BEGIN { - if (ord("A") == 193) { - require Carp; - Carp::croak("encoding: pragma does not support EBCDIC platforms"); + if ( ord("A") == 193 ) { + require Carp; + Carp::croak("encoding: pragma does not support EBCDIC platforms"); } } our $HAS_PERLIO = 0; eval { require PerlIO::encoding }; -unless ($@){ - $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02); +unless ($@) { + $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ); } -sub _exception{ +sub _exception { my $name = shift; - $] > 5.008 and return 0; # 5.8.1 or higher then no - my %utfs = map {$_=>1} - qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE - UTF-32 UTF-32BE UTF-32LE); - $utfs{$name} or return 0; # UTFs or no - require Config; Config->import(); our %Config; - return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no + $] > 5.008 and return 0; # 5.8.1 or higher then no + my %utfs = map { $_ => 1 } + qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE + UTF-32 UTF-32BE UTF-32LE); + $utfs{$name} or return 0; # UTFs or no + require Config; + Config->import(); + our %Config; + return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no } -sub in_locale { $^H & ($locale::hint_bits || 0)} +sub in_locale { $^H & ( $locale::hint_bits || 0 ) } sub _get_locale_encoding { my $locale_encoding; # I18N::Langinfo isn't available everywhere eval { - require I18N::Langinfo; - I18N::Langinfo->import(qw(langinfo CODESET)); - $locale_encoding = langinfo(CODESET()); + require I18N::Langinfo; + I18N::Langinfo->import(qw(langinfo CODESET)); + $locale_encoding = langinfo( CODESET() ); }; - + my $country_language; no warnings 'uninitialized'; - if (not $locale_encoding && in_locale()) { - if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) { - ($country_language, $locale_encoding) = ($1, $2); - } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) { - ($country_language, $locale_encoding) = ($1, $2); - } - # LANGUAGE affects only LC_MESSAGES only on glibc - } elsif (not $locale_encoding) { - if ($ENV{LC_ALL} =~ /\butf-?8\b/i || - $ENV{LANG} =~ /\butf-?8\b/i) { - $locale_encoding = 'utf8'; - } - # Could do more heuristics based on the country and language - # parts of LC_ALL and LANG (the parts before the dot (if any)), - # since we have Locale::Country and Locale::Language available. - # TODO: get a database of Language -> Encoding mappings - # (the Estonian database at http://www.eki.ee/letter/ - # would be excellent!) --jhi + if ( (not $locale_encoding) && in_locale() ) { + if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { + ( $country_language, $locale_encoding ) = ( $1, $2 ); + } + elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { + ( $country_language, $locale_encoding ) = ( $1, $2 ); + } + + # LANGUAGE affects only LC_MESSAGES only on glibc + } + elsif ( not $locale_encoding ) { + if ( $ENV{LC_ALL} =~ /\butf-?8\b/i + || $ENV{LANG} =~ /\butf-?8\b/i ) + { + $locale_encoding = 'utf8'; + } + + # Could do more heuristics based on the country and language + # parts of LC_ALL and LANG (the parts before the dot (if any)), + # since we have Locale::Country and Locale::Language available. + # TODO: get a database of Language -> Encoding mappings + # (the Estonian database at http://www.eki.ee/letter/ + # would be excellent!) --jhi } - if (defined $locale_encoding && - lc($locale_encoding) eq 'euc' && - defined $country_language) { - if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) { - $locale_encoding = 'euc-jp'; - } elsif ($country_language =~ /^ko_KR|korean?$/i) { - $locale_encoding = 'euc-kr'; - } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) { - $locale_encoding = 'euc-cn'; - } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) { - $locale_encoding = 'euc-tw'; - } else { - require Carp; - Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous"); - } + if ( defined $locale_encoding + && lc($locale_encoding) eq 'euc' + && defined $country_language ) + { + if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { + $locale_encoding = 'euc-jp'; + } + elsif ( $country_language =~ /^ko_KR|korean?$/i ) { + $locale_encoding = 'euc-kr'; + } + elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) { + $locale_encoding = 'euc-cn'; + } + elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { + $locale_encoding = 'euc-tw'; + } + else { + require Carp; + Carp::croak( + "encoding: Locale encoding '$locale_encoding' too ambiguous" + ); + } } return $locale_encoding; @@ -89,82 +104,89 @@ sub _get_locale_encoding { sub import { my $class = shift; my $name = shift; - if ($name eq ':_get_locale_encoding') { # used by lib/open.pm - my $caller = caller(); + if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm + my $caller = caller(); { - no strict 'refs'; - *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; - } - return; + no strict 'refs'; + *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; + } + return; } $name = _get_locale_encoding() if $name eq ':locale'; my %arg = @_; $name = $ENV{PERL_ENCODING} unless defined $name; my $enc = find_encoding($name); - unless (defined $enc) { - require Carp; - Carp::croak("encoding: Unknown encoding '$name'"); + unless ( defined $enc ) { + require Carp; + Carp::croak("encoding: Unknown encoding '$name'"); + } + $name = $enc->name; # canonize + unless ( $arg{Filter} ) { + DEBUG and warn "_exception($name) = ", _exception($name); + _exception($name) or ${^ENCODING} = $enc; + $HAS_PERLIO or return 1; } - $name = $enc->name; # canonize - unless ($arg{Filter}) { - DEBUG and warn "_exception($name) = ", _exception($name); - _exception($name) or ${^ENCODING} = $enc; - $HAS_PERLIO or return 1; - }else{ - defined(${^ENCODING}) and undef ${^ENCODING}; - # implicitly 'use utf8' - require utf8; # to fetch $utf8::hint_bits; - $^H |= $utf8::hint_bits; - eval { - require Filter::Util::Call ; - Filter::Util::Call->import ; - filter_add(sub{ - my $status = filter_read(); - if ($status > 0){ - $_ = $enc->decode($_, 1); - DEBUG and warn $_; - } - $status ; - }); - }; - $@ == '' and DEBUG and warn "Filter installed"; + else { + defined( ${^ENCODING} ) and undef ${^ENCODING}; + + # implicitly 'use utf8' + require utf8; # to fetch $utf8::hint_bits; + $^H |= $utf8::hint_bits; + eval { + require Filter::Util::Call; + Filter::Util::Call->import; + filter_add( + sub { + my $status = filter_read(); + if ( $status > 0 ) { + $_ = $enc->decode( $_, 1 ); + DEBUG and warn $_; + } + $status; + } + ); + }; + $@ eq '' and DEBUG and warn "Filter installed"; } defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; - for my $h (qw(STDIN STDOUT)){ - if ($arg{$h}){ - unless (defined find_encoding($arg{$h})) { - require Carp; - Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'"); - } - eval { binmode($h, ":raw :encoding($arg{$h})") }; - }else{ - unless (exists $arg{$h}){ - eval { - no warnings 'uninitialized'; - binmode($h, ":raw :encoding($name)"); - }; - } - } - if ($@){ - require Carp; - Carp::croak($@); - } + for my $h (qw(STDIN STDOUT)) { + if ( $arg{$h} ) { + unless ( defined find_encoding( $arg{$h} ) ) { + require Carp; + Carp::croak( + "encoding: Unknown encoding for $h, '$arg{$h}'"); + } + eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; + } + else { + unless ( exists $arg{$h} ) { + eval { + no warnings 'uninitialized'; + binmode( $h, ":raw :encoding($name)" ); + }; + } + } + if ($@) { + require Carp; + Carp::croak($@); + } } - return 1; # I doubt if we need it, though + return 1; # I doubt if we need it, though } -sub unimport{ +sub unimport { no warnings; undef ${^ENCODING}; - if ($HAS_PERLIO){ - binmode(STDIN, ":raw"); - binmode(STDOUT, ":raw"); - }else{ - binmode(STDIN); - binmode(STDOUT); + if ($HAS_PERLIO) { + binmode( STDIN, ":raw" ); + binmode( STDOUT, ":raw" ); + } + else { + binmode(STDIN); + binmode(STDOUT); } - if ($INC{"Filter/Util/Call.pm"}){ - eval { filter_del() }; + if ( $INC{"Filter/Util/Call.pm"} ) { + eval { filter_del() }; } } @@ -285,6 +307,14 @@ Will print C<2>, because C<$string> is upgraded as UTF-8. Without C, it will print C<4> instead, since C<$string> is three octets when interpreted as Latin-1. +=head2 Side effects + +If the C pragma is in scope then the lengths returned are +calculated from the length of C<$/> in Unicode characters, which is not +always the same as the length of C<$/> in the native encoding. + +This pragma affects utf8::upgrade, but not utf8::downgrade. + =head1 FEATURES THAT REQUIRE 5.8.1 Some of the features offered by this pragma requires perl 5.8.1. Most @@ -402,7 +432,7 @@ B can appear as many times as you want in a given script. The multiple use of this pragma is discouraged. By the same reason, the use this pragma inside modules is also -discouraged (though not as strongly discouranged as the case above. +discouraged (though not as strongly discouraged as the case above. See below). If you still have to write a module with this pragma, be very careful @@ -430,16 +460,16 @@ other modules are loaded. i.e. Notice that only literals (string or regular expression) having only legacy code points are affected: if you mix data like this - \xDF\x{100} + \xDF\x{100} the data is assumed to be in (Latin 1 and) Unicode, not in your native encoding. In other words, this will match in "greek": - "\xDF" =~ /\x{3af}/ + "\xDF" =~ /\x{3af}/ but this will not - "\xDF\x{100}" =~ /\x{3af}\x{100}/ + "\xDF\x{100}" =~ /\x{3af}\x{100}/ since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on the left will B be upgraded to C<\x{3af}> (Unicode GREEK SMALL @@ -582,6 +612,11 @@ unicode characters since you have to consider such things as character width (i.e. double-width for ideographs) and directions (i.e. BIDI for Arabic and Hebrew). +=item Thread safety + +C is not thread-safe (i.e., do not use in threaded +applications). + =back =head2 The Logic of :locale