-# $Id: encoding.pm,v 2.2 2005/09/08 14:17:17 dankogai Exp dankogai $
+# $Id: encoding.pm,v 2.4 2006/06/03 20:28:48 dankogai Exp dankogai $
package encoding;
-our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use Encode;
use strict;
+use warnings;
sub DEBUG () { 0 }
BEGIN {
- if (ord("A") == 193) {
- require Carp;
- Carp::croak("encoding: pragma does not support EBCDIC platforms");
+ if ( ord("A") == 193 ) {
+ require Carp;
+ Carp::croak("encoding: pragma does not support EBCDIC platforms");
}
}
our $HAS_PERLIO = 0;
eval { require PerlIO::encoding };
-unless ($@){
- $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
+unless ($@) {
+ $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
}
-sub _exception{
+sub _exception {
my $name = shift;
- $] > 5.008 and return 0; # 5.8.1 or higher then no
- my %utfs = map {$_=>1}
- qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
- UTF-32 UTF-32BE UTF-32LE);
- $utfs{$name} or return 0; # UTFs or no
- require Config; Config->import(); our %Config;
- return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
+ $] > 5.008 and return 0; # 5.8.1 or higher then no
+ my %utfs = map { $_ => 1 }
+ qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
+ UTF-32 UTF-32BE UTF-32LE);
+ $utfs{$name} or return 0; # UTFs or no
+ require Config;
+ Config->import();
+ our %Config;
+ return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
}
-sub in_locale { $^H & ($locale::hint_bits || 0)}
+sub in_locale { $^H & ( $locale::hint_bits || 0 ) }
sub _get_locale_encoding {
my $locale_encoding;
# I18N::Langinfo isn't available everywhere
eval {
- require I18N::Langinfo;
- I18N::Langinfo->import(qw(langinfo CODESET));
- $locale_encoding = langinfo(CODESET());
+ require I18N::Langinfo;
+ I18N::Langinfo->import(qw(langinfo CODESET));
+ $locale_encoding = langinfo( CODESET() );
};
-
+
my $country_language;
no warnings 'uninitialized';
- if (not $locale_encoding && in_locale()) {
- if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) {
- ($country_language, $locale_encoding) = ($1, $2);
- } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) {
- ($country_language, $locale_encoding) = ($1, $2);
- }
- # LANGUAGE affects only LC_MESSAGES only on glibc
- } elsif (not $locale_encoding) {
- if ($ENV{LC_ALL} =~ /\butf-?8\b/i ||
- $ENV{LANG} =~ /\butf-?8\b/i) {
- $locale_encoding = 'utf8';
- }
- # Could do more heuristics based on the country and language
- # parts of LC_ALL and LANG (the parts before the dot (if any)),
- # since we have Locale::Country and Locale::Language available.
- # TODO: get a database of Language -> Encoding mappings
- # (the Estonian database at http://www.eki.ee/letter/
- # would be excellent!) --jhi
+ if ( not $locale_encoding && in_locale() ) {
+ if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/ ) {
+ ( $country_language, $locale_encoding ) = ( $1, $2 );
+ }
+ elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.]+)$/ ) {
+ ( $country_language, $locale_encoding ) = ( $1, $2 );
+ }
+
+ # LANGUAGE affects only LC_MESSAGES only on glibc
}
- if (defined $locale_encoding &&
- lc($locale_encoding) eq 'euc' &&
- defined $country_language) {
- if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) {
- $locale_encoding = 'euc-jp';
- } elsif ($country_language =~ /^ko_KR|korean?$/i) {
- $locale_encoding = 'euc-kr';
- } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) {
- $locale_encoding = 'euc-cn';
- } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) {
- $locale_encoding = 'euc-tw';
- } else {
- require Carp;
- Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous");
- }
+ elsif ( not $locale_encoding ) {
+ if ( $ENV{LC_ALL} =~ /\butf-?8\b/i
+ || $ENV{LANG} =~ /\butf-?8\b/i )
+ {
+ $locale_encoding = 'utf8';
+ }
+
+ # Could do more heuristics based on the country and language
+ # parts of LC_ALL and LANG (the parts before the dot (if any)),
+ # since we have Locale::Country and Locale::Language available.
+ # TODO: get a database of Language -> Encoding mappings
+ # (the Estonian database at http://www.eki.ee/letter/
+ # would be excellent!) --jhi
+ }
+ if ( defined $locale_encoding
+ && lc($locale_encoding) eq 'euc'
+ && defined $country_language )
+ {
+ if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
+ $locale_encoding = 'euc-jp';
+ }
+ elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
+ $locale_encoding = 'euc-kr';
+ }
+ elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
+ $locale_encoding = 'euc-cn';
+ }
+ elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
+ $locale_encoding = 'euc-tw';
+ }
+ else {
+ require Carp;
+ Carp::croak(
+ "encoding: Locale encoding '$locale_encoding' too ambiguous"
+ );
+ }
}
return $locale_encoding;
sub import {
my $class = shift;
my $name = shift;
- if ($name eq ':_get_locale_encoding') { # used by lib/open.pm
- my $caller = caller();
+ if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm
+ my $caller = caller();
{
- no strict 'refs';
- *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
- }
- return;
+ no strict 'refs';
+ *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
+ }
+ return;
}
$name = _get_locale_encoding() if $name eq ':locale';
my %arg = @_;
$name = $ENV{PERL_ENCODING} unless defined $name;
my $enc = find_encoding($name);
- unless (defined $enc) {
- require Carp;
- Carp::croak("encoding: Unknown encoding '$name'");
+ unless ( defined $enc ) {
+ require Carp;
+ Carp::croak("encoding: Unknown encoding '$name'");
+ }
+ $name = $enc->name; # canonize
+ unless ( $arg{Filter} ) {
+ DEBUG and warn "_exception($name) = ", _exception($name);
+ _exception($name) or ${^ENCODING} = $enc;
+ $HAS_PERLIO or return 1;
}
- $name = $enc->name; # canonize
- unless ($arg{Filter}) {
- DEBUG and warn "_exception($name) = ", _exception($name);
- _exception($name) or ${^ENCODING} = $enc;
- $HAS_PERLIO or return 1;
- }else{
- defined(${^ENCODING}) and undef ${^ENCODING};
- # implicitly 'use utf8'
- require utf8; # to fetch $utf8::hint_bits;
- $^H |= $utf8::hint_bits;
- eval {
- require Filter::Util::Call ;
- Filter::Util::Call->import ;
- filter_add(sub{
- my $status = filter_read();
- if ($status > 0){
- $_ = $enc->decode($_, 1);
- DEBUG and warn $_;
- }
- $status ;
- });
- };
+ else {
+ defined( ${^ENCODING} ) and undef ${^ENCODING};
+
+ # implicitly 'use utf8'
+ require utf8; # to fetch $utf8::hint_bits;
+ $^H |= $utf8::hint_bits;
+ eval {
+ require Filter::Util::Call;
+ Filter::Util::Call->import;
+ filter_add(
+ sub {
+ my $status = filter_read();
+ if ( $status > 0 ) {
+ $_ = $enc->decode( $_, 1 );
+ DEBUG and warn $_;
+ }
+ $status;
+ }
+ );
+ };
$@ eq '' and DEBUG and warn "Filter installed";
}
defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
- for my $h (qw(STDIN STDOUT)){
- if ($arg{$h}){
- unless (defined find_encoding($arg{$h})) {
- require Carp;
- Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'");
- }
- eval { binmode($h, ":raw :encoding($arg{$h})") };
- }else{
- unless (exists $arg{$h}){
- eval {
- no warnings 'uninitialized';
- binmode($h, ":raw :encoding($name)");
- };
- }
- }
- if ($@){
- require Carp;
- Carp::croak($@);
- }
+ for my $h (qw(STDIN STDOUT)) {
+ if ( $arg{$h} ) {
+ unless ( defined find_encoding( $arg{$h} ) ) {
+ require Carp;
+ Carp::croak(
+ "encoding: Unknown encoding for $h, '$arg{$h}'");
+ }
+ eval { binmode( $h, ":raw :encoding($arg{$h})" ) };
+ }
+ else {
+ unless ( exists $arg{$h} ) {
+ eval {
+ no warnings 'uninitialized';
+ binmode( $h, ":raw :encoding($name)" );
+ };
+ }
+ }
+ if ($@) {
+ require Carp;
+ Carp::croak($@);
+ }
}
- return 1; # I doubt if we need it, though
+ return 1; # I doubt if we need it, though
}
-sub unimport{
+sub unimport {
no warnings;
undef ${^ENCODING};
- if ($HAS_PERLIO){
- binmode(STDIN, ":raw");
- binmode(STDOUT, ":raw");
- }else{
- binmode(STDIN);
- binmode(STDOUT);
+ if ($HAS_PERLIO) {
+ binmode( STDIN, ":raw" );
+ binmode( STDOUT, ":raw" );
}
- if ($INC{"Filter/Util/Call.pm"}){
- eval { filter_del() };
+ else {
+ binmode(STDIN);
+ binmode(STDOUT);
+ }
+ if ( $INC{"Filter/Util/Call.pm"} ) {
+ eval { filter_del() };
}
}
C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
is three octets when interpreted as Latin-1.
+=head2 Side effects
+
+If the C<encoding> pragma is in scope then the lengths returned are
+calculated from the length of C<$/> in Unicode characters, which is not
+always the same as the length of C<$/> in the native encoding.
+
+This pragma affects utf8::upgrade, but not utf8::downgrade.
+
=head1 FEATURES THAT REQUIRE 5.8.1
Some of the features offered by this pragma requires perl 5.8.1. Most
Notice that only literals (string or regular expression) having only
legacy code points are affected: if you mix data like this
- \xDF\x{100}
+ \xDF\x{100}
the data is assumed to be in (Latin 1 and) Unicode, not in your native
encoding. In other words, this will match in "greek":
- "\xDF" =~ /\x{3af}/
+ "\xDF" =~ /\x{3af}/
but this will not
- "\xDF\x{100}" =~ /\x{3af}\x{100}/
+ "\xDF\x{100}" =~ /\x{3af}\x{100}/
since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL