From: Jarkko Hietaniemi Date: Mon, 16 Aug 2004 22:27:00 +0000 (+0300) Subject: Re-apply the encoding.pm part of: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b1aeb384e13509b880aa9ad8303863293c7f9b87;p=p5sagit%2Fp5-mst-13.2.git Re-apply the encoding.pm part of: Subject: [PATCH] encoding and open pragmas Message-ID: <41210A84.6060506@iki.fi> p4raw-id: //depot/perl@23381 --- diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index d1181ff..b398301 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,15 +1,16 @@ -# $Id: encoding.pm,v 2.0 2004/05/16 20:55:16 dankogai Exp $ +# $Id: encoding.pm,v 2.01 2004/05/16 20:55:16 dankogai Exp $ package encoding; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.01 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use strict; + sub DEBUG () { 0 } BEGIN { if (ord("A") == 193) { require Carp; - Carp::croak("encoding pragma does not support EBCDIC platforms"); + Carp::croak("encoding: pragma does not support EBCDIC platforms"); } } @@ -30,15 +31,79 @@ sub _exception{ return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no } +sub in_locale { $^H & ($locale::hint_bits || 0)} + +sub _get_locale_encoding { + my $locale_encoding; + + # I18N::Langinfo isn't available everywhere + eval { + require I18N::Langinfo; + I18N::Langinfo->import(qw(langinfo CODESET)); + $locale_encoding = langinfo(CODESET()); + }; + + my $country_language; + + no warnings 'uninitialized'; + + if (not $locale_encoding && in_locale()) { + if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) { + ($country_language, $locale_encoding) = ($1, $2); + } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) { + ($country_language, $locale_encoding) = ($1, $2); + } + # LANGUAGE affects only LC_MESSAGES only on glibc + } elsif (not $locale_encoding) { + if ($ENV{LC_ALL} =~ /\butf-?8\b/i || + $ENV{LANG} =~ /\butf-?8\b/i) { + $locale_encoding = 'utf8'; + } + # Could do more heuristics based on the country and language + # parts of LC_ALL and LANG (the parts before the dot (if any)), + # since we have Locale::Country and Locale::Language available. + # TODO: get a database of Language -> Encoding mappings + # (the Estonian database at http://www.eki.ee/letter/ + # would be excellent!) --jhi + } + if (defined $locale_encoding && + lc($locale_encoding) eq 'euc' && + defined $country_language) { + if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) { + $locale_encoding = 'euc-jp'; + } elsif ($country_language =~ /^ko_KR|korean?$/i) { + $locale_encoding = 'euc-kr'; + } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) { + $locale_encoding = 'euc-cn'; + } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) { + $locale_encoding = 'euc-tw'; + } else { + require Carp; + Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous"); + } + } + + return $locale_encoding; +} + sub import { my $class = shift; my $name = shift; + if ($name eq ':_get_locale_encoding') { # used by lib/open.pm + my $caller = caller(); + { + no strict 'refs'; + *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; + } + return; + } + $name = _get_locale_encoding() if $name eq ':locale'; my %arg = @_; - $name ||= $ENV{PERL_ENCODING}; + $name = $ENV{PERL_ENCODING} unless defined $name; my $enc = find_encoding($name); unless (defined $enc) { require Carp; - Carp::croak("Unknown encoding '$name'"); + Carp::croak("encoding: Unknown encoding '$name'"); } $name = $enc->name; # canonize unless ($arg{Filter}) { @@ -62,13 +127,14 @@ sub import { $status ; }); }; - } DEBUG and warn "Filter installed"; + $@ == '' and DEBUG and warn "Filter installed"; + } defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; for my $h (qw(STDIN STDOUT)){ if ($arg{$h}){ unless (defined find_encoding($arg{$h})) { require Carp; - Carp::croak("Unknown encoding for $h, '$arg{$h}'"); + Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'"); } eval { binmode($h, ":raw :encoding($arg{$h})") }; }else{ @@ -133,6 +199,14 @@ encoding - allows you to write your script in non-ascii or non-utf8 use encoding "euc-jp", Filter=>1; # now you can use kanji identifiers -- in euc-jp! + # switch on locale - + # note that this probably means that unless you have a complete control + # over the environments the application is ever going to be run, you should + # NOT use the feature of encoding pragma allowing you to write your script + # in any recognized encoding because changing locale settings will wreck + # the script; you can of course still use the other features of the pragma. + use encoding ':locale'; + =head1 ABSTRACT Let's start with a bit of history: Perl 5.6.0 introduced Unicode @@ -510,11 +584,45 @@ Arabic and Hebrew). =back +=head2 The Logic of :locale + +The logic of C<:locale> is as follows: + +=over 4 + +=item 1. + +If the platform supports the langinfo(CODESET) interface, the codeset +returned is used as the default encoding for the open pragma. + +=item 2. + +If 1. didn't work but we are under the locale pragma, the environment +variables LC_ALL and LANG (in that order) are matched for encodings +(the part after C<.>, if any), and if any found, that is used +as the default encoding for the open pragma. + +=item 3. + +If 1. and 2. didn't work, the environment variables LC_ALL and LANG +(in that order) are matched for anything looking like UTF-8, and if +any found, C<:utf8> is used as the default encoding for the open +pragma. + +=back + +If your locale environment variables (LC_ALL, LC_CTYPE, LANG) +contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), +the default encoding of your STDIN, STDOUT, and STDERR, and of +B, is UTF-8. + =head1 HISTORY This pragma first appeared in Perl 5.8.0. For features that require 5.8.1 and better, see above. +The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. + =head1 SEE ALSO L, L, L, L,