From: Marcus Holland-Moritz Date: Mon, 18 Oct 2004 21:00:26 +0000 (+0000) Subject: Upgrade to Encode 2.04. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=8f1ed24a4d3af53304bd3890a619bde305168316;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 2.04. p4raw-id: //depot/perl@23380 --- diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS index c0e1df6..a40701f 100644 --- a/ext/Encode/AUTHORS +++ b/ext/Encode/AUTHORS @@ -45,6 +45,7 @@ SUGAWARA Hajime SUZUKI Norio Simon Cozens Spider Boardman +Steve Hay Tatsuhiko Miyagawa Vadim Konovalov Yitzchak Scott-Thoennes diff --git a/ext/Encode/Changes b/ext/Encode/Changes index b6b6041..9f15ee5 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,17 +1,35 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 2.2 2004/08/31 10:55:34 dankogai Exp dankogai $ +# $Id: Changes,v 2.4 2004/10/16 21:22:44 dankogai Exp dankogai $ # -$Revision: 2.2 $ $Date: 2004/08/31 10:55:34 $ - ucm/big5-hkscs.ucm AUTHORS t/big5-hkscs.enc t/big5-hkscs.utf +$Revision: 2.4 $ $Date: 2004/10/16 21:22:44 $ +! Makefle.PL + From: craigberry@mac.com + Subject: [PATCH ext/Encode/Makefile.PL] make Encode.c dependency explicit + Message-Id: <41716868.7000102@mac.com> + +2.03 2004/10/06 05:07:20 +! lib/Encode/Alias.pm + Resolved some alias case sensitivity glitches reported via RT. + http://rt.cpan.org/NoAuth/Bug.html?id=7835 +! bin/piconv + Resolved Win32 glitches reported via RT. + (Fixed by dankogai and tested by Steve Hay) + http://rt.cpan.org/Ticket/Display.html?id=7831 +! JP/JP.pm lib/Encode/Alias.pm lib/Encode/Supported.pod AUTHORS + /\bwindows-31j$/i is now an alias of CP932, by Steve Hay. + http://rt.cpan.org/NoAuth/Bug.html?id=6695 + +2.02 2004/08/31 10:55:34 +! ucm/big5-hkscs.ucm AUTHORS t/big5-hkscs.enc t/big5-hkscs.utf New map submitted by Deng Liu and Autrijus. Test data needed to be upgrade as well, done by dankogai Message-Id: <20040824204828.GB6999@aut.dyndns.org> - bin/ucmsort +! bin/ucmsort Now works for characters U+10000 and above. This fix was needed to "tidy" the original map that was submitted. - bin/enc2xs +! bin/enc2xs "ucmsort" now mentioned in pod 2.01 2004/05/25 16:27:14 diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 616f50c..ab80547 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,9 +1,9 @@ # -# $Id: Encode.pm,v 2.2 2004/08/31 10:52:11 dankogai Exp $ +# $Id: Encode.pm,v 2.4 2004/10/16 21:22:31 dankogai Exp $ # package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; sub DEBUG () { 0 } use XSLoader (); XSLoader::load(__PACKAGE__, $VERSION); diff --git a/ext/Encode/JP/JP.pm b/ext/Encode/JP/JP.pm index 696f928..01ad37f 100644 --- a/ext/Encode/JP/JP.pm +++ b/ext/Encode/JP/JP.pm @@ -5,7 +5,7 @@ BEGIN { } } use Encode; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use XSLoader; XSLoader::load(__PACKAGE__,$VERSION); @@ -45,7 +45,7 @@ supported are as follows. = ISO-2022-JP with JIS X 0212-1990 support. See below MacJapanese Shift JIS + Apple vendor mappings - cp932 Code Page 932 + cp932 /\bwindows-31j$/i Code Page 932 = Shift JIS + MS/IBM vendor mappings jis0201-raw JIS0201, raw format jis0208-raw JIS0201, raw format diff --git a/ext/Encode/META.yml b/ext/Encode/META.yml index 9a3546e..b6a6513 100644 --- a/ext/Encode/META.yml +++ b/ext/Encode/META.yml @@ -1,7 +1,7 @@ # http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: Encode -version: 2.02 +version: 2.04 version_from: Encode.pm installdirs: perl requires: diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL index 83e4e64..58b307b 100644 --- a/ext/Encode/Makefile.PL +++ b/ext/Encode/Makefile.PL @@ -83,8 +83,10 @@ sub postamble { my $self = shift; my $dir = $self->catdir($self->curdir,'ucm'); - my $str = "# Encode\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n"; + my $str = "# Encode\$(OBJ_EXT) does not depend on .c files directly\n"; + $str .= "# (except Encode.c), but on .h and .exh files written by enc2xs\n"; $str .= $^O eq 'MacOS' ? 'Encode.c.{$(MACPERL_BUILD_EXT_STATIC)}.o :' : 'Encode$(OBJ_EXT) :'; + $str .= ' Encode.c'; foreach my $table (keys %tables) { $str .= " $table.c"; diff --git a/ext/Encode/bin/piconv b/ext/Encode/bin/piconv index fd546f0..cb0c236 100644 --- a/ext/Encode/bin/piconv +++ b/ext/Encode/bin/piconv @@ -1,5 +1,5 @@ #!./perl -# $Id: piconv,v 2.0 2004/05/16 20:55:16 dankogai Exp $ +# $Id: piconv,v 2.1 2004/10/06 05:07:20 dankogai Exp $ # use 5.8.0; use strict; @@ -52,25 +52,39 @@ To: $to => $cto EOT } -# default -if ($scheme eq 'from_to'){ - while(<>){ - Encode::from_to($_, $from, $to, $Opt{check}); print; - }; -# step-by-step -}elsif ($scheme eq 'decode_encode'){ - while(<>){ - my $decoded = decode($from, $_, $Opt{check}); - my $encoded = encode($to, $decoded); - print $encoded; - }; -# NI-S favorite -}elsif ($scheme eq 'perlio'){ - binmode(STDIN, ":encoding($from)"); - binmode(STDOUT, ":encoding($to)"); - while(<>){ print; } -} else { # won't reach - die "$name: unknown scheme: $scheme"; +# we do not use <> (or ARGV) for the sake of binmode() +@ARGV or push @ARGV, \*STDIN; + +unless ($scheme eq 'perlio'){ + binmode STDOUT; + for my $argv (@ARGV){ + my $ifh = ref $argv ? $argv : undef; + $ifh or open $ifh, "<", $argv or next; + binmode $ifh; + if ($scheme eq 'from_to'){ # default + while(<$ifh>){ + Encode::from_to($_, $from, $to, $Opt{check}); + print; + } + }elsif ($scheme eq 'decode_encode'){ # step-by-step + while(<$ifh>){ + my $decoded = decode($from, $_, $Opt{check}); + my $encoded = encode($to, $decoded); + print $encoded; + } + } else { # won't reach + die "$name: unknown scheme: $scheme"; + } + } +}else{ + # NI-S favorite + binmode STDOUT => "raw:encoding($to)"; + for my $argv (@ARGV){ + my $ifh = ref $argv ? $argv : undef; + $ifh or open $ifh, "<", $argv or next; + binmode $ifh => "raw:encoding($from)"; + print while(<$ifh>); + } } sub list_encodings{ diff --git a/ext/Encode/bin/ucmsort b/ext/Encode/bin/ucmsort index 63d4952..a67ee8e 100644 --- a/ext/Encode/bin/ucmsort +++ b/ext/Encode/bin/ucmsort @@ -1,6 +1,6 @@ #!/usr/local/bin/perl # -# $Id: ucmsort,v 2.1 2004/08/31 10:55:34 dankogai Exp dankogai $ +# $Id: ucmsort,v 2.1 2004/08/31 10:55:34 dankogai Exp $ # use strict; my @lines; diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index b398301..d1181ff 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,16 +1,15 @@ -# $Id: encoding.pm,v 2.01 2004/05/16 20:55:16 dankogai Exp $ +# $Id: encoding.pm,v 2.0 2004/05/16 20:55:16 dankogai Exp $ package encoding; -our $VERSION = do { my @r = (q$Revision: 2.01 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use strict; - sub DEBUG () { 0 } BEGIN { if (ord("A") == 193) { require Carp; - Carp::croak("encoding: pragma does not support EBCDIC platforms"); + Carp::croak("encoding pragma does not support EBCDIC platforms"); } } @@ -31,79 +30,15 @@ sub _exception{ return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no } -sub in_locale { $^H & ($locale::hint_bits || 0)} - -sub _get_locale_encoding { - my $locale_encoding; - - # I18N::Langinfo isn't available everywhere - eval { - require I18N::Langinfo; - I18N::Langinfo->import(qw(langinfo CODESET)); - $locale_encoding = langinfo(CODESET()); - }; - - my $country_language; - - no warnings 'uninitialized'; - - if (not $locale_encoding && in_locale()) { - if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) { - ($country_language, $locale_encoding) = ($1, $2); - } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) { - ($country_language, $locale_encoding) = ($1, $2); - } - # LANGUAGE affects only LC_MESSAGES only on glibc - } elsif (not $locale_encoding) { - if ($ENV{LC_ALL} =~ /\butf-?8\b/i || - $ENV{LANG} =~ /\butf-?8\b/i) { - $locale_encoding = 'utf8'; - } - # Could do more heuristics based on the country and language - # parts of LC_ALL and LANG (the parts before the dot (if any)), - # since we have Locale::Country and Locale::Language available. - # TODO: get a database of Language -> Encoding mappings - # (the Estonian database at http://www.eki.ee/letter/ - # would be excellent!) --jhi - } - if (defined $locale_encoding && - lc($locale_encoding) eq 'euc' && - defined $country_language) { - if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) { - $locale_encoding = 'euc-jp'; - } elsif ($country_language =~ /^ko_KR|korean?$/i) { - $locale_encoding = 'euc-kr'; - } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) { - $locale_encoding = 'euc-cn'; - } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) { - $locale_encoding = 'euc-tw'; - } else { - require Carp; - Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous"); - } - } - - return $locale_encoding; -} - sub import { my $class = shift; my $name = shift; - if ($name eq ':_get_locale_encoding') { # used by lib/open.pm - my $caller = caller(); - { - no strict 'refs'; - *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; - } - return; - } - $name = _get_locale_encoding() if $name eq ':locale'; my %arg = @_; - $name = $ENV{PERL_ENCODING} unless defined $name; + $name ||= $ENV{PERL_ENCODING}; my $enc = find_encoding($name); unless (defined $enc) { require Carp; - Carp::croak("encoding: Unknown encoding '$name'"); + Carp::croak("Unknown encoding '$name'"); } $name = $enc->name; # canonize unless ($arg{Filter}) { @@ -127,14 +62,13 @@ sub import { $status ; }); }; - $@ == '' and DEBUG and warn "Filter installed"; - } + } DEBUG and warn "Filter installed"; defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; for my $h (qw(STDIN STDOUT)){ if ($arg{$h}){ unless (defined find_encoding($arg{$h})) { require Carp; - Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'"); + Carp::croak("Unknown encoding for $h, '$arg{$h}'"); } eval { binmode($h, ":raw :encoding($arg{$h})") }; }else{ @@ -199,14 +133,6 @@ encoding - allows you to write your script in non-ascii or non-utf8 use encoding "euc-jp", Filter=>1; # now you can use kanji identifiers -- in euc-jp! - # switch on locale - - # note that this probably means that unless you have a complete control - # over the environments the application is ever going to be run, you should - # NOT use the feature of encoding pragma allowing you to write your script - # in any recognized encoding because changing locale settings will wreck - # the script; you can of course still use the other features of the pragma. - use encoding ':locale'; - =head1 ABSTRACT Let's start with a bit of history: Perl 5.6.0 introduced Unicode @@ -584,45 +510,11 @@ Arabic and Hebrew). =back -=head2 The Logic of :locale - -The logic of C<:locale> is as follows: - -=over 4 - -=item 1. - -If the platform supports the langinfo(CODESET) interface, the codeset -returned is used as the default encoding for the open pragma. - -=item 2. - -If 1. didn't work but we are under the locale pragma, the environment -variables LC_ALL and LANG (in that order) are matched for encodings -(the part after C<.>, if any), and if any found, that is used -as the default encoding for the open pragma. - -=item 3. - -If 1. and 2. didn't work, the environment variables LC_ALL and LANG -(in that order) are matched for anything looking like UTF-8, and if -any found, C<:utf8> is used as the default encoding for the open -pragma. - -=back - -If your locale environment variables (LC_ALL, LC_CTYPE, LANG) -contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), -the default encoding of your STDIN, STDOUT, and STDERR, and of -B, is UTF-8. - =head1 HISTORY This pragma first appeared in Perl 5.8.0. For features that require 5.8.1 and better, see above. -The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. - =head1 SEE ALSO L, L, L, L, diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index 554adce..a1cc253 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -2,7 +2,7 @@ package Encode::Alias; use strict; no warnings 'redefine'; use Encode; -our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; sub DEBUG () { 0 } use base qw(Exporter); @@ -18,45 +18,45 @@ our @EXPORT = our @Alias; # ordered matching list our %Alias; # cached known aliases -sub find_alias -{ +sub find_alias{ my $class = shift; my $find = shift; - unless (exists $Alias{$find}) - { + unless (exists $Alias{$find}) { $Alias{$find} = undef; # Recursion guard - for (my $i=0; $i < @Alias; $i += 2) - { + for (my $i=0; $i < @Alias; $i += 2){ my $alias = $Alias[$i]; my $val = $Alias[$i+1]; my $new; - if (ref($alias) eq 'Regexp' && $find =~ $alias) - { + if (ref($alias) eq 'Regexp' && $find =~ $alias){ DEBUG and warn "eval $val"; $new = eval $val; DEBUG and $@ and warn "$val, $@"; - } - elsif (ref($alias) eq 'CODE') - { + }elsif (ref($alias) eq 'CODE'){ DEBUG and warn "$alias", "->", "($find)"; $new = $alias->($find); - } - elsif (lc($find) eq lc($alias)) - { + }elsif (lc($find) eq lc($alias)){ $new = $val; } - if (defined($new)) - { + if (defined($new)){ next if $new eq $find; # avoid (direct) recursion on bugs DEBUG and warn "$alias, $new"; my $enc = (ref($new)) ? $new : Encode::find_encoding($new); - if ($enc) - { + if ($enc){ $Alias{$find} = $enc; last; } } } + # case insensitive search when canonical is not in all lowercase + # RT ticket #7835 + unless ($Alias{$find}){ + my $lcfind = lc($find); + for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){ + $lcfind eq lc($name) or next; + $Alias{$find} = Encode::find_encoding($name); + DEBUG and warn "$find => $name"; + } + } } if (DEBUG){ my $name; @@ -70,23 +70,19 @@ sub find_alias return $Alias{$find}; } -sub define_alias -{ - while (@_) - { +sub define_alias{ + while (@_){ my ($alias,$name) = splice(@_,0,2); unshift(@Alias, $alias => $name); # newer one has precedence - # clear %Alias cache to allow overrides if (ref($alias)){ + # clear %Alias cache to allow overrides my @a = keys %Alias; for my $k (@a){ - if (ref($alias) eq 'Regexp' && $k =~ $alias) - { + if (ref($alias) eq 'Regexp' && $k =~ $alias){ DEBUG and warn "delete \$Alias\{$k\}"; delete $Alias{$k}; } - elsif (ref($alias) eq 'CODE') - { + elsif (ref($alias) eq 'CODE'){ DEBUG and warn "delete \$Alias\{$k\}"; delete $Alias{$alias->($name)}; } @@ -99,7 +95,7 @@ sub define_alias } # Allow latin-1 style names as well - # 0 1 2 3 4 5 6 7 8 9 10 +# 0 1 2 3 4 5 6 7 8 9 10 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); # Allow winlatin1 style names as well our %Winlatin2cp = ( @@ -124,7 +120,6 @@ sub undef_aliases{ sub init_aliases { undef_aliases(); - # Try all-lower-case version should all else fails define_alias( qr/^(.*)$/ => '"\L$1"' ); @@ -134,9 +129,9 @@ sub init_aliases define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', qr/^iso-10646-1$/i => '"UCS-2BE"' ); - define_alias( qr/^UTF(16|32)-?BE$/i => '"UTF-$1BE"', - qr/^UTF(16|32)-?LE$/i => '"UTF-$1LE"', - qr/^UTF(16|32)$/i => '"UTF-$1"', + define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', + qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', + qr/^UTF-?(16|32)$/i => '"UTF-$1"', ); # ASCII define_alias(qr/^(?:US-?)ascii$/i => '"ascii"'); @@ -211,6 +206,7 @@ sub init_aliases define_alias( qr/\bujis$/i => '"euc-jp"' ); define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); define_alias( qr/\bsjis$/i => '"shiftjis"' ); + define_alias( qr/\bwindows-31j$/i => '"cp932"' ); # for Encode::KR define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index 9280a97..7a535c6 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -664,7 +664,8 @@ probably has more rights for the name, though it may be objected that Microsoft shouldn't have used JIS as part of the name in the first place. -Unambiguous name: C. C name (not used?): C. +Unambiguous name: C. C name (also used by Mozilla, and +provided as an alias by Encode): C. Encode separately supports C and C. diff --git a/ext/Encode/ucm/big5-hkscs.ucm b/ext/Encode/ucm/big5-hkscs.ucm index 75bffc4..7bc3662 100644 --- a/ext/Encode/ucm/big5-hkscs.ucm +++ b/ext/Encode/ucm/big5-hkscs.ucm @@ -1,5 +1,5 @@ # -# $Id: big5-hkscs.ucm,v 2.1 2004/08/31 10:55:34 dankogai Exp dankogai $ +# $Id: big5-hkscs.ucm,v 2.1 2004/08/31 10:55:34 dankogai Exp $ # "big5-hkscs" 1