From: Rafael Garcia-Suarez Date: Thu, 4 May 2006 12:06:33 +0000 (+0000) Subject: Upgrade to Encode 2.16 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d1256cb1bc05a9b21e5ab396a011ea9c02150dd4;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 2.16 p4raw-id: //depot/perl@28098 --- diff --git a/ext/Encode/Byte/Byte.pm b/ext/Encode/Byte/Byte.pm index d40c1c3..03ba73a 100644 --- a/ext/Encode/Byte/Byte.pm +++ b/ext/Encode/Byte/Byte.pm @@ -1,9 +1,9 @@ package Encode::Byte; use Encode; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; -XSLoader::load(__PACKAGE__,$VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); 1; __END__ diff --git a/ext/Encode/Byte/Makefile.PL b/ext/Encode/Byte/Makefile.PL index 6390522..67c3922 100644 --- a/ext/Encode/Byte/Makefile.PL +++ b/ext/Encode/Byte/Makefile.PL @@ -5,35 +5,35 @@ use File::Spec::Functions; my $name = 'Byte'; my %tables = ( - byte_t => - [ - # misc. vendors - 'gsm0338.ucm', - 'nextstep.ucm', - 'hp-roman8.ucm', - 'viscii.ucm', - 'adobeStdenc.ucm', - # koi8 - 'koi8-f.ucm', 'koi8-r.ucm', 'koi8-u.ucm', - # Mac - qw( - macArabic.ucm - macCentEuro.ucm - macCroatian.ucm - macCyrillic.ucm - macFarsi.ucm - macGreek.ucm - macHebrew.ucm - macIceland.ucm - macRoman.ucm - macROMnn.ucm - macRUMnn.ucm - macSami.ucm - macThai.ucm - macTurkish.ucm - macUkraine.ucm - ), - ], + byte_t => + [ + # misc. vendors + 'gsm0338.ucm', + 'nextstep.ucm', + 'hp-roman8.ucm', + 'viscii.ucm', + 'adobeStdenc.ucm', + # koi8 + 'koi8-f.ucm', 'koi8-r.ucm', 'koi8-u.ucm', + # Mac + qw( + macArabic.ucm + macCentEuro.ucm + macCroatian.ucm + macCyrillic.ucm + macFarsi.ucm + macGreek.ucm + macHebrew.ucm + macIceland.ucm + macRoman.ucm + macROMnn.ucm + macRUMnn.ucm + macSami.ucm + macThai.ucm + macTurkish.ucm + macUkraine.ucm + ), + ], ); my %not_here = @@ -55,18 +55,18 @@ closedir(ENC); WriteMakefile( INC => "-I../Encode", - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', - ); + NAME => 'Encode::'.$name, + VERSION_FROM => "$name.pm", + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, + # OS 390 winges about line numbers > 64K ??? + XSOPT => '-nolinenumbers', + ); package MY; @@ -78,7 +78,7 @@ sub post_initialize # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } $o{"$name$x"} = 1; $self->{'O_FILES'} = [sort keys %o]; @@ -89,14 +89,14 @@ sub post_initialize $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; my %xs; foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); - } - $self->{SOURCE} .= " $table.c" - if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; + push (@{$self->{'C'}},"$table.c"); + # Do NOT add $table.h etc. to H_FILES unless we own up as to how they + # get built. + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { + push (@files,$table.$ext); + } + $self->{SOURCE} .= " $table.c" + if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; } $self->{'XS'} = { "$name.xs" => "$name.c" }; $self->{'clean'}{'FILES'} .= join(' ',@files); @@ -109,7 +109,7 @@ sub post_initialize #include "encode.h" END foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + print XS qq[#include "${table}.h"\n]; } print XS <<"END"; @@ -138,7 +138,7 @@ BOOT: { END foreach my $table (keys %tables) { - print XS qq[#include "${table}.exh"\n]; + print XS qq[#include "${table}.exh"\n]; } print XS "}\n"; close(XS); @@ -153,7 +153,7 @@ sub postamble $str .= "$name.c : $name.xs "; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; @@ -161,33 +161,33 @@ sub postamble my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : $enc2xs Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; - my $ucopts = '-"Q" -"O"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + $str .= "$table.c : $enc2xs Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; + my $ucopts = '-"Q" -"O"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm index be5a830..cdd3ae7 100644 --- a/ext/Encode/CN/CN.pm +++ b/ext/Encode/CN/CN.pm @@ -1,18 +1,20 @@ package Encode::CN; + BEGIN { - if (ord("A") == 193) { - die "Encode::CN not supported on EBCDIC\n"; + if ( ord("A") == 193 ) { + die "Encode::CN not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode; use XSLoader; -XSLoader::load(__PACKAGE__,$VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); # Relocated from Encode.pm use Encode::CN::HZ; + # use Encode::CN::2022_CN; 1; @@ -36,15 +38,15 @@ Encodings supported are as follows. Canonical Alias Description -------------------------------------------------------------------- euc-cn /\beuc.*cn$/i EUC (Extended Unix Character) - /\bcn.*euc$/i + /\bcn.*euc$/i /\bGB[-_ ]?2312(?:\D.*$|$)/i (see below) gb2312-raw The raw (low-bit) GB2312 character map gb12345-raw Traditional chinese counterpart to - GB2312 (raw) + GB2312 (raw) iso-ir-165 GB2312 + GB6345 + GB8565 + additions MacChineseSimp GB2312 + Apple Additions cp936 Code Page 936, also known as GBK - (Extended GuoBiao) + (Extended GuoBiao) hz 7-bit escaped GB2312 encoding -------------------------------------------------------------------- diff --git a/ext/Encode/CN/Makefile.PL b/ext/Encode/CN/Makefile.PL index 61e1844..6d54404 100644 --- a/ext/Encode/CN/Makefile.PL +++ b/ext/Encode/CN/Makefile.PL @@ -4,25 +4,25 @@ use ExtUtils::MakeMaker; use strict; my %tables = (euc_cn_t => ['euc-cn.ucm', - 'cp936.ucm', - 'macChinsimp.ucm', - ], - '2312_t' => ['gb2312.ucm'], - '12345_t' => ['gb12345.ucm'], - ir_165_t => ['ir-165.ucm'], + 'cp936.ucm', + 'macChinsimp.ucm', + ], + '2312_t' => ['gb2312.ucm'], + '12345_t' => ['gb12345.ucm'], + ir_165_t => ['ir-165.ucm'], ); unless ($ENV{AGGREGATE_TABLES}){ my @ucm; for my $k (keys %tables){ - push @ucm, @{$tables{$k}}; + push @ucm, @{$tables{$k}}; } %tables = (); my $seq = 0; for my $ucm (sort @ucm){ - # 8.3 compliance ! - my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); - $tables{$t} = [ $ucm ]; + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; } } @@ -30,19 +30,19 @@ my $name = 'CN'; WriteMakefile( INC => "-I../Encode", - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', + NAME => 'Encode::'.$name, + VERSION_FROM => "$name.pm", + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, + # OS 390 winges about line numbers > 64K ??? + XSOPT => '-nolinenumbers', XSPROTOARG => '-noprototypes', - ); + ); package MY; @@ -54,7 +54,7 @@ sub post_initialize # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } $o{"$name$x"} = 1; $self->{'O_FILES'} = [sort keys %o]; @@ -65,14 +65,14 @@ sub post_initialize $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; my %xs; foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); - } - $self->{SOURCE} .= " $table.c" - if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; + push (@{$self->{'C'}},"$table.c"); + # Do NOT add $table.h etc. to H_FILES unless we own up as to how they + # get built. + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { + push (@files,$table.$ext); + } + $self->{SOURCE} .= " $table.c" + if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; } $self->{'XS'} = { "$name.xs" => "$name.c" }; $self->{'clean'}{'FILES'} .= join(' ',@files); @@ -85,7 +85,7 @@ sub post_initialize #include "encode.h" END foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + print XS qq[#include "${table}.h"\n]; } print XS <<"END"; @@ -114,7 +114,7 @@ BOOT: { END foreach my $table (keys %tables) { - print XS qq[#include "${table}.exh"\n]; + print XS qq[#include "${table}.exh"\n]; } print XS "}\n"; close(XS); @@ -129,7 +129,7 @@ sub postamble $str .= "$name.c : $name.xs "; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; @@ -137,33 +137,33 @@ sub postamble my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : $enc2xs Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; - my $ucopts = '-"Q"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + $str .= "$table.c : $enc2xs Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; + my $ucopts = '-"Q"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 50a7c3f..a904e3c 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,18 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 2.15 2006/04/06 15:44:11 dankogai Exp dankogai $ +# $Id: Changes,v 2.16 2006/05/03 18:24:10 dankogai Exp $ # +$Revision: 2.16 $ $Date: 2006/05/03 18:24:10 $ +! bin/piconv + --xmlcref and --htmlcref added. +! Encode.pm + Copyright Notice Added. + http://rt.cpan.org/NoAuth/Bug.html?id=#19056 +! * + Replaced remaining ^\t with q( ) x 4. -- Perl Best Practice pp. 20 + And all .pm's are now perltidy-ed. -$Revision: 2.15 $ $Date: 2006/04/06 15:44:11 $ +2.15 2006/04/06 15:44:11 ! Unicode/Unicode.xs Addressed: UTF-16, UTF-32, UCS, UTF-7 decoders mishandle illegal characters http://rt.cpan.org/NoAuth/Bug.html?id=#18556 diff --git a/ext/Encode/EBCDIC/EBCDIC.pm b/ext/Encode/EBCDIC/EBCDIC.pm index 200a82f..0d63fe3 100644 --- a/ext/Encode/EBCDIC/EBCDIC.pm +++ b/ext/Encode/EBCDIC/EBCDIC.pm @@ -1,9 +1,9 @@ package Encode::EBCDIC; use Encode; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; -XSLoader::load(__PACKAGE__,$VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); 1; __END__ diff --git a/ext/Encode/EBCDIC/Makefile.PL b/ext/Encode/EBCDIC/Makefile.PL index 12016e7..0e2a13c 100644 --- a/ext/Encode/EBCDIC/Makefile.PL +++ b/ext/Encode/EBCDIC/Makefile.PL @@ -4,26 +4,26 @@ use ExtUtils::MakeMaker; my $name = 'EBCDIC'; my %tables = ( - ebcdic_t => - ['posix-bc.ucm', - qw(cp037.ucm cp1026.ucm cp1047.ucm cp500.ucm cp875.ucm), - ], - ); + ebcdic_t => + ['posix-bc.ucm', + qw(cp037.ucm cp1026.ucm cp1047.ucm cp500.ucm cp875.ucm), + ], + ); WriteMakefile( INC => "-I../Encode", - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', - ); + NAME => 'Encode::'.$name, + VERSION_FROM => "$name.pm", + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, + # OS 390 winges about line numbers > 64K ??? + XSOPT => '-nolinenumbers', + ); package MY; @@ -35,7 +35,7 @@ sub post_initialize # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } $o{"$name$x"} = 1; $self->{'O_FILES'} = [sort keys %o]; @@ -46,14 +46,14 @@ sub post_initialize $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; my %xs; foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); - } - $self->{SOURCE} .= " $table.c" - if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; + push (@{$self->{'C'}},"$table.c"); + # Do NOT add $table.h etc. to H_FILES unless we own up as to how they + # get built. + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { + push (@files,$table.$ext); + } + $self->{SOURCE} .= " $table.c" + if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; } $self->{'XS'} = { "$name.xs" => "$name.c" }; $self->{'clean'}{'FILES'} .= join(' ',@files); @@ -66,7 +66,7 @@ sub post_initialize #include "encode.h" END foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + print XS qq[#include "${table}.h"\n]; } print XS <<"END"; @@ -95,7 +95,7 @@ BOOT: { END foreach my $table (keys %tables) { - print XS qq[#include "${table}.exh"\n]; + print XS qq[#include "${table}.exh"\n]; } print XS "}\n"; close(XS); @@ -110,7 +110,7 @@ sub postamble $str .= "$name.c : $name.xs "; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; @@ -118,33 +118,33 @@ sub postamble my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : $enc2xs Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; - my $ucopts = '-"Q" -"O"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + $str .= "$table.c : $enc2xs Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; + my $ucopts = '-"Q" -"O"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 75d0e51..61a5e89 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,12 +1,12 @@ # -# $Id: Encode.pm,v 2.15 2006/04/06 15:44:11 dankogai Exp dankogai $ +# $Id: Encode.pm,v 2.16 2006/05/03 18:32:25 dankogai Exp dankogai $ # package Encode; use strict; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.15 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.16 $ =~ /(\d+)/g; sub DEBUG () { 0 } use XSLoader (); -XSLoader::load(__PACKAGE__, $VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); require Exporter; use base qw/Exporter/; @@ -17,31 +17,31 @@ our @EXPORT = qw( decode decode_utf8 encode encode_utf8 str2bytes bytes2str encodings find_encoding clone_encoding ); - -our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC - PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL); -our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN - FB_PERLQQ FB_HTMLCREF FB_XMLCREF); - -our @EXPORT_OK = - ( - qw( - _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit - is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade +our @FB_FLAGS = qw( + DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC + PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL +); +our @FB_CONSTS = qw( + FB_DEFAULT FB_CROAK FB_QUIET FB_WARN + FB_PERLQQ FB_HTMLCREF FB_XMLCREF +); +our @EXPORT_OK = ( + qw( + _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit + is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade ), - @FB_FLAGS, @FB_CONSTS, - ); + @FB_FLAGS, @FB_CONSTS, +); -our %EXPORT_TAGS = - ( - all => [ @EXPORT, @EXPORT_OK ], - fallbacks => [ @FB_CONSTS ], - fallback_all => [ @FB_CONSTS, @FB_FLAGS ], - ); +our %EXPORT_TAGS = ( + all => [ @EXPORT, @EXPORT_OK ], + fallbacks => [@FB_CONSTS], + fallback_all => [ @FB_CONSTS, @FB_FLAGS ], +); # Documentation moved after __END__ for speed - NI-S -our $ON_EBCDIC = (ord("A") == 193); +our $ON_EBCDIC = ( ord("A") == 193 ); use Encode::Alias; @@ -51,49 +51,46 @@ our %ExtModule; require Encode::Config; eval { require Encode::ConfigLocal }; -sub encodings -{ +sub encodings { my $class = shift; my %enc; - if (@_ and $_[0] eq ":all"){ - %enc = ( %Encoding, %ExtModule ); - }else{ - %enc = %Encoding; - for my $mod (map {m/::/o ? $_ : "Encode::$_" } @_){ - DEBUG and warn $mod; - for my $enc (keys %ExtModule){ - $ExtModule{$enc} eq $mod and $enc{$enc} = $mod; - } - } + if ( @_ and $_[0] eq ":all" ) { + %enc = ( %Encoding, %ExtModule ); } - return - sort { lc $a cmp lc $b } - grep {!/^(?:Internal|Unicode|Guess)$/o} keys %enc; + else { + %enc = %Encoding; + for my $mod ( map { m/::/o ? $_ : "Encode::$_" } @_ ) { + DEBUG and warn $mod; + for my $enc ( keys %ExtModule ) { + $ExtModule{$enc} eq $mod and $enc{$enc} = $mod; + } + } + } + return sort { lc $a cmp lc $b } + grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc; } -sub perlio_ok{ - my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]); +sub perlio_ok { + my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] ); $obj->can("perlio_ok") and return $obj->perlio_ok(); - return 0; # safety net + return 0; # safety net } -sub define_encoding -{ +sub define_encoding { my $obj = shift; my $name = shift; $Encoding{$name} = $obj; my $lc = lc($name); - define_alias($lc => $obj) unless $lc eq $name; - while (@_){ - my $alias = shift; - define_alias($alias, $obj); + define_alias( $lc => $obj ) unless $lc eq $name; + while (@_) { + my $alias = shift; + define_alias( $alias, $obj ); } return $obj; } -sub getEncoding -{ - my ($class, $name, $skip_external) = @_; +sub getEncoding { + my ( $class, $name, $skip_external ) = @_; ref($name) && $name->can('renew') and return $name; exists $Encoding{$name} and return $Encoding{$name}; @@ -105,30 +102,29 @@ sub getEncoding $lc ne $name and $oc = $class->find_alias($lc); defined($oc) and return $oc; - unless ($skip_external) - { - if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){ - $mod =~ s,::,/,g ; $mod .= '.pm'; - eval{ require $mod; }; - exists $Encoding{$name} and return $Encoding{$name}; - } + unless ($skip_external) { + if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) { + $mod =~ s,::,/,g; + $mod .= '.pm'; + eval { require $mod; }; + exists $Encoding{$name} and return $Encoding{$name}; + } } return; } -sub find_encoding($;$) -{ - my ($name, $skip_external) = @_; - return __PACKAGE__->getEncoding($name,$skip_external); +sub find_encoding($;$) { + my ( $name, $skip_external ) = @_; + return __PACKAGE__->getEncoding( $name, $skip_external ); } -sub resolve_alias($){ +sub resolve_alias($) { my $obj = find_encoding(shift); defined $obj and return $obj->name; return; } -sub clone_encoding($){ +sub clone_encoding($) { my $obj = find_encoding(shift); ref $obj or return; eval { require Storable }; @@ -136,77 +132,73 @@ sub clone_encoding($){ return Storable::dclone($obj); } -sub encode($$;$) -{ - my ($name, $string, $check) = @_; +sub encode($$;$) { + my ( $name, $string, $check ) = @_; return undef unless defined $string; - $string .= '' if ref $string; # stringify; - $check ||=0; + $string .= '' if ref $string; # stringify; + $check ||= 0; my $enc = find_encoding($name); - unless(defined $enc){ - require Carp; - Carp::croak("Unknown encoding '$name'"); + unless ( defined $enc ) { + require Carp; + Carp::croak("Unknown encoding '$name'"); } - my $octets = $enc->encode($string,$check); - $_[1] = $string if $check and !($check & LEAVE_SRC()); + my $octets = $enc->encode( $string, $check ); + $_[1] = $string if $check and !( $check & LEAVE_SRC() ); return $octets; } *str2bytes = \&encode; -sub decode($$;$) -{ - my ($name,$octets,$check) = @_; +sub decode($$;$) { + my ( $name, $octets, $check ) = @_; return undef unless defined $octets; $octets .= '' if ref $octets; - $check ||=0; + $check ||= 0; my $enc = find_encoding($name); - unless(defined $enc){ - require Carp; - Carp::croak("Unknown encoding '$name'"); + unless ( defined $enc ) { + require Carp; + Carp::croak("Unknown encoding '$name'"); } - my $string = $enc->decode($octets,$check); - $_[1] = $octets if $check and !($check & LEAVE_SRC()); + my $string = $enc->decode( $octets, $check ); + $_[1] = $octets if $check and !( $check & LEAVE_SRC() ); return $string; } *bytes2str = \&decode; -sub from_to($$$;$) -{ - my ($string,$from,$to,$check) = @_; +sub from_to($$$;$) { + my ( $string, $from, $to, $check ) = @_; return undef unless defined $string; - $check ||=0; + $check ||= 0; my $f = find_encoding($from); - unless (defined $f){ - require Carp; - Carp::croak("Unknown encoding '$from'"); + unless ( defined $f ) { + require Carp; + Carp::croak("Unknown encoding '$from'"); } my $t = find_encoding($to); - unless (defined $t){ - require Carp; - Carp::croak("Unknown encoding '$to'"); + unless ( defined $t ) { + require Carp; + Carp::croak("Unknown encoding '$to'"); } my $uni = $f->decode($string); - $_[0] = $string = $t->encode($uni,$check); - return undef if ($check && length($uni)); - return defined($_[0]) ? length($string) : undef ; + $_[0] = $string = $t->encode( $uni, $check ); + return undef if ( $check && length($uni) ); + return defined( $_[0] ) ? length($string) : undef; } -sub encode_utf8($) -{ +sub encode_utf8($) { my ($str) = @_; utf8::encode($str); return $str; } -sub decode_utf8($;$) -{ - my ($str, $check) = @_; +sub decode_utf8($;$) { + my ( $str, $check ) = @_; return $str if is_utf8($str); - if ($check){ - return decode("utf8", $str, $check); - }else{ - return decode("utf8", $str); - return $str; + if ($check) { + return decode( "utf8", $str, $check ); + } + else { + return decode( "utf8", $str ); + return $str; } } @@ -216,94 +208,107 @@ predefine_encodings(1); # This is to restore %Encoding if really needed; # -sub predefine_encodings{ +sub predefine_encodings { use Encode::Encoding; no warnings 'redefine'; my $use_xs = shift; if ($ON_EBCDIC) { - # was in Encode::UTF_EBCDIC - package Encode::UTF_EBCDIC; - push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'; - *decode = sub{ - my ($obj,$str,$chk) = @_; - my $res = ''; - for (my $i = 0; $i < length($str); $i++) { - $res .= - chr(utf8::unicode_to_native(ord(substr($str,$i,1)))); - } - $_[1] = '' if $chk; - return $res; - }; - *encode = sub{ - my ($obj,$str,$chk) = @_; - my $res = ''; - for (my $i = 0; $i < length($str); $i++) { - $res .= - chr(utf8::native_to_unicode(ord(substr($str,$i,1)))); - } - $_[1] = '' if $chk; - return $res; - }; - $Encode::Encoding{Unicode} = - bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC"; - } else { - package Encode::Internal; - push @Encode::Internal::ISA, 'Encode::Encoding'; - *decode = sub{ - my ($obj,$str,$chk) = @_; - utf8::upgrade($str); - $_[1] = '' if $chk; - return $str; - }; - *encode = \&decode; - $Encode::Encoding{Unicode} = - bless {Name => "Internal"} => "Encode::Internal"; + + # was in Encode::UTF_EBCDIC + package Encode::UTF_EBCDIC; + push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'; + *decode = sub { + my ( $obj, $str, $chk ) = @_; + my $res = ''; + for ( my $i = 0 ; $i < length($str) ; $i++ ) { + $res .= + chr( + utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) ) + ); + } + $_[1] = '' if $chk; + return $res; + }; + *encode = sub { + my ( $obj, $str, $chk ) = @_; + my $res = ''; + for ( my $i = 0 ; $i < length($str) ; $i++ ) { + $res .= + chr( + utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) ) + ); + } + $_[1] = '' if $chk; + return $res; + }; + $Encode::Encoding{Unicode} = + bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC"; + } + else { + + package Encode::Internal; + push @Encode::Internal::ISA, 'Encode::Encoding'; + *decode = sub { + my ( $obj, $str, $chk ) = @_; + utf8::upgrade($str); + $_[1] = '' if $chk; + return $str; + }; + *encode = \&decode; + $Encode::Encoding{Unicode} = + bless { Name => "Internal" } => "Encode::Internal"; } { - # was in Encode::utf8 - package Encode::utf8; - push @Encode::utf8::ISA, 'Encode::Encoding'; - # - if ($use_xs){ - Encode::DEBUG and warn __PACKAGE__, " XS on"; - *decode = \&decode_xs; - *encode = \&encode_xs; - }else{ - Encode::DEBUG and warn __PACKAGE__, " XS off"; - *decode = sub{ - my ($obj,$octets,$chk) = @_; - my $str = Encode::decode_utf8($octets); - if (defined $str) { - $_[1] = '' if $chk; - return $str; - } - return undef; - }; - *encode = sub { - my ($obj,$string,$chk) = @_; - my $octets = Encode::encode_utf8($string); - $_[1] = '' if $chk; - return $octets; - }; - } - *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk) - my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk - my ($rdst, $rsrc, $rpos) = \@_[1,2,3]; - use bytes; - if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) { - $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm)); - $$rpos = $npos + length($trm); - return 1; - } - $$rdst .= substr($$rsrc, $pos); - $$rpos = length($$rsrc); - return ''; - }; - $Encode::Encoding{utf8} = - bless {Name => "utf8"} => "Encode::utf8"; - $Encode::Encoding{"utf-8-strict"} = - bless {Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8"; + + # was in Encode::utf8 + package Encode::utf8; + push @Encode::utf8::ISA, 'Encode::Encoding'; + + # + if ($use_xs) { + Encode::DEBUG and warn __PACKAGE__, " XS on"; + *decode = \&decode_xs; + *encode = \&encode_xs; + } + else { + Encode::DEBUG and warn __PACKAGE__, " XS off"; + *decode = sub { + my ( $obj, $octets, $chk ) = @_; + my $str = Encode::decode_utf8($octets); + if ( defined $str ) { + $_[1] = '' if $chk; + return $str; + } + return undef; + }; + *encode = sub { + my ( $obj, $string, $chk ) = @_; + my $octets = Encode::encode_utf8($string); + $_[1] = '' if $chk; + return $octets; + }; + } + *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk) + # currently ignores $chk + my ( $obj, undef, undef, $pos, $trm ) = @_; + my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; + use bytes; + if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) { + $$rdst .= + substr( $$rsrc, $pos, $npos - $pos + length($trm) ); + $$rpos = $npos + length($trm); + return 1; + } + $$rdst .= substr( $$rsrc, $pos ); + $$rpos = length($$rsrc); + return ''; + }; + $Encode::Encoding{utf8} = + bless { Name => "utf8" } => "Encode::utf8"; + $Encode::Encoding{"utf-8-strict"} = + bless { Name => "utf-8-strict", strict_utf8 => 1 } => + "Encode::utf8"; } } @@ -850,4 +855,15 @@ by Dan Kogai Edankogai@dan.co.jpE. See AUTHORS for a full list of people involved. For any questions, use Eperl-unicode@perl.orgE so we can all share. +While Dan Kogai retains the copyright as a maintainer, the credit +should go to all those involoved. See AUTHORS for those submitted +codes. + +=head1 COPYRIGHT + +Copyright 2002-2006 Dan Kogai Edankogai@dan.co.jpE + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + =cut diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 3c8d681..709b764 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 2.8 2006/04/06 15:44:11 dankogai Exp dankogai $ + $Id: Encode.xs,v 2.9 2006/05/03 18:24:10 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -22,7 +22,7 @@ #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ Perl_croak(aTHX_ "panic_unimplemented"); \ - return (y)0; /* fool picky compilers */ \ + return (y)0; /* fool picky compilers */ \ } /**/ @@ -47,8 +47,8 @@ Encode_XSEncoding(pTHX_ encode_t * enc) PUSHMARK(sp); XPUSHs(sv); while (enc->name[i]) { - const char *name = enc->name[i++]; - XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); + const char *name = enc->name[i++]; + XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); } PUTBACK; call_pv("Encode::define_encoding", G_DISCARD); @@ -79,7 +79,7 @@ do_fallback_cb(pTHX_ UV ch) argc = call_sv(fallback_cb, G_SCALAR); SPAGAIN; if (argc != 1){ - croak("fallback sub must return scalar!"); + croak("fallback sub must return scalar!"); } retval = newSVsv(POPs); PUTBACK; @@ -90,7 +90,7 @@ do_fallback_cb(pTHX_ UV ch) static SV * encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, - int check, STRLEN * offset, SV * term, int * retcode) + int check, STRLEN * offset, SV * term, int * retcode) { STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); @@ -110,156 +110,156 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, if (offset) { s += *offset; if (slen > *offset){ /* safeguard against slen overflow */ - slen -= *offset; + slen -= *offset; }else{ - slen = 0; + slen = 0; } tlen = slen; } if (slen == 0){ - SvCUR_set(dst, 0); - SvPOK_only(dst); - goto ENCODE_END; + SvCUR_set(dst, 0); + SvPOK_only(dst); + goto ENCODE_END; } while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, - trm, trmlen)) ) + trm, trmlen)) ) { - SvCUR_set(dst, dlen+ddone); - SvPOK_only(dst); - - if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || - code == ENCODE_FOUND_TERM) { - break; - } - switch (code) { - case ENCODE_NOSPACE: - { - STRLEN more = 0; /* make sure you initialize! */ - STRLEN sleft; - sdone += slen; - ddone += dlen; - sleft = tlen - sdone; + SvCUR_set(dst, dlen+ddone); + SvPOK_only(dst); + + if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || + code == ENCODE_FOUND_TERM) { + break; + } + switch (code) { + case ENCODE_NOSPACE: + { + STRLEN more = 0; /* make sure you initialize! */ + STRLEN sleft; + sdone += slen; + ddone += dlen; + sleft = tlen - sdone; #if ENCODE_XS_PROFILE >= 2 - Perl_warn(aTHX_ - "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", - more, sdone, sleft, SvLEN(dst)); + Perl_warn(aTHX_ + "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", + more, sdone, sleft, SvLEN(dst)); #endif - if (sdone != 0) { /* has src ever been processed ? */ + if (sdone != 0) { /* has src ever been processed ? */ #if ENCODE_XS_USEFP == 2 - more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone - - SvLEN(dst); + more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone + - SvLEN(dst); #elif ENCODE_XS_USEFP - more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); + more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); #else - /* safe until SvLEN(dst) == MAX_INT/16 */ - more = (16*SvLEN(dst)+1)/sdone/16 * sleft; + /* safe until SvLEN(dst) == MAX_INT/16 */ + more = (16*SvLEN(dst)+1)/sdone/16 * sleft; #endif - } - more += UTF8_MAXLEN; /* insurance policy */ - d = (U8 *) SvGROW(dst, SvLEN(dst) + more); - /* dst need to grow need MORE bytes! */ - if (ddone >= SvLEN(dst)) { - Perl_croak(aTHX_ "Destination couldn't be grown."); - } - dlen = SvLEN(dst)-ddone-1; - d += ddone; - s += slen; - slen = tlen-sdone; - continue; - } - case ENCODE_NOREP: - /* encoding */ - if (dir == enc->f_utf8) { - STRLEN clen; - UV ch = - utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), - &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); - /* if non-representable multibyte prefix at end of current buffer - break*/ - if (clen > tlen - sdone) break; - if (check & ENCODE_DIE_ON_ERR) { - Perl_croak(aTHX_ ERR_ENCODE_NOMAP, - (UV)ch, enc->name[0]); - return &PL_sv_undef; /* never reaches but be safe */ - } - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner(aTHX_ packWARN(WARN_UTF8), - ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); - } - if (check & ENCODE_RETURN_ON_ERR){ - goto ENCODE_SET_SRC; - } - if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ - SV* subchar = - (fallback_cb != (SV*)NULL) ? do_fallback_cb(aTHX_ ch) : - newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : - check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : - "&#x%" UVxf ";", (UV)ch); - sdone += slen + clen; - ddone += dlen + SvCUR(subchar); - sv_catsv(dst, subchar); - SvREFCNT_dec(subchar); - } else { - /* fallback char */ - sdone += slen + clen; - ddone += dlen + enc->replen; - sv_catpvn(dst, (char*)enc->rep, enc->replen); - } - } - /* decoding */ - else { - if (check & ENCODE_DIE_ON_ERR){ - Perl_croak(aTHX_ ERR_DECODE_NOMAP, + } + more += UTF8_MAXLEN; /* insurance policy */ + d = (U8 *) SvGROW(dst, SvLEN(dst) + more); + /* dst need to grow need MORE bytes! */ + if (ddone >= SvLEN(dst)) { + Perl_croak(aTHX_ "Destination couldn't be grown."); + } + dlen = SvLEN(dst)-ddone-1; + d += ddone; + s += slen; + slen = tlen-sdone; + continue; + } + case ENCODE_NOREP: + /* encoding */ + if (dir == enc->f_utf8) { + STRLEN clen; + UV ch = + utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), + &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); + /* if non-representable multibyte prefix at end of current buffer - break*/ + if (clen > tlen - sdone) break; + if (check & ENCODE_DIE_ON_ERR) { + Perl_croak(aTHX_ ERR_ENCODE_NOMAP, + (UV)ch, enc->name[0]); + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = + (fallback_cb != (SV*)NULL) ? do_fallback_cb(aTHX_ ch) : + newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : + check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : + "&#x%" UVxf ";", (UV)ch); + sdone += slen + clen; + ddone += dlen + SvCUR(subchar); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + /* fallback char */ + sdone += slen + clen; + ddone += dlen + enc->replen; + sv_catpvn(dst, (char*)enc->rep, enc->replen); + } + } + /* decoding */ + else { + if (check & ENCODE_DIE_ON_ERR){ + Perl_croak(aTHX_ ERR_DECODE_NOMAP, enc->name[0], (UV)s[slen]); - return &PL_sv_undef; /* never reaches but be safe */ - } - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner( - aTHX_ packWARN(WARN_UTF8), - ERR_DECODE_NOMAP, + return &PL_sv_undef; /* never reaches but be safe */ + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner( + aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, enc->name[0], (UV)s[slen]); - } - if (check & ENCODE_RETURN_ON_ERR){ - goto ENCODE_SET_SRC; - } - if (check & - (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ - SV* subchar = - (fallback_cb != (SV*)NULL) ? - do_fallback_cb(aTHX_ (UV)s[slen]) : - newSVpvf("\\x%02" UVXf, (UV)s[slen]); - sdone += slen + 1; - ddone += dlen + SvCUR(subchar); - sv_catsv(dst, subchar); - SvREFCNT_dec(subchar); - } else { - sdone += slen + 1; - ddone += dlen + strlen(FBCHAR_UTF8); - sv_catpv(dst, FBCHAR_UTF8); - } - } - /* settle variables when fallback */ - d = (U8 *)SvEND(dst); + } + if (check & ENCODE_RETURN_ON_ERR){ + goto ENCODE_SET_SRC; + } + if (check & + (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* subchar = + (fallback_cb != (SV*)NULL) ? + do_fallback_cb(aTHX_ (UV)s[slen]) : + newSVpvf("\\x%02" UVXf, (UV)s[slen]); + sdone += slen + 1; + ddone += dlen + SvCUR(subchar); + sv_catsv(dst, subchar); + SvREFCNT_dec(subchar); + } else { + sdone += slen + 1; + ddone += dlen + strlen(FBCHAR_UTF8); + sv_catpv(dst, FBCHAR_UTF8); + } + } + /* settle variables when fallback */ + d = (U8 *)SvEND(dst); dlen = SvLEN(dst) - ddone - 1; - s = (U8*)SvPVX(src) + sdone; - slen = tlen - sdone; - break; - - default: - Perl_croak(aTHX_ "Unexpected code %d converting %s %s", - code, (dir == enc->f_utf8) ? "to" : "from", - enc->name[0]); - return &PL_sv_undef; - } + s = (U8*)SvPVX(src) + sdone; + slen = tlen - sdone; + break; + + default: + Perl_croak(aTHX_ "Unexpected code %d converting %s %s", + code, (dir == enc->f_utf8) ? "to" : "from", + enc->name[0]); + return &PL_sv_undef; + } } ENCODE_SET_SRC: if (check && !(check & ENCODE_LEAVE_SRC)){ - sdone = SvCUR(src) - (slen+sdone); - if (sdone) { - sv_setpvn(src, (char*)s+slen, sdone); - } - SvCUR_set(src, sdone); + sdone = SvCUR(src) - (slen+sdone); + if (sdone) { + sv_setpvn(src, (char*)s+slen, sdone); + } + SvCUR_set(src, sdone); } /* warn("check = 0x%X, code = 0x%d\n", check, code); */ @@ -268,10 +268,10 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, #if ENCODE_XS_PROFILE if (SvCUR(dst) > SvCUR(src)){ - Perl_warn(aTHX_ - "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", - SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), - (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); + Perl_warn(aTHX_ + "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", + SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), + (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); } #endif @@ -332,8 +332,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check, UTF8_ALLOW_NONSTRICT) ); #if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ - if (strict && uv > PERL_UNICODE_MAX) - ulen = -1; + if (strict && uv > PERL_UNICODE_MAX) + ulen = -1; #endif if (ulen == -1) { if (strict) { @@ -418,37 +418,37 @@ CODE: XPUSHs(obj); PUTBACK; if (call_method("renewed",G_SCALAR) == 1) { - SPAGAIN; - renewed = (bool)POPi; - PUTBACK; + SPAGAIN; + renewed = (bool)POPi; + PUTBACK; #if 0 - fprintf(stderr, "renewed == %d\n", renewed); + fprintf(stderr, "renewed == %d\n", renewed); #endif } FREETMPS; LEAVE; /* end PerlIO check */ if (SvUTF8(src)) { - s = utf8_to_bytes(s,&slen); - if (s) { - SvCUR_set(src,slen); - SvUTF8_off(src); - e = s+slen; - } - else { - croak("Cannot decode string with wide characters"); - } + s = utf8_to_bytes(s,&slen); + if (s) { + SvCUR_set(src,slen); + SvUTF8_off(src); + e = s+slen; + } + else { + croak("Cannot decode string with wide characters"); + } } s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed); /* Clear out translated part of source unless asked not to */ if (check && !(check & ENCODE_LEAVE_SRC)){ - slen = e-s; - if (slen) { - sv_setpvn(src, (char*)s, slen); - } - SvCUR_set(src, slen); + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); } SvUTF8_on(dst); ST(0) = sv_2mortal(dst); @@ -467,19 +467,19 @@ CODE: U8 *e = (U8 *) SvEND(src); SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ if (SvUTF8(src)) { - /* Already encoded */ - if (strict_utf8(aTHX_ obj)) { - s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0); - } + /* Already encoded */ + if (strict_utf8(aTHX_ obj)) { + s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0); + } else { /* trust it and just copy the octets */ sv_setpvn(dst,(char *)s,(e-s)); - s = e; + s = e; } } else { /* Native bytes - can always encode */ - U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ + U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ while (s < e) { UV uv = NATIVE_TO_UNI((UV) *s++); if (UNI_IS_INVARIANT(uv)) @@ -488,18 +488,18 @@ CODE: *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); } - } + } SvCUR_set(dst, d- (U8 *)SvPVX(dst)); *SvEND(dst) = '\0'; } /* Clear out translated part of source unless asked not to */ if (check && !(check & ENCODE_LEAVE_SRC)){ - slen = e-s; - if (slen) { - sv_setpvn(src, (char*)s, slen); - } - SvCUR_set(src, slen); + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); } SvPOK_only(dst); SvUTF8_off(dst); @@ -554,12 +554,12 @@ CODE: sv_utf8_downgrade(src, FALSE); } sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, - &offset, term, &code)); + &offset, term, &code)); SvIV_set(off, (IV)offset); if (code == ENCODE_FOUND_TERM) { - ST(0) = &PL_sv_yes; + ST(0) = &PL_sv_yes; }else{ - ST(0) = &PL_sv_no; + ST(0) = &PL_sv_no; } XSRETURN(1); } @@ -577,18 +577,18 @@ CODE: sv_utf8_downgrade(src, FALSE); } if (SvROK(check_sv)){ - if (fallback_cb == (SV*)NULL){ + if (fallback_cb == (SV*)NULL){ fallback_cb = newSVsv(check_sv); /* First time */ }else{ SvSetSV(fallback_cb, check_sv); /* Been here before */ - } - check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + } + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ }else{ - fallback_cb = (SV*)NULL; - check = SvIV(check_sv); + fallback_cb = (SV*)NULL; + check = SvIV(check_sv); } ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, - NULL, Nullsv, NULL); + NULL, Nullsv, NULL); SvUTF8_on(ST(0)); XSRETURN(1); } @@ -606,18 +606,18 @@ CODE: encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); sv_utf8_upgrade(src); if (SvROK(check_sv)){ - if (fallback_cb == (SV*)NULL){ + if (fallback_cb == (SV*)NULL){ fallback_cb = newSVsv(check_sv); /* First time */ }else{ SvSetSV(fallback_cb, check_sv); /* Been here before */ - } - check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ + } + check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ }else{ - fallback_cb = (SV*)NULL; - check = SvIV(check_sv); + fallback_cb = (SV*)NULL; + check = SvIV(check_sv); } ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, - NULL, Nullsv, NULL); + NULL, Nullsv, NULL); XSRETURN(1); } @@ -642,9 +642,9 @@ CODE: eval_pv("require PerlIO::encoding", 0); if (SvTRUE(get_sv("@", 0))) { - ST(0) = &PL_sv_no; + ST(0) = &PL_sv_no; }else{ - ST(0) = &PL_sv_yes; + ST(0) = &PL_sv_yes; } XSRETURN(1); } @@ -663,15 +663,15 @@ CODE: if (encoding) RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); else { - STRLEN len; - U8* s = (U8*)SvPV(sv, len); - U8* converted; - - converted = bytes_to_utf8(s, &len); /* This allocs */ - sv_setpvn(sv, (char *)converted, len); - SvUTF8_on(sv); /* XXX Should we? */ - Safefree(converted); /* ... so free it */ - RETVAL = len; + STRLEN len; + U8* s = (U8*)SvPV(sv, len); + U8* converted; + + converted = bytes_to_utf8(s, &len); /* This allocs */ + sv_setpvn(sv, (char *)converted, len); + SvUTF8_on(sv); /* XXX Should we? */ + Safefree(converted); /* ... so free it */ + RETVAL = len; } } OUTPUT: @@ -686,63 +686,58 @@ CODE: SV * check = items > 2 ? ST(2) : Nullsv; if (to) { - RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); + RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); } else { - STRLEN len; - U8 *s = (U8*)SvPV(sv, len); + STRLEN len; + U8 *s = (U8*)SvPV(sv, len); - RETVAL = 0; - if (SvTRUE(check)) { - /* Must do things the slow way */ - U8 *dest; + RETVAL = 0; + if (SvTRUE(check)) { + /* Must do things the slow way */ + U8 *dest; /* We need a copy to pass to check() */ - U8 *src = s; - U8 *send = s + len; - U8 *d0; + U8 *src = (U8*)savepv((char *)s); + U8 *send = s + len; - New(83, dest, len, U8); /* I think */ - d0 = dest; + New(83, dest, len, U8); /* I think */ - while (s < send) { + while (s < send) { if (*s < 0x80){ - *dest++ = *s++; + *dest++ = *s++; } else { - STRLEN ulen; - UV uv = *s++; - - /* Have to do it all ourselves because of error routine, - aargh. */ - if (!(uv & 0x40)){ goto failure; } - if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } - else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } - else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } - else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } - else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } - else if (!(uv & 0x01)) { ulen = 7; uv = 0; } - else { ulen = 13; uv = 0; } - - /* Note change to utf8.c variable naming, for variety */ - while (ulen--) { - if ((*s & 0xc0) != 0x80){ - goto failure; - } else { - uv = (uv << 6) | (*s++ & 0x3f); - } - } - if (uv > 256) { - failure: - call_failure(check, s, dest, src); - /* Now what happens? */ - } - *dest++ = (U8)uv; - } - } - RETVAL = dest - d0; - sv_usepvn(sv, (char *)dest, RETVAL); - SvUTF8_off(sv); - } else { - RETVAL = (utf8_to_bytes(s, &len) ? len : 0); - } + STRLEN ulen; + UV uv = *s++; + + /* Have to do it all ourselves because of error routine, + aargh. */ + if (!(uv & 0x40)){ goto failure; } + if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } + else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } + else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } + else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } + else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } + else if (!(uv & 0x01)) { ulen = 7; uv = 0; } + else { ulen = 13; uv = 0; } + + /* Note change to utf8.c variable naming, for variety */ + while (ulen--) { + if ((*s & 0xc0) != 0x80){ + goto failure; + } else { + uv = (uv << 6) | (*s++ & 0x3f); + } + } + if (uv > 256) { + failure: + call_failure(check, s, dest, src); + /* Now what happens? */ + } + *dest++ = (U8)uv; + } + } + } else { + RETVAL = (utf8_to_bytes(s, &len) ? len : 0); + } } } OUTPUT: @@ -755,18 +750,18 @@ int check CODE: { if (SvGMAGICAL(sv)) /* it could be $1, for example */ - sv = newSVsv(sv); /* GMAGIG will be done */ + sv = newSVsv(sv); /* GMAGIG will be done */ if (SvPOK(sv)) { - RETVAL = SvUTF8(sv) ? TRUE : FALSE; - if (RETVAL && - check && - !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) - RETVAL = FALSE; + RETVAL = SvUTF8(sv) ? TRUE : FALSE; + if (RETVAL && + check && + !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) + RETVAL = FALSE; } else { - RETVAL = FALSE; + RETVAL = FALSE; } if (sv != ST(0)) - SvREFCNT_dec(sv); /* it was a temp copy */ + SvREFCNT_dec(sv); /* it was a temp copy */ } OUTPUT: RETVAL @@ -777,11 +772,11 @@ SV * sv CODE: { if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - SvUTF8_on(sv); + SV *rsv = newSViv(SvUTF8(sv)); + RETVAL = rsv; + SvUTF8_on(sv); } else { - RETVAL = &PL_sv_undef; + RETVAL = &PL_sv_undef; } } OUTPUT: @@ -793,11 +788,11 @@ SV * sv CODE: { if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - SvUTF8_off(sv); + SV *rsv = newSViv(SvUTF8(sv)); + RETVAL = rsv; + SvUTF8_off(sv); } else { - RETVAL = &PL_sv_undef; + RETVAL = &PL_sv_undef; } } OUTPUT: diff --git a/ext/Encode/Encode/Makefile_PL.e2x b/ext/Encode/Encode/Makefile_PL.e2x index 3bca0bf..1d83a2f 100644 --- a/ext/Encode/Encode/Makefile_PL.e2x +++ b/ext/Encode/Encode/Makefile_PL.e2x @@ -11,15 +11,15 @@ use Config; # Please edit the following to the taste! my $name = '$_Name_'; my %tables = ( - $_Name__t => [ $_TableFiles_ ], - ); + $_Name__t => [ $_TableFiles_ ], + ); #### DO NOT EDIT BEYOND THIS POINT! require File::Spec; my ($enc2xs, $encode_h) = (); PATHLOOP: for my $d (@Config{qw/bin sitebin vendorbin/}, - (split /$Config{path_sep}/o, $ENV{PATH})){ + (split /$Config{path_sep}/o, $ENV{PATH})){ for my $f (qw/enc2xs enc2xs5.7.3/){ my $path = File::Spec->catfile($d, $f); -r $path and $enc2xs = $path and last PATHLOOP; @@ -41,21 +41,21 @@ print "encode.h is at $encode_h\n"; WriteMakefile( INC => "-I$encode_h", #### END_OF_HEADER -- DO NOT EDIT THIS LINE BY HAND! #### - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - PREREQ_PM => { - 'Encode' => "1.41", + NAME => 'Encode::'.$name, + VERSION_FROM => "$name.pm", + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, + PREREQ_PM => { + 'Encode' => "1.41", }, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', - ); + # OS 390 winges about line numbers > 64K ??? + XSOPT => '-nolinenumbers', + ); package MY; @@ -67,7 +67,7 @@ sub post_initialize # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } $o{"$name$x"} = 1; $self->{'O_FILES'} = [sort keys %o]; @@ -79,12 +79,12 @@ sub post_initialize # $self->{'H'} = [$self->catfile($self->updir,'encode.h')]; my %xs; foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); - } + push (@{$self->{'C'}},"$table.c"); + # Do NOT add $table.h etc. to H_FILES unless we own up as to how they + # get built. + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { + push (@files,$table.$ext); + } } $self->{'XS'} = { "$name.xs" => "$name.c" }; $self->{'clean'}{'FILES'} .= join(' ',@files); @@ -97,7 +97,7 @@ sub post_initialize #include "encode.h" END foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + print XS qq[#include "${table}.h"\n]; } print XS <<"END"; @@ -126,7 +126,7 @@ BOOT: { END foreach my $table (keys %tables) { - print XS qq[#include "${table}.exh"\n]; + print XS qq[#include "${table}.exh"\n]; } print XS "}\n"; close(XS); @@ -141,39 +141,39 @@ sub postamble $str .= "$name.c : $name.xs "; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - my $ucopts = '-"Q"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + $str .= "$table.c : Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + my $ucopts = '-"Q"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/Encode/encode.h b/ext/Encode/Encode/encode.h index a6880cf..0dcf83d 100644 --- a/ext/Encode/Encode/encode.h +++ b/ext/Encode/Encode/encode.h @@ -15,16 +15,16 @@ typedef struct encpage_s encpage_t; struct encpage_s { - /* fields ordered to pack nicely on 32-bit machines */ - const U8 *const seq; /* Packed output sequences we generate - if we match */ - const encpage_t *const next; /* Page to go to if we match */ - const U8 min; /* Min value of octet to match this entry */ - const U8 max; /* Max value of octet to match this entry */ - const U8 dlen; /* destination length - - size of entries in seq */ - const U8 slen; /* source length - - number of source octets needed */ + /* fields ordered to pack nicely on 32-bit machines */ + const U8 *const seq; /* Packed output sequences we generate + if we match */ + const encpage_t *const next; /* Page to go to if we match */ + const U8 min; /* Min value of octet to match this entry */ + const U8 max; /* Max value of octet to match this entry */ + const U8 dlen; /* destination length - + size of entries in seq */ + const U8 slen; /* source length - + number of source octets needed */ }; /* @@ -60,18 +60,18 @@ struct encpage_s typedef struct encode_s encode_t; struct encode_s { - const encpage_t *const t_utf8; /* Starting table for translation from - the encoding to UTF-8 form */ - const encpage_t *const f_utf8; /* Starting table for translation - from UTF-8 to the encoding */ - const U8 *const rep; /* Replacement character in this - encoding e.g. "?" */ - int replen; /* Number of octets in rep */ - U8 min_el; /* Minimum octets to represent a - character */ - U8 max_el; /* Maximum octets to represent a - character */ - const char *const name[2]; /* name(s) of this encoding */ + const encpage_t *const t_utf8; /* Starting table for translation from + the encoding to UTF-8 form */ + const encpage_t *const f_utf8; /* Starting table for translation + from UTF-8 to the encoding */ + const U8 *const rep; /* Replacement character in this + encoding e.g. "?" */ + int replen; /* Number of octets in rep */ + U8 min_el; /* Minimum octets to represent a + character */ + U8 max_el; /* Maximum octets to represent a + character */ + const char *const name[2]; /* name(s) of this encoding */ }; #ifdef U8 @@ -79,7 +79,7 @@ struct encode_s extern int do_encode(const encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx, - const U8 *term, STRLEN tlen); + const U8 *term, STRLEN tlen); extern void Encode_DefineEncoding(encode_t *enc); diff --git a/ext/Encode/JP/JP.pm b/ext/Encode/JP/JP.pm index 01ad37f..3577a8d 100644 --- a/ext/Encode/JP/JP.pm +++ b/ext/Encode/JP/JP.pm @@ -1,14 +1,15 @@ package Encode::JP; + BEGIN { - if (ord("A") == 193) { - die "Encode::JP not supported on EBCDIC\n"; + if ( ord("A") == 193 ) { + die "Encode::JP not supported on EBCDIC\n"; } } use Encode; -our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; -XSLoader::load(__PACKAGE__,$VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); use Encode::JP::JIS7; @@ -34,16 +35,16 @@ supported are as follows. -------------------------------------------------------------------- euc-jp /\beuc.*jp$/i EUC (Extended Unix Character) /\bjp.*euc/i - /\bujis$/i + /\bujis$/i shiftjis /\bshift.*jis$/i Shift JIS (aka MS Kanji) - /\bsjis$/i + /\bsjis$/i 7bit-jis /\bjis$/i 7bit JIS iso-2022-jp ISO-2022-JP [RFC1468] - = 7bit JIS with all Halfwidth Kana - converted to Fullwidth + = 7bit JIS with all Halfwidth Kana + converted to Fullwidth iso-2022-jp-1 ISO-2022-JP-1 [RFC2237] = ISO-2022-JP with JIS X 0212-1990 - support. See below + support. See below MacJapanese Shift JIS + Apple vendor mappings cp932 /\bwindows-31j$/i Code Page 932 = Shift JIS + MS/IBM vendor mappings diff --git a/ext/Encode/JP/Makefile.PL b/ext/Encode/JP/Makefile.PL index a3b19d4..a75685e 100644 --- a/ext/Encode/JP/Makefile.PL +++ b/ext/Encode/JP/Makefile.PL @@ -4,26 +4,26 @@ use ExtUtils::MakeMaker; use strict; my %tables = ( - euc_jp_t => ['euc-jp.ucm'], + euc_jp_t => ['euc-jp.ucm'], sjis_t => ['shiftjis.ucm', - 'macJapanese.ucm', - 'cp932.ucm'], - raw_t => [ - qw(jis0201.ucm jis0208.ucm jis0212.ucm) - ], + 'macJapanese.ucm', + 'cp932.ucm'], + raw_t => [ + qw(jis0201.ucm jis0208.ucm jis0212.ucm) + ], ); unless ($ENV{AGGREGATE_TABLES}){ my @ucm; for my $k (keys %tables){ - push @ucm, @{$tables{$k}}; + push @ucm, @{$tables{$k}}; } %tables = (); my $seq = 0; for my $ucm (sort @ucm){ - # 8.3 compliance ! - my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); - $tables{$t} = [ $ucm ]; + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; } } @@ -31,18 +31,18 @@ my $name = 'JP'; WriteMakefile( INC => "-I../Encode", - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', - ); + NAME => 'Encode::'.$name, + VERSION_FROM => "$name.pm", + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, + # OS 390 winges about line numbers > 64K ??? + XSOPT => '-nolinenumbers', + ); package MY; @@ -54,7 +54,7 @@ sub post_initialize # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } $o{"$name$x"} = 1; $self->{'O_FILES'} = [sort keys %o]; @@ -65,14 +65,14 @@ sub post_initialize $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; my %xs; foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); - } - $self->{SOURCE} .= " $table.c" - if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; + push (@{$self->{'C'}},"$table.c"); + # Do NOT add $table.h etc. to H_FILES unless we own up as to how they + # get built. + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { + push (@files,$table.$ext); + } + $self->{SOURCE} .= " $table.c" + if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; } $self->{'XS'} = { "$name.xs" => "$name.c" }; $self->{'clean'}{'FILES'} .= join(' ',@files); @@ -85,7 +85,7 @@ sub post_initialize #include "encode.h" END foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + print XS qq[#include "${table}.h"\n]; } print XS <<"END"; @@ -114,7 +114,7 @@ BOOT: { END foreach my $table (keys %tables) { - print XS qq[#include "${table}.exh"\n]; + print XS qq[#include "${table}.exh"\n]; } print XS "}\n"; close(XS); @@ -129,7 +129,7 @@ sub postamble $str .= "$name.c : $name.xs "; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; @@ -137,33 +137,33 @@ sub postamble my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : $enc2xs Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; - my $ucopts = '-"Q"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + $str .= "$table.c : $enc2xs Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; + my $ucopts = '-"Q"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/KR/KR.pm b/ext/Encode/KR/KR.pm index e9d4073..4a7ea72 100644 --- a/ext/Encode/KR/KR.pm +++ b/ext/Encode/KR/KR.pm @@ -1,14 +1,15 @@ package Encode::KR; + BEGIN { - if (ord("A") == 193) { - die "Encode::KR not supported on EBCDIC\n"; + if ( ord("A") == 193 ) { + die "Encode::KR not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode; use XSLoader; -XSLoader::load(__PACKAGE__,$VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); use Encode::KR::2022_KR; @@ -34,7 +35,7 @@ are as follows. Canonical Alias Description -------------------------------------------------------------------- euc-kr /\beuc.*kr$/i EUC (Extended Unix Character) - /\bkr.*euc$/i + /\bkr.*euc$/i ksc5601-raw Korean standard code set (as is) cp949 /(?:x-)?uhc$/i /(?:x-)?windows-949$/i diff --git a/ext/Encode/KR/Makefile.PL b/ext/Encode/KR/Makefile.PL index 525e064..e95d039 100644 --- a/ext/Encode/KR/Makefile.PL +++ b/ext/Encode/KR/Makefile.PL @@ -4,24 +4,24 @@ use ExtUtils::MakeMaker; use strict; my %tables = (euc_kr_t => ['euc-kr.ucm', - 'macKorean.ucm', - 'cp949.ucm', - ], - '5601_t' => ['ksc5601.ucm'], - johab_t => ['johab.ucm'], + 'macKorean.ucm', + 'cp949.ucm', + ], + '5601_t' => ['ksc5601.ucm'], + johab_t => ['johab.ucm'], ); unless ($ENV{AGGREGATE_TABLES}){ my @ucm; for my $k (keys %tables){ - push @ucm, @{$tables{$k}}; + push @ucm, @{$tables{$k}}; } %tables = (); my $seq = 0; for my $ucm (sort @ucm){ - # 8.3 compliance ! - my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); - $tables{$t} = [ $ucm ]; + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; } } @@ -29,18 +29,18 @@ my $name = 'KR'; WriteMakefile( INC => "-I../Encode", - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', - ); + NAME => 'Encode::'.$name, + VERSION_FROM => "$name.pm", + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, + # OS 390 winges about line numbers > 64K ??? + XSOPT => '-nolinenumbers', + ); package MY; @@ -52,7 +52,7 @@ sub post_initialize # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } $o{"$name$x"} = 1; $self->{'O_FILES'} = [sort keys %o]; @@ -63,14 +63,14 @@ sub post_initialize $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; my %xs; foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); - } - $self->{SOURCE} .= " $table.c" - if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; + push (@{$self->{'C'}},"$table.c"); + # Do NOT add $table.h etc. to H_FILES unless we own up as to how they + # get built. + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { + push (@files,$table.$ext); + } + $self->{SOURCE} .= " $table.c" + if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; } $self->{'XS'} = { "$name.xs" => "$name.c" }; $self->{'clean'}{'FILES'} .= join(' ',@files); @@ -83,7 +83,7 @@ sub post_initialize #include "encode.h" END foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + print XS qq[#include "${table}.h"\n]; } print XS <<"END"; @@ -112,7 +112,7 @@ BOOT: { END foreach my $table (keys %tables) { - print XS qq[#include "${table}.exh"\n]; + print XS qq[#include "${table}.exh"\n]; } print XS "}\n"; close(XS); @@ -127,7 +127,7 @@ sub postamble $str .= "$name.c : $name.xs "; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; @@ -135,33 +135,33 @@ sub postamble my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : $enc2xs Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; - my $ucopts = '-"Q"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + $str .= "$table.c : $enc2xs Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; + my $ucopts = '-"Q"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL index d4049ca..981dba6 100644 --- a/ext/Encode/Makefile.PL +++ b/ext/Encode/Makefile.PL @@ -9,39 +9,39 @@ $ENV{PERL_CORE} ||= $ARGV{PERL_CORE}; my %tables = ( def_t => ['ascii.ucm', - '8859-1.ucm', - 'null.ucm', - 'ctrl.ucm', - ] + '8859-1.ucm', + 'null.ucm', + 'ctrl.ucm', + ] ); my @exe_files = qw(bin/enc2xs - bin/piconv - ); + bin/piconv + ); my @more_exe_files = qw( - unidump - ); + unidump + ); my @pmlibdirs = qw(lib Encode); $ARGV{MORE_SCRIOPTS} and push @exe_files, @more_exe_files; $ARGV{INSTALL_UCM} and push @pmlibdirs, "ucm"; WriteMakefile( - NAME => "Encode", - EXE_FILES => \@exe_files, - VERSION_FROM => 'Encode.pm', - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN1PODS => {}, - MAN3PODS => {}, - INC => "-I./Encode", - PMLIBDIRS => \@pmlibdirs, - INSTALLDIRS => 'perl', - ); + NAME => "Encode", + EXE_FILES => \@exe_files, + VERSION_FROM => 'Encode.pm', + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN1PODS => {}, + MAN3PODS => {}, + INC => "-I./Encode", + PMLIBDIRS => \@pmlibdirs, + INSTALLDIRS => 'perl', + ); package MY; @@ -53,13 +53,13 @@ sub post_initialize # Find existing O_FILES foreach my $f (@{$self->{'O_FILES'}}) { - $o{$f} = 1; + $o{$f} = 1; } my $x = $self->{'OBJ_EXT'}; # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } # Trick case-blind filesystems. delete $o{'encode'.$x}; @@ -69,12 +69,12 @@ sub post_initialize my @files; foreach my $table (keys %tables) { - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); + push (@files,$table.$ext); } $self->{SOURCE} .= " $table.c" - if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; + if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; } $self->{'clean'}{'FILES'} .= join(' ',@files); return ''; @@ -90,39 +90,39 @@ sub postamble $str .= ' Encode.c'; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - my $enc2xs = $self->catfile('bin', 'enc2xs'); - $str .= "$table.c : $enc2xs Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; - my $ucopts = '-"Q" -"O"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + my $enc2xs = $self->catfile('bin', 'enc2xs'); + $str .= "$table.c : $enc2xs Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; + my $ucopts = '-"Q" -"O"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/Symbol/Makefile.PL b/ext/Encode/Symbol/Makefile.PL index 045cc16..23ca1f4 100644 --- a/ext/Encode/Symbol/Makefile.PL +++ b/ext/Encode/Symbol/Makefile.PL @@ -4,31 +4,31 @@ use ExtUtils::MakeMaker; my $name = 'Symbol'; my %tables = ( - symbol_t => [qw( - symbol.ucm - dingbats.ucm - adobeSymbol.ucm - adobeZdingbat.ucm - macSymbol.ucm - macDingbats.ucm - ) - ], - ); + symbol_t => [qw( + symbol.ucm + dingbats.ucm + adobeSymbol.ucm + adobeZdingbat.ucm + macSymbol.ucm + macDingbats.ucm + ) + ], + ); WriteMakefile( INC => "-I../Encode", - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', - ); + NAME => 'Encode::'.$name, + VERSION_FROM => "$name.pm", + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, + # OS 390 winges about line numbers > 64K ??? + XSOPT => '-nolinenumbers', + ); package MY; @@ -40,7 +40,7 @@ sub post_initialize # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } $o{"$name$x"} = 1; $self->{'O_FILES'} = [sort keys %o]; @@ -51,14 +51,14 @@ sub post_initialize $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; my %xs; foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); - } - $self->{SOURCE} .= " $table.c" - if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; + push (@{$self->{'C'}},"$table.c"); + # Do NOT add $table.h etc. to H_FILES unless we own up as to how they + # get built. + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { + push (@files,$table.$ext); + } + $self->{SOURCE} .= " $table.c" + if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; } $self->{'XS'} = { "$name.xs" => "$name.c" }; $self->{'clean'}{'FILES'} .= join(' ',@files); @@ -71,7 +71,7 @@ sub post_initialize #include "encode.h" END foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + print XS qq[#include "${table}.h"\n]; } print XS <<"END"; @@ -100,7 +100,7 @@ BOOT: { END foreach my $table (keys %tables) { - print XS qq[#include "${table}.exh"\n]; + print XS qq[#include "${table}.exh"\n]; } print XS "}\n"; close(XS); @@ -115,7 +115,7 @@ sub postamble $str .= "$name.c : $name.xs "; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; @@ -123,33 +123,33 @@ sub postamble my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : $enc2xs Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; - my $ucopts = '-"Q" -"O"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + $str .= "$table.c : $enc2xs Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; + my $ucopts = '-"Q" -"O"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/Symbol/Symbol.pm b/ext/Encode/Symbol/Symbol.pm index 7ad8ca9..e617bd4 100644 --- a/ext/Encode/Symbol/Symbol.pm +++ b/ext/Encode/Symbol/Symbol.pm @@ -1,9 +1,9 @@ package Encode::Symbol; use Encode; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; -XSLoader::load(__PACKAGE__,$VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); 1; __END__ diff --git a/ext/Encode/TW/Makefile.PL b/ext/Encode/TW/Makefile.PL index 15efd03..99c94bf 100644 --- a/ext/Encode/TW/Makefile.PL +++ b/ext/Encode/TW/Makefile.PL @@ -4,22 +4,22 @@ use ExtUtils::MakeMaker; use strict; my %tables = (big5_t => ['big5-eten.ucm', - 'big5-hkscs.ucm', - 'macChintrad.ucm', - 'cp950.ucm'], + 'big5-hkscs.ucm', + 'macChintrad.ucm', + 'cp950.ucm'], ); unless ($ENV{AGGREGATE_TABLES}){ my @ucm; for my $k (keys %tables){ - push @ucm, @{$tables{$k}}; + push @ucm, @{$tables{$k}}; } %tables = (); my $seq = 0; for my $ucm (sort @ucm){ - # 8.3 compliance ! - my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); - $tables{$t} = [ $ucm ]; + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; } } @@ -27,18 +27,18 @@ my $name = 'TW'; WriteMakefile( INC => "-I../Encode", - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', - ); + NAME => 'Encode::'.$name, + VERSION_FROM => "$name.pm", + OBJECT => '$(O_FILES)', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, + # OS 390 winges about line numbers > 64K ??? + XSOPT => '-nolinenumbers', + ); package MY; @@ -50,7 +50,7 @@ sub post_initialize # Add the table O_FILES foreach my $e (keys %tables) { - $o{$e.$x} = 1; + $o{$e.$x} = 1; } $o{"$name$x"} = 1; $self->{'O_FILES'} = [sort keys %o]; @@ -61,14 +61,14 @@ sub post_initialize $self->{'H'} = [$self->catfile($self->updir,'Encode', 'encode.h')]; my %xs; foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { - push (@files,$table.$ext); - } - $self->{SOURCE} .= " $table.c" - if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; + push (@{$self->{'C'}},"$table.c"); + # Do NOT add $table.h etc. to H_FILES unless we own up as to how they + # get built. + foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) { + push (@files,$table.$ext); + } + $self->{SOURCE} .= " $table.c" + if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$table\.c\b/; } $self->{'XS'} = { "$name.xs" => "$name.c" }; $self->{'clean'}{'FILES'} .= join(' ',@files); @@ -81,7 +81,7 @@ sub post_initialize #include "encode.h" END foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + print XS qq[#include "${table}.h"\n]; } print XS <<"END"; @@ -110,7 +110,7 @@ BOOT: { END foreach my $table (keys %tables) { - print XS qq[#include "${table}.exh"\n]; + print XS qq[#include "${table}.exh"\n]; } print XS "}\n"; close(XS); @@ -125,7 +125,7 @@ sub postamble $str .= "$name.c : $name.xs "; foreach my $table (keys %tables) { - $str .= " $table.c"; + $str .= " $table.c"; } $str .= "\n\n"; $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; @@ -133,33 +133,33 @@ sub postamble my $enc2xs = $self->catfile($self->updir,'bin', 'enc2xs'); foreach my $table (keys %tables) { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : $enc2xs Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; - $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; - my $ucopts = '-"Q"'; - $str .= - qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; - } - close(FILELIST); + my $numlines = 1; + my $lengthsofar = length($str); + my $continuator = ''; + $str .= "$table.c : $enc2xs Makefile.PL"; + foreach my $file (@{$tables{$table}}) + { + $str .= $continuator.' '.$self->catfile($dir,$file); + if ( length($str)-$lengthsofar > 128*$numlines ) + { + $continuator .= " \\\n\t"; + $numlines++; + } else { + $continuator = ''; + } + } + my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : ''; + $plib .= " -MCross=$::Cross::platform" if defined $::Cross::platform; + my $ucopts = '-"Q"'; + $str .= + qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n}; + open (FILELIST, ">$table.fnm") + || die "Could not open $table.fnm: $!"; + foreach my $file (@{$tables{$table}}) + { + print FILELIST $self->catfile($dir,$file) . "\n"; + } + close(FILELIST); } return $str; } diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm index 2e1abc0..236da36 100644 --- a/ext/Encode/TW/TW.pm +++ b/ext/Encode/TW/TW.pm @@ -1,14 +1,15 @@ package Encode::TW; + BEGIN { - if (ord("A") == 193) { - die "Encode::TW not supported on EBCDIC\n"; + if ( ord("A") == 193 ) { + die "Encode::TW not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode; use XSLoader; -XSLoader::load(__PACKAGE__,$VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); 1; __END__ @@ -32,8 +33,8 @@ Encodings supported are as follows. Canonical Alias Description -------------------------------------------------------------------- big5-eten /\bbig-?5$/i Big5 encoding (with ETen extensions) - /\bbig5-?et(en)?$/i - /\btca-?big5$/i + /\bbig5-?et(en)?$/i + /\btca-?big5$/i big5-hkscs /\bbig5-?hk(scs)?$/i /\bhk(scs)?-?big5$/i Big5 + Cantonese characters in Hong Kong diff --git a/ext/Encode/Unicode/Makefile.PL b/ext/Encode/Unicode/Makefile.PL index d2dfdff..ce48b7a 100644 --- a/ext/Encode/Unicode/Makefile.PL +++ b/ext/Encode/Unicode/Makefile.PL @@ -4,8 +4,8 @@ use ExtUtils::MakeMaker; WriteMakefile( INC => "-I../Encode", - NAME => 'Encode::Unicode', - VERSION_FROM => "Unicode.pm", - MAN3PODS => {}, - ); + NAME => 'Encode::Unicode', + VERSION_FROM => "Unicode.pm", + MAN3PODS => {}, + ); diff --git a/ext/Encode/Unicode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm index 4d0c31d..9a11d81 100644 --- a/ext/Encode/Unicode/Unicode.pm +++ b/ext/Encode/Unicode/Unicode.pm @@ -4,10 +4,10 @@ use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; -XSLoader::load(__PACKAGE__,$VERSION); +XSLoader::load( __PACKAGE__, $VERSION ); # # Object Generator 8 transcoders all at once! @@ -15,38 +15,40 @@ XSLoader::load(__PACKAGE__,$VERSION); require Encode; -our %BOM_Unknown = map {$_ => 1} qw(UTF-16 UTF-32); +our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32); -for my $name (qw(UTF-16 UTF-16BE UTF-16LE - UTF-32 UTF-32BE UTF-32LE - UCS-2BE UCS-2LE)) +for my $name ( + qw(UTF-16 UTF-16BE UTF-16LE + UTF-32 UTF-32BE UTF-32LE + UCS-2BE UCS-2LE) + ) { - my ($size, $endian, $ucs2, $mask); + my ( $size, $endian, $ucs2, $mask ); $name =~ /^(\w+)-(\d+)(\w*)$/o; - if ($ucs2 = ($1 eq 'UCS')){ - $size = 2; - }else{ - $size = $2/8; + if ( $ucs2 = ( $1 eq 'UCS' ) ) { + $size = 2; } - $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ; + else { + $size = $2 / 8; + } + $endian = ( $3 eq 'BE' ) ? 'n' : ( $3 eq 'LE' ) ? 'v' : ''; $size == 4 and $endian = uc($endian); - $Encode::Encoding{$name} = - bless { - Name => $name, - size => $size, - endian => $endian, - ucs2 => $ucs2, - } => __PACKAGE__; + $Encode::Encoding{$name} = bless { + Name => $name, + size => $size, + endian => $endian, + ucs2 => $ucs2, + } => __PACKAGE__; } use base qw(Encode::Encoding); -sub renew { +sub renew { my $self = shift; - $BOM_Unknown{$self->name} or return $self; - my $clone = bless { %$self } => ref($self); - $clone->{renewed}++; # so the caller knows it is renewed. + $BOM_Unknown{ $self->name } or return $self; + my $clone = bless {%$self} => ref($self); + $clone->{renewed}++; # so the caller knows it is renewed. return $clone; } diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs index 94404c6..9efead6 100644 --- a/ext/Encode/Unicode/Unicode.xs +++ b/ext/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 2.2 2006/04/06 15:44:11 dankogai Exp dankogai $ + $Id: Unicode.xs,v 2.3 2006/05/03 18:24:10 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -24,28 +24,28 @@ enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) U8 *s = *sp; UV v = 0; if (s+size > e) { - croak("Partial character %c",(char) endian); + croak("Partial character %c",(char) endian); } switch(endian) { case 'N': - v = *s++; - v = (v << 8) | *s++; + v = *s++; + v = (v << 8) | *s++; case 'n': - v = (v << 8) | *s++; - v = (v << 8) | *s++; - break; + v = (v << 8) | *s++; + v = (v << 8) | *s++; + break; case 'V': case 'v': - v |= *s++; - v |= (*s++ << 8); - if (endian == 'v') - break; - v |= (*s++ << 16); - v |= (*s++ << 24); - break; + v |= *s++; + v |= (*s++ << 8); + if (endian == 'v') + break; + v |= (*s++ << 16); + v |= (*s++ << 24); + break; default: - croak("Unknown endian %c",(char) endian); - break; + croak("Unknown endian %c",(char) endian); + break; } *sp = s; return v; @@ -58,25 +58,25 @@ enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) switch(endian) { case 'v': case 'V': - d += SvCUR(result); - SvCUR_set(result,SvCUR(result)+size); - while (size--) { - *d++ = (U8)(value & 0xFF); - value >>= 8; - } - break; + d += SvCUR(result); + SvCUR_set(result,SvCUR(result)+size); + while (size--) { + *d++ = (U8)(value & 0xFF); + value >>= 8; + } + break; case 'n': case 'N': - SvCUR_set(result,SvCUR(result)+size); - d += SvCUR(result); - while (size--) { - *--d = (U8)(value & 0xFF); - value >>= 8; - } - break; + SvCUR_set(result,SvCUR(result)+size); + d += SvCUR(result); + while (size--) { + *--d = (U8)(value & 0xFF); + value >>= 8; + } + break; default: - croak("Unknown endian %c",(char) endian); - break; + croak("Unknown endian %c",(char) endian); + break; } } @@ -106,111 +106,111 @@ CODE: SvUTF8_on(result); if (!endian && s+size <= e) { - UV bom; - endian = (size == 4) ? 'N' : 'n'; - bom = enc_unpack(aTHX_ &s,e,size,endian); + UV bom; + endian = (size == 4) ? 'N' : 'n'; + bom = enc_unpack(aTHX_ &s,e,size,endian); if (bom != BOM_BE) { - if (bom == BOM16LE) { - endian = 'v'; - } - else if (bom == BOM32LE) { - endian = 'V'; - } - else { - croak("%"SVf":Unrecognised BOM %"UVxf, + if (bom == BOM16LE) { + endian = 'v'; + } + else if (bom == BOM32LE) { + endian = 'V'; + } + else { + croak("%"SVf":Unrecognised BOM %"UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), - bom); - } - } + bom); + } + } #if 1 - /* Update endian for next sequence */ - if (renewed) { - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); - } + /* Update endian for next sequence */ + if (renewed) { + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + } #endif } while (s < e && s+size <= e) { - UV ord = enc_unpack(aTHX_ &s,e,size,endian); - U8 *d; - if (issurrogate(ord)) { - if (ucs2 || size == 4) { - if (check) { - croak("%"SVf":no surrogates allowed %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } - if (s+size <= e) { + UV ord = enc_unpack(aTHX_ &s,e,size,endian); + U8 *d; + if (issurrogate(ord)) { + if (ucs2 || size == 4) { + if (check) { + croak("%"SVf":no surrogates allowed %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } + if (s+size <= e) { /* skip the next one as well */ - enc_unpack(aTHX_ &s,e,size,endian); - } - ord = FBCHAR; - } - else { - UV lo; - if (!isHiSurrogate(ord)) { - if (check) { - croak("%"SVf":Malformed HI surrogate %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } - else { - ord = FBCHAR; - } - } - else { - if (s+size > e) { - /* Partial character */ - s -= size; /* back up to 1st half */ - break; /* And exit loop */ - } - lo = enc_unpack(aTHX_ &s,e,size,endian); - if (!isLoSurrogate(lo)){ - if (check) { - croak("%"SVf":Malformed LO surrogate %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } - else { - ord = FBCHAR; - } - } - else { - ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); - } - } - } - } + enc_unpack(aTHX_ &s,e,size,endian); + } + ord = FBCHAR; + } + else { + UV lo; + if (!isHiSurrogate(ord)) { + if (check) { + croak("%"SVf":Malformed HI surrogate %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } + else { + ord = FBCHAR; + } + } + else { + if (s+size > e) { + /* Partial character */ + s -= size; /* back up to 1st half */ + break; /* And exit loop */ + } + lo = enc_unpack(aTHX_ &s,e,size,endian); + if (!isLoSurrogate(lo)){ + if (check) { + croak("%"SVf":Malformed LO surrogate %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } + else { + ord = FBCHAR; + } + } + else { + ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); + } + } + } + } - if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { - if (check) { - croak("%"SVf":Unicode character %"UVxf" is illegal", - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } else { - ord = FBCHAR; - } - } + if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { + if (check) { + croak("%"SVf":Unicode character %"UVxf" is illegal", + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } else { + ord = FBCHAR; + } + } - d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); - d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); - SvCUR_set(result,d - (U8 *)SvPVX(result)); + d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); + d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); + SvCUR_set(result,d - (U8 *)SvPVX(result)); } if (s < e) { - /* unlikely to happen because it's fixed-length -- dankogai */ - if (check & ENCODE_WARN_ON_ERR){ - Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", - *hv_fetch((HV *)SvRV(obj),"Name",4,0)); - } + /* unlikely to happen because it's fixed-length -- dankogai */ + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", + *hv_fetch((HV *)SvRV(obj),"Name",4,0)); + } } if (check && !(check & ENCODE_LEAVE_SRC)){ - if (s < e) { - Move(s,SvPVX(str),e-s,U8); - SvCUR_set(str,(e-s)); - } - else { - SvCUR_set(str,0); - } - *SvEND(str) = '\0'; + if (s < e) { + Move(s,SvPVX(str),e-s,U8); + SvCUR_set(str,(e-s)); + } + else { + SvCUR_set(str,0); + } + *SvEND(str) = '\0'; } XSRETURN(1); } @@ -232,64 +232,64 @@ CODE: U8 *e = (U8 *)SvEND(utf8); ST(0) = sv_2mortal(result); if (!endian) { - endian = (size == 4) ? 'N' : 'n'; - enc_pack(aTHX_ result,size,endian,BOM_BE); + endian = (size == 4) ? 'N' : 'n'; + enc_pack(aTHX_ result,size,endian,BOM_BE); #if 1 - /* Update endian for next sequence */ - if (renewed){ - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); - } + /* Update endian for next sequence */ + if (renewed){ + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + } #endif } while (s < e && s+UTF8SKIP(s) <= e) { - STRLEN len; - UV ord = utf8n_to_uvuni(s, e-s, &len, 0); + STRLEN len; + UV ord = utf8n_to_uvuni(s, e-s, &len, 0); s += len; - if (size != 4 && invalid_ucs2(ord)) { - if (!issurrogate(ord)){ - if (ucs2) { - if (check) { - croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", - *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); - } - enc_pack(aTHX_ result,size,endian,FBCHAR); - }else{ - UV hi = ((ord - 0x10000) >> 10) + 0xD800; - UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; - enc_pack(aTHX_ result,size,endian,hi); - enc_pack(aTHX_ result,size,endian,lo); - } - } - else { - /* not supposed to happen */ - enc_pack(aTHX_ result,size,endian,FBCHAR); - } - } - else { - enc_pack(aTHX_ result,size,endian,ord); - } + if (size != 4 && invalid_ucs2(ord)) { + if (!issurrogate(ord)){ + if (ucs2) { + if (check) { + croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", + *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); + } + enc_pack(aTHX_ result,size,endian,FBCHAR); + }else{ + UV hi = ((ord - 0x10000) >> 10) + 0xD800; + UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; + enc_pack(aTHX_ result,size,endian,hi); + enc_pack(aTHX_ result,size,endian,lo); + } + } + else { + /* not supposed to happen */ + enc_pack(aTHX_ result,size,endian,FBCHAR); + } + } + else { + enc_pack(aTHX_ result,size,endian,ord); + } } if (s < e) { - /* UTF-8 partial char happens often on PerlIO. - Since this is okay and normal, we do not warn. - But this is critical when you choose to LEAVE_SRC - in which case we die */ - if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){ - Perl_croak(aTHX_ "%"SVf":partial character is not allowed " - "when CHECK = 0x%" UVuf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); - } - + /* UTF-8 partial char happens often on PerlIO. + Since this is okay and normal, we do not warn. + But this is critical when you choose to LEAVE_SRC + in which case we die */ + if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){ + Perl_croak(aTHX_ "%"SVf":partial character is not allowed " + "when CHECK = 0x%" UVuf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); + } + } if (check && !(check & ENCODE_LEAVE_SRC)){ - if (s < e) { - Move(s,SvPVX(utf8),e-s,U8); - SvCUR_set(utf8,(e-s)); - } - else { - SvCUR_set(utf8,0); - } - *SvEND(utf8) = '\0'; + if (s < e) { + Move(s,SvPVX(utf8),e-s,U8); + SvCUR_set(utf8,(e-s)); + } + else { + SvCUR_set(utf8,0); + } + *SvEND(utf8) = '\0'; } XSRETURN(1); } diff --git a/ext/Encode/bin/enc2xs b/ext/Encode/bin/enc2xs index 7930ece..6ca0efe 100644 --- a/ext/Encode/bin/enc2xs +++ b/ext/Encode/bin/enc2xs @@ -9,7 +9,7 @@ use strict; use warnings; use Getopt::Std; my @orig_ARGV = @ARGV; -our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.3 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # These may get re-ordered. # RAW is a do_now as inserted by &enter @@ -279,13 +279,13 @@ if ($doC) my $sym = "${enc}_encoding"; $sym =~ s/\W+/_/g; my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen, - $min_el,$max_el); + $min_el,$max_el); print C "static const U8 ${sym}_rep_character[] = \"$rep\";\n"; print C "static const char ${sym}_enc_name[] = \"$enc\";\n\n"; print C "const encode_t $sym = \n"; # This is to make null encoding work -- dankogai for (my $i = (scalar @info) - 1; $i >= 0; --$i){ - $info[$i] ||= 1; + $info[$i] ||= 1; } # end of null tweak -- dankogai print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n"; @@ -714,38 +714,38 @@ sub outbigstring $strings_in_acc{$s} = $index; } else { OPTIMISER: { - if ($opt{'O'}) { - my $sublength = length $s; - while (--$sublength > 0) { - # progressively lop characters off the end, to see if the start of - # the new string overlaps the end of the accumulator. - if (substr ($string_acc, -$sublength) - eq substr ($s, 0, $sublength)) { - $subsave += $sublength; - $strings_in_acc{$s} = length ($string_acc) - $sublength; - # append the last bit on the end. - $string_acc .= substr ($s, $sublength); - last OPTIMISER; - } - # or if the end of the new string overlaps the start of the - # accumulator - next unless substr ($string_acc, 0, $sublength) - eq substr ($s, -$sublength); - # well, the last $sublength characters of the accumulator match. - # so as we're prepending to the accumulator, need to shift all our - # existing offsets forwards - $_ += $sublength foreach values %strings_in_acc; - $subsave += $sublength; - $strings_in_acc{$s} = 0; - # append the first bit on the start. - $string_acc = substr ($s, 0, -$sublength) . $string_acc; - last OPTIMISER; - } - } - # Optimiser (if it ran) found nothing, so just going have to tack the - # whole thing on the end. - $strings_in_acc{$s} = length $string_acc; - $string_acc .= $s; + if ($opt{'O'}) { + my $sublength = length $s; + while (--$sublength > 0) { + # progressively lop characters off the end, to see if the start of + # the new string overlaps the end of the accumulator. + if (substr ($string_acc, -$sublength) + eq substr ($s, 0, $sublength)) { + $subsave += $sublength; + $strings_in_acc{$s} = length ($string_acc) - $sublength; + # append the last bit on the end. + $string_acc .= substr ($s, $sublength); + last OPTIMISER; + } + # or if the end of the new string overlaps the start of the + # accumulator + next unless substr ($string_acc, 0, $sublength) + eq substr ($s, -$sublength); + # well, the last $sublength characters of the accumulator match. + # so as we're prepending to the accumulator, need to shift all our + # existing offsets forwards + $_ += $sublength foreach values %strings_in_acc; + $subsave += $sublength; + $strings_in_acc{$s} = 0; + # append the first bit on the start. + $string_acc = substr ($s, 0, -$sublength) . $string_acc; + last OPTIMISER; + } + } + # Optimiser (if it ran) found nothing, so just going have to tack the + # whole thing on the end. + $strings_in_acc{$s} = length $string_acc; + $string_acc .= $s; }; } } @@ -913,25 +913,25 @@ sub find_e2x{ eval { require File::Find; }; my (@inc, %e2x_dir); for my $inc (@INC){ - push @inc, $inc unless $inc eq '.'; #skip current dir + push @inc, $inc unless $inc eq '.'; #skip current dir } File::Find::find( - sub { - my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, - $atime,$mtime,$ctime,$blksize,$blocks) - = lstat($_) or return; - -f _ or return; - if (/^.*\.e2x$/o){ - no warnings 'once'; - $e2x_dir{$File::Find::dir} ||= $mtime; - } - return; - }, @inc); + sub { + my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, + $atime,$mtime,$ctime,$blksize,$blocks) + = lstat($_) or return; + -f _ or return; + if (/^.*\.e2x$/o){ + no warnings 'once'; + $e2x_dir{$File::Find::dir} ||= $mtime; + } + return; + }, @inc); warn join("\n", keys %e2x_dir), "\n"; for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){ - $_E2X = $d; - # warn "$_E2X => ", scalar localtime($e2x_dir{$d}); - return $_E2X; + $_E2X = $d; + # warn "$_E2X => ", scalar localtime($e2x_dir{$d}); + return $_E2X; } } @@ -957,9 +957,9 @@ sub make_makefile_pl } use vars qw( - $_ModLines - $_LocalVer - ); + $_ModLines + $_LocalVer + ); sub make_configlocal_pm { @@ -970,38 +970,38 @@ sub make_configlocal_pm my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8'); my %LocalMod = (); for my $d (@INC){ - my $inc = File::Spec->catfile($d, "Encode"); - -d $inc or next; - opendir my $dh, $inc or die "$inc:$!"; - warn "Checking $inc...\n"; - for my $f (grep /\.pm$/o, readdir($dh)){ - -f File::Spec->catfile($inc, "$f") or next; - $INC{"Encode/$f"} and next; - warn "require Encode/$f;\n"; - eval { require "Encode/$f"; }; - $@ and die "Can't require Encode/$f: $@\n"; - for my $enc (Encode->encodings()){ - no warnings 'once'; - $in_core{$enc} and next; - $Encode::Config::ExtModule{$enc} and next; - my $mod = "Encode/$f"; - $mod =~ s/\.pm$//o; $mod =~ s,/,::,og; - $LocalMod{$enc} ||= $mod; - } - } + my $inc = File::Spec->catfile($d, "Encode"); + -d $inc or next; + opendir my $dh, $inc or die "$inc:$!"; + warn "Checking $inc...\n"; + for my $f (grep /\.pm$/o, readdir($dh)){ + -f File::Spec->catfile($inc, "$f") or next; + $INC{"Encode/$f"} and next; + warn "require Encode/$f;\n"; + eval { require "Encode/$f"; }; + $@ and die "Can't require Encode/$f: $@\n"; + for my $enc (Encode->encodings()){ + no warnings 'once'; + $in_core{$enc} and next; + $Encode::Config::ExtModule{$enc} and next; + my $mod = "Encode/$f"; + $mod =~ s/\.pm$//o; $mod =~ s,/,::,og; + $LocalMod{$enc} ||= $mod; + } + } } $_ModLines = ""; for my $enc (sort keys %LocalMod){ - $_ModLines .= - qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n); + $_ModLines .= + qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n); } warn $_ModLines; $_LocalVer = _mkversion(); $_E2X = find_e2x(); $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o; _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"), - File::Spec->catfile($_Inc,"ConfigLocal.pm"), - 1); + File::Spec->catfile($_Inc,"ConfigLocal.pm"), + 1); exit; } @@ -1017,22 +1017,22 @@ sub _print_expand{ File::Basename->import(); my ($src, $dst, $clobber) = @_; if (!$clobber and -e $dst){ - warn "$dst exists. skipping\n"; - return; + warn "$dst exists. skipping\n"; + return; } warn "Generating $dst...\n"; open my $in, $src or die "$src : $!"; if ((my $d = dirname($dst)) ne '.'){ - -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; + -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; } open my $out, ">$dst" or die "$!"; my $asis = 0; while (<$in>){ - if (/^#### END_OF_HEADER/){ - $asis = 1; next; - } - s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis; - print $out $_; + if (/^#### END_OF_HEADER/){ + $asis = 1; next; + } + s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis; + print $out $_; } } __END__ diff --git a/ext/Encode/bin/piconv b/ext/Encode/bin/piconv index cb0c236..0a2f6f9 100644 --- a/ext/Encode/bin/piconv +++ b/ext/Encode/bin/piconv @@ -1,5 +1,5 @@ #!./perl -# $Id: piconv,v 2.1 2004/10/06 05:07:20 dankogai Exp $ +# $Id: piconv,v 2.2 2006/05/03 18:24:10 dankogai Exp $ # use 5.8.0; use strict; @@ -17,18 +17,20 @@ my %Opt; help() unless GetOptions(\%Opt, - 'from|f=s', - 'to|t=s', - 'list|l', - 'string|s=s', - 'check|C=i', - 'c', - 'perlqq|p', - 'debug|D', - 'scheme|S=s', - 'resolve|r=s', - 'help', - ); + 'from|f=s', + 'to|t=s', + 'list|l', + 'string|s=s', + 'check|C=i', + 'c', + 'perlqq|p', + 'htmlcref', + 'xmlcref', + 'debug|D', + 'scheme|S=s', + 'resolve|r=s', + 'help', + ); $Opt{help} and help(); $Opt{list} and list_encodings(); @@ -40,7 +42,9 @@ my $to = $Opt{to} || $locale or help("to_encoding unspecified"); $Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit; my $scheme = exists $Scheme{$Opt{Scheme}} ? $Opt{Scheme} : 'from_to'; $Opt{check} ||= $Opt{c}; -$Opt{perlqq} and $Opt{check} = Encode::FB_PERLQQ; +$Opt{perlqq} and $Opt{check} = Encode::PERLQQ; +$Opt{htmlcref} and $Opt{check} = Encode::HTMLCREF; +$Opt{xmlcref} and $Opt{check} = Encode::XMLCREF; if ($Opt{debug}){ my $cfrom = Encode->getEncoding($from)->name; @@ -53,56 +57,61 @@ EOT } # we do not use <> (or ARGV) for the sake of binmode() -@ARGV or push @ARGV, \*STDIN; +@ARGV or push @ARGV, \*STDIN; -unless ($scheme eq 'perlio'){ +unless ( $scheme eq 'perlio' ) { binmode STDOUT; - for my $argv (@ARGV){ - my $ifh = ref $argv ? $argv : undef; - $ifh or open $ifh, "<", $argv or next; - binmode $ifh; - if ($scheme eq 'from_to'){ # default - while(<$ifh>){ - Encode::from_to($_, $from, $to, $Opt{check}); - print; - } - }elsif ($scheme eq 'decode_encode'){ # step-by-step - while(<$ifh>){ - my $decoded = decode($from, $_, $Opt{check}); - my $encoded = encode($to, $decoded); - print $encoded; - } - } else { # won't reach - die "$name: unknown scheme: $scheme"; - } + for my $argv (@ARGV) { + my $ifh = ref $argv ? $argv : undef; + $ifh or open $ifh, "<", $argv or next; + binmode $ifh; + if ( $scheme eq 'from_to' ) { # default + while (<$ifh>) { + Encode::from_to( $_, $from, $to, $Opt{check} ); + print; + } + } + elsif ( $scheme eq 'decode_encode' ) { # step-by-step + while (<$ifh>) { + my $decoded = decode( $from, $_, $Opt{check} ); + my $encoded = encode( $to, $decoded ); + print $encoded; + } + } + else { # won't reach + die "$name: unknown scheme: $scheme"; + } } -}else{ +} +else { + # NI-S favorite binmode STDOUT => "raw:encoding($to)"; - for my $argv (@ARGV){ - my $ifh = ref $argv ? $argv : undef; - $ifh or open $ifh, "<", $argv or next; - binmode $ifh => "raw:encoding($from)"; - print while(<$ifh>); + for my $argv (@ARGV) { + my $ifh = ref $argv ? $argv : undef; + $ifh or open $ifh, "<", $argv or next; + binmode $ifh => "raw:encoding($from)"; + print while (<$ifh>); } } -sub list_encodings{ - print join("\n", Encode->encodings(":all")), "\n"; +sub list_encodings { + print join( "\n", Encode->encodings(":all") ), "\n"; exit 0; } sub resolve_encoding { - if (my $alias = Encode::resolve_alias($_[0])) { - print $alias, "\n"; - exit 0; - } else { - warn "$name: $_[0] is not known to Encode\n"; - exit 1; + if ( my $alias = Encode::resolve_alias( $_[0] ) ) { + print $alias, "\n"; + exit 0; + } + else { + warn "$name: $_[0] is not known to Encode\n"; + exit 1; } } -sub help{ +sub help { my $message = shift; $message and print STDERR "$name error: $message\n"; print STDERR <<"EOT"; @@ -121,10 +130,14 @@ $name -r encoding_alias "string" will be the input instead of STDIN or files The following are mainly of interest to Encode hackers: -D,--debug show debug information - -C N | -c | -p check the validity of the input + -C N | -c check the validity of the input -S,--scheme scheme use the scheme for conversion +Those are handy when you can only see ascii characters: + -p,--perlqq + --htmlcref + --xmlcref EOT - exit; + exit; } __END__ @@ -195,7 +208,15 @@ Same as C<-C 1>. =item -p,--perlqq -Same as C<-C -1>. +=item --htmlcref + +=item --xmlcref + +Applies PERLQQ, HTMLCREF, XMLCREF, respectively. Try + + piconv -f utf8 -t ascii --perlqq + +To see what it does. =item -h,--help diff --git a/ext/Encode/bin/ucm2table b/ext/Encode/bin/ucm2table index 4207c7d..66e63fc 100644 --- a/ext/Encode/bin/ucm2table +++ b/ext/Encode/bin/ucm2table @@ -1,5 +1,5 @@ #!/usr/bin/perl -# $Id: ucm2table,v 2.0 2004/05/16 20:55:16 dankogai Exp $ +# $Id: ucm2table,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # use 5.006; @@ -11,23 +11,23 @@ my %Chartab; my $Hex = '[0-9A-Fa-f]'; while(<>){ - chomp; - my ($uni, $enc, $fb) = - /^\s+(\S+)\s+\|(\d)/o or next; - $fb eq '0' or next; - my @byte = (); - my $ord = 0; - while($enc =~ /\G\\x($Hex+)/iog){ - my $byte = hex($1); - push @byte, $byte; - $ord <<= 8; $ord += $byte; - }; - # print join('', @byte), " => $ord \n"; - if ($Opt{u}){ - $Chartab{$ord} = pack("U", hex($uni)); - }else{ - $Chartab{$ord} = pack("C*", @byte); - } + chomp; + my ($uni, $enc, $fb) = + /^\s+(\S+)\s+\|(\d)/o or next; + $fb eq '0' or next; + my @byte = (); + my $ord = 0; + while($enc =~ /\G\\x($Hex+)/iog){ + my $byte = hex($1); + push @byte, $byte; + $ord <<= 8; $ord += $byte; + }; + # print join('', @byte), " => $ord \n"; + if ($Opt{u}){ + $Chartab{$ord} = pack("U", hex($uni)); + }else{ + $Chartab{$ord} = pack("C*", @byte); + } } my $start = $Opt{a} ? 0x20 : 0xa0; @@ -35,10 +35,10 @@ my $start = $Opt{a} ? 0x20 : 0xa0; for (my $x = $start; $x <= 0xffff; $x += 32) { my $line = ''; for my $i (0..31){ - my $num = $x+$i; $num eq 0x7f and next; # skip delete - my $char = $Chartab{$num}; - $line .= !$char ? " " : - ($num < 0x7f ) ? " $char" : $char ; + my $num = $x+$i; $num eq 0x7f and next; # skip delete + my $char = $Chartab{$num}; + $line .= !$char ? " " : + ($num < 0x7f ) ? " $char" : $char ; } $line =~ /^\s+$/o and next; printf "0x%04x: $line\n", $x; diff --git a/ext/Encode/bin/ucmlint b/ext/Encode/bin/ucmlint index bc0ebf5..c5d755b 100644 --- a/ext/Encode/bin/ucmlint +++ b/ext/Encode/bin/ucmlint @@ -1,10 +1,10 @@ #!/usr/local/bin/perl # -# $Id: ucmlint,v 2.0 2004/05/16 20:55:16 dankogai Exp $ +# $Id: ucmlint,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # use strict; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Getopt::Std; our %Opt; @@ -39,11 +39,11 @@ sub nit($;$){ my ($msg, $level) = @_; my $lstr; if ($level == 2){ - $lstr = 'notice'; + $lstr = 'notice'; }elsif ($level == 1){ - $lstr = 'warning'; $nwarning++; + $lstr = 'warning'; $nwarning++; }else{ - $lstr = 'error'; $nerror++; + $lstr = 'error'; $nerror++; } print "$ARGV:$lstr in line $.: $msg\n"; } @@ -54,77 +54,77 @@ for $ARGV (@ARGV){ $in_charmap = $nerror = $nwarning = 0; $. = 0; while(){ - chomp; - s/\s*#.*$//o; /^$/ and next; - if ($_ eq "CHARMAP"){ - $in_charmap = 1; - for my $must (qw/code_set_name mb_cur_min mb_cur_max/){ - exists $Hdr{$must} or nit "<$must> nonexistent"; - } - $Hdr{mb_cur_min} > $Hdr{mb_cur_max} - and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)", - $Hdr{mb_cur_min},$Hdr{mb_cur_max}); - $in_charmap = 1; - next; - } - unless ($in_charmap){ - my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next; - $Opt{D} and warn "$hkey => $hvalue"; - if ($hkey eq "code_set_name"){ # name check - exists $Hdr{code_set_name} - and nit "Duplicate : $hkey"; - } - if ($hkey eq "code_set_alias"){ # alias check - $hvalue eq $Hdr{code_set_name} - and nit qq(alias "$hvalue" is already in ); - } - $Hdr{$hkey} = $hvalue; - }else{ - my $name = $Hdr{code_set_name}; - my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next; - $Opt{v} and nit $_, 2; - my $uni = uniparse($unistr); - my $enc = encparse($encstr); - $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb"; - $fb = $1; - $Opt{f} and $fb = 0; - unless ($fb == 1){ # check uni -> enc - if (exists $U2E{$uni}){ - nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1; - }else{ - $U2E{$uni} = $enc; - if ($Opt{e} and $fb != 3) { - my $e = hex2enc($enc); - my $u = hex2uni($uni); - my $eu = Encode::encode($name, $u); - $e eq $eu - or nit qq(encode('$name', $uni) != $enc); - } - } - } - unless ($fb == 3){ # check enc -> uni - if (exists $E2U{$enc}){ - nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1; - }else{ - $E2U{$enc} = $uni; - if ($Opt{e} and $fb != 1) { - my $e = hex2enc($enc); - my $u = hex2uni($uni); - $Opt{D} and warn "$uni, $enc"; - my $de = Encode::decode($name, $e); - $de eq $u - or nit qq(decode('$name', $enc) != $uni); - } - } - } - # warn "$uni, $enc, $fb"; - } + chomp; + s/\s*#.*$//o; /^$/ and next; + if ($_ eq "CHARMAP"){ + $in_charmap = 1; + for my $must (qw/code_set_name mb_cur_min mb_cur_max/){ + exists $Hdr{$must} or nit "<$must> nonexistent"; + } + $Hdr{mb_cur_min} > $Hdr{mb_cur_max} + and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)", + $Hdr{mb_cur_min},$Hdr{mb_cur_max}); + $in_charmap = 1; + next; + } + unless ($in_charmap){ + my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next; + $Opt{D} and warn "$hkey => $hvalue"; + if ($hkey eq "code_set_name"){ # name check + exists $Hdr{code_set_name} + and nit "Duplicate : $hkey"; + } + if ($hkey eq "code_set_alias"){ # alias check + $hvalue eq $Hdr{code_set_name} + and nit qq(alias "$hvalue" is already in ); + } + $Hdr{$hkey} = $hvalue; + }else{ + my $name = $Hdr{code_set_name}; + my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next; + $Opt{v} and nit $_, 2; + my $uni = uniparse($unistr); + my $enc = encparse($encstr); + $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb"; + $fb = $1; + $Opt{f} and $fb = 0; + unless ($fb == 1){ # check uni -> enc + if (exists $U2E{$uni}){ + nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1; + }else{ + $U2E{$uni} = $enc; + if ($Opt{e} and $fb != 3) { + my $e = hex2enc($enc); + my $u = hex2uni($uni); + my $eu = Encode::encode($name, $u); + $e eq $eu + or nit qq(encode('$name', $uni) != $enc); + } + } + } + unless ($fb == 3){ # check enc -> uni + if (exists $E2U{$enc}){ + nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1; + }else{ + $E2U{$enc} = $uni; + if ($Opt{e} and $fb != 1) { + my $e = hex2enc($enc); + my $u = hex2uni($uni); + $Opt{D} and warn "$uni, $enc"; + my $de = Encode::decode($name, $e); + $de eq $u + or nit qq(decode('$name', $enc) != $uni); + } + } + } + # warn "$uni, $enc, $fb"; + } } $in_charmap or nit "Where is CHARMAP?"; checkRT(); printf ("$ARGV: %s error%s found\n", - ($nerror == 0 ? 'no' : $nerror), - ($nerror > 1 ? 's' : '')); + ($nerror == 0 ? 'no' : $nerror), + ($nerror > 1 ? 's' : '')); } exit; @@ -138,14 +138,14 @@ sub hex2uni{ sub checkRT{ for my $uni (keys %E2U){ - my $enc = $U2E{$uni} or next; # okay - $E2U{$U2E{$uni}} eq $uni or - nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}"; + my $enc = $U2E{$uni} or next; # okay + $E2U{$U2E{$uni}} eq $uni or + nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}"; } for my $enc (keys %E2U){ - my $uni = $E2U{$enc} or next; # okay - $U2E{$E2U{$enc}} eq $enc or - nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}"; + my $uni = $E2U{$enc} or next; # okay + $U2E{$E2U{$enc}} eq $enc or + nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}"; } } @@ -155,8 +155,8 @@ sub uniparse{ my @u; push @u, $1 while($str =~ /\G/ig); for my $u (@u){ - $u =~ /^([0-9A-Za-z]+)$/o - or nit "malformed Unicode character: $u"; + $u =~ /^([0-9A-Za-z]+)$/o + or nit "malformed Unicode character: $u"; } return join(',', @u); } @@ -165,10 +165,10 @@ sub encparse{ my $str = shift; my @e; for my $e (split /\\x/io, $str){ - $e or next; # first \x - $e =~ /^([0-9A-Za-z]{1,2})$/io - or nit "Hex $e in $str is bogus"; - push @e, $1; + $e or next; # first \x + $e =~ /^([0-9A-Za-z]{1,2})$/io + or nit "Hex $e in $str is bogus"; + push @e, $1; } return join(',', @e); } diff --git a/ext/Encode/bin/ucmsort b/ext/Encode/bin/ucmsort index a67ee8e..3e037dc 100644 --- a/ext/Encode/bin/ucmsort +++ b/ext/Encode/bin/ucmsort @@ -1,6 +1,6 @@ #!/usr/local/bin/perl # -# $Id: ucmsort,v 2.1 2004/08/31 10:55:34 dankogai Exp $ +# $Id: ucmsort,v 2.2 2006/05/03 18:24:10 dankogai Exp $ # use strict; my @lines; @@ -8,11 +8,11 @@ my ($head, $tail); while (<>){ unless (m/^){ print $head; for (sort { hex($a->[0]) <=> hex($b->[0]) # Unicode descending order - or $a->[2] cmp $b->[2] # fallback descending order - or $a->[1] cmp $b->[1] # Encoding descending order + or $a->[2] cmp $b->[2] # fallback descending order + or $a->[1] cmp $b->[1] # Encoding descending order } @lines) { my $u = shift @$_; diff --git a/ext/Encode/bin/unidump b/ext/Encode/bin/unidump index a9484d0..ae0da30 100644 --- a/ext/Encode/bin/unidump +++ b/ext/Encode/bin/unidump @@ -28,22 +28,22 @@ sub do_perl{ my $string = shift; $Opt{P} and print "#!$^X -w\nprint\n"; unless ($string){ - while(<>){ - use utf8; - $linebuf .= Encode::decode($Opt{f}, $_); - while($linebuf){ - my $chr = render_p(substr($linebuf, 0, 1, '')); - length($outbuf) + length($chr) > $CPL and print_P(); - $outbuf .= $chr; - } - } - $outbuf and print print_P(";"); + while(<>){ + use utf8; + $linebuf .= Encode::decode($Opt{f}, $_); + while($linebuf){ + my $chr = render_p(substr($linebuf, 0, 1, '')); + length($outbuf) + length($chr) > $CPL and print_P(); + $outbuf .= $chr; + } + } + $outbuf and print print_P(";"); }else{ - while($string){ - my $chr = render_p(substr($string, 0, 1, '')); - length($outbuf) + length($chr) > $CPL and print_P(); - $outbuf .= $chr; - } + while($string){ + my $chr = render_p(substr($string, 0, 1, '')); + length($outbuf) + length($chr) > $CPL and print_P(); + $outbuf .= $chr; + } } $outbuf and print print_P(";"); exit; @@ -55,7 +55,7 @@ sub render_p{ $S2pstr{$chr} and return $S2pstr{$chr}; # \t\n... $chr =~ /[\x20-\x7e]/ and return $chr; # ascii, printable; my $fmt = ($chr =~ /[\x00-\x1f\x7F]/) ? - q(\x%x) : q(\x{%x}); + q(\x%x) : q(\x{%x}); return sprintf $fmt, ord($chr); } @@ -72,22 +72,22 @@ sub do_dump{ my $string = shift; !$Opt{p} and exists $Opt{H} and print_H(); unless ($string){ - while(<>){ - use utf8; - $linebuf .= Encode::decode($Opt{f}, $_); - while (length($linebuf) > $CPL){ - my $chunk = substr($linebuf, 0, $CPL, ''); - print_C($chunk, $linenum++); - $Opt{H} and $linenum % $Opt{H} == $CPL-1 and print_S(); - } - } - $linebuf and print_C($linebuf); + while(<>){ + use utf8; + $linebuf .= Encode::decode($Opt{f}, $_); + while (length($linebuf) > $CPL){ + my $chunk = substr($linebuf, 0, $CPL, ''); + print_C($chunk, $linenum++); + $Opt{H} and $linenum % $Opt{H} == $CPL-1 and print_S(); + } + } + $linebuf and print_C($linebuf); }else{ - while ($string){ - my $chunk = substr($string, 0, $CPL, ''); - print_C($chunk, $linenum++); - $Opt{H} and $linenum % $Opt{H} == $CPL-1 and print_S(); - } + while ($string){ + my $chunk = substr($string, 0, $CPL, ''); + print_C($chunk, $linenum++); + $Opt{H} and $linenum % $Opt{H} == $CPL-1 and print_S(); + } } exit; } @@ -95,14 +95,14 @@ sub do_dump{ sub print_S{ print "--------+------------------------------------------------"; if ($Opt{C}){ - print "-+-----------------"; + print "-+-----------------"; } print "\n"; } sub print_H{ print " Offset 0 1 2 3 4 5 6 7"; if ($Opt{C}){ - print " | 0 1 2 3 4 5 6 7"; + print " | 0 1 2 3 4 5 6 7"; } print "\n"; print_S; @@ -111,23 +111,23 @@ sub print_H{ sub print_C{ my ($chunk, $linenum) = @_; if (!$Opt{v} and $chunk eq $PrevChunk){ - printf "%08x *\n", $linenum*8; return; + printf "%08x *\n", $linenum*8; return; } $PrevChunk = $chunk; my $end = length($chunk) - 1; my (@ord, @chr); for my $i (0..$end){ - use utf8; - my $chr = substr($chunk,$i,1); - my $ord = ord($chr); - my $fmt = $ord <= 0xffff ? " %04x" : " %05x"; - push @ord, (sprintf $fmt, $ord); - $Opt{C} and push @chr, render_c($chr); + use utf8; + my $chr = substr($chunk,$i,1); + my $ord = ord($chr); + my $fmt = $ord <= 0xffff ? " %04x" : " %05x"; + push @ord, (sprintf $fmt, $ord); + $Opt{C} and push @chr, render_c($chr); } if (++$end < 7){ - for my $i ($end..7){ - push @ord, (" " x 6); - } + for my $i ($end..7){ + push @ord, (" " x 6); + } } my $line = sprintf "%08x %s", $linenum*8, join('', @ord); $Opt{C} and $line .= sprintf " | %s", join('', @chr); @@ -176,97 +176,97 @@ EOT BEGIN{ our %S2pstr= ( - "\\" => '\\\\', - "\0" => '\0', - "\t" => '\t', - "\n" => '\n', - "\r" => '\r', - "\v" => '\v', - "\a" => '\a', - "\e" => '\e', - "\"" => qq(\\\"), - "\'" => qq(\\\'), - '$' => '\$', - "@" => '\@', - "%" => '\%', - ); + "\\" => '\\\\', + "\0" => '\0', + "\t" => '\t', + "\n" => '\n', + "\r" => '\r', + "\v" => '\v', + "\a" => '\a', + "\e" => '\e', + "\"" => qq(\\\"), + "\'" => qq(\\\'), + '$' => '\$', + "@" => '\@', + "%" => '\%', + ); our %S2str = ( - qq(\x00) => q(\0), # NULL - qq(\x01) => q(^A), # START OF HEADING - qq(\x02) => q(^B), # START OF TEXT - qq(\x03) => q(^C), # END OF TEXT - qq(\x04) => q(^D), # END OF TRANSMISSION - qq(\x05) => q(^E), # ENQUIRY - qq(\x06) => q(^F), # ACKNOWLEDGE - qq(\x07) => q(\a), # BELL - qq(\x08) => q(^H), # BACKSPACE - qq(\x09) => q(\t), # HORIZONTAL TABULATION - qq(\x0A) => q(\n), # LINE FEED - qq(\x0B) => q(\v), # VERTICAL TABULATION - qq(\x0C) => q(^L), # FORM FEED - qq(\x0D) => q(\r), # CARRIAGE RETURN - qq(\x0E) => q(^N), # SHIFT OUT - qq(\x0F) => q(^O), # SHIFT IN - qq(\x10) => q(^P), # DATA LINK ESCAPE - qq(\x11) => q(^Q), # DEVICE CONTROL ONE - qq(\x12) => q(^R), # DEVICE CONTROL TWO - qq(\x13) => q(^S), # DEVICE CONTROL THREE - qq(\x14) => q(^T), # DEVICE CONTROL FOUR - qq(\x15) => q(^U), # NEGATIVE ACKNOWLEDGE - qq(\x16) => q(^V), # SYNCHRONOUS IDLE - qq(\x17) => q(^W), # END OF TRANSMISSION BLOCK - qq(\x18) => q(^X), # CANCEL - qq(\x19) => q(^Y), # END OF MEDIUM - qq(\x1A) => q(^Z), # SUBSTITUTE - qq(\x1B) => q(\e), # ESCAPE (\c[) - qq(\x1C) => "^\\", # FILE SEPARATOR - qq(\x1D) => "^\]", # GROUP SEPARATOR - qq(\x1E) => q(^^), # RECORD SEPARATOR - qq(\x1F) => q(^_), # UNIT SEPARATOR - ); + qq(\x00) => q(\0), # NULL + qq(\x01) => q(^A), # START OF HEADING + qq(\x02) => q(^B), # START OF TEXT + qq(\x03) => q(^C), # END OF TEXT + qq(\x04) => q(^D), # END OF TRANSMISSION + qq(\x05) => q(^E), # ENQUIRY + qq(\x06) => q(^F), # ACKNOWLEDGE + qq(\x07) => q(\a), # BELL + qq(\x08) => q(^H), # BACKSPACE + qq(\x09) => q(\t), # HORIZONTAL TABULATION + qq(\x0A) => q(\n), # LINE FEED + qq(\x0B) => q(\v), # VERTICAL TABULATION + qq(\x0C) => q(^L), # FORM FEED + qq(\x0D) => q(\r), # CARRIAGE RETURN + qq(\x0E) => q(^N), # SHIFT OUT + qq(\x0F) => q(^O), # SHIFT IN + qq(\x10) => q(^P), # DATA LINK ESCAPE + qq(\x11) => q(^Q), # DEVICE CONTROL ONE + qq(\x12) => q(^R), # DEVICE CONTROL TWO + qq(\x13) => q(^S), # DEVICE CONTROL THREE + qq(\x14) => q(^T), # DEVICE CONTROL FOUR + qq(\x15) => q(^U), # NEGATIVE ACKNOWLEDGE + qq(\x16) => q(^V), # SYNCHRONOUS IDLE + qq(\x17) => q(^W), # END OF TRANSMISSION BLOCK + qq(\x18) => q(^X), # CANCEL + qq(\x19) => q(^Y), # END OF MEDIUM + qq(\x1A) => q(^Z), # SUBSTITUTE + qq(\x1B) => q(\e), # ESCAPE (\c[) + qq(\x1C) => "^\\", # FILE SEPARATOR + qq(\x1D) => "^\]", # GROUP SEPARATOR + qq(\x1E) => q(^^), # RECORD SEPARATOR + qq(\x1F) => q(^_), # UNIT SEPARATOR + ); # # Generated out of lib/unicore/EastAsianWidth.txt # will it work ? # our $IsFullWidth = - qr/^[ - \x{1100}-\x{1159} - \x{115F}-\x{115F} - \x{2329}-\x{232A} - \x{2E80}-\x{2E99} - \x{2E9B}-\x{2EF3} - \x{2F00}-\x{2FD5} - \x{2FF0}-\x{2FFB} - \x{3000}-\x{303E} - \x{3041}-\x{3096} - \x{3099}-\x{30FF} - \x{3105}-\x{312C} - \x{3131}-\x{318E} - \x{3190}-\x{31B7} - \x{31F0}-\x{321C} - \x{3220}-\x{3243} - \x{3251}-\x{327B} - \x{327F}-\x{32CB} - \x{32D0}-\x{32FE} - \x{3300}-\x{3376} - \x{337B}-\x{33DD} - \x{3400}-\x{4DB5} - \x{4E00}-\x{9FA5} - \x{33E0}-\x{33FE} - \x{A000}-\x{A48C} - \x{AC00}-\x{D7A3} - \x{A490}-\x{A4C6} - \x{F900}-\x{FA2D} - \x{FA30}-\x{FA6A} - \x{FE30}-\x{FE46} - \x{FE49}-\x{FE52} - \x{FE54}-\x{FE66} - \x{FE68}-\x{FE6B} - \x{FF01}-\x{FF60} - \x{FFE0}-\x{FFE6} - \x{20000}-\x{2A6D6} - ]$/xo; + qr/^[ + \x{1100}-\x{1159} + \x{115F}-\x{115F} + \x{2329}-\x{232A} + \x{2E80}-\x{2E99} + \x{2E9B}-\x{2EF3} + \x{2F00}-\x{2FD5} + \x{2FF0}-\x{2FFB} + \x{3000}-\x{303E} + \x{3041}-\x{3096} + \x{3099}-\x{30FF} + \x{3105}-\x{312C} + \x{3131}-\x{318E} + \x{3190}-\x{31B7} + \x{31F0}-\x{321C} + \x{3220}-\x{3243} + \x{3251}-\x{327B} + \x{327F}-\x{32CB} + \x{32D0}-\x{32FE} + \x{3300}-\x{3376} + \x{337B}-\x{33DD} + \x{3400}-\x{4DB5} + \x{4E00}-\x{9FA5} + \x{33E0}-\x{33FE} + \x{A000}-\x{A48C} + \x{AC00}-\x{D7A3} + \x{A490}-\x{A4C6} + \x{F900}-\x{FA2D} + \x{FA30}-\x{FA6A} + \x{FE30}-\x{FE46} + \x{FE49}-\x{FE52} + \x{FE54}-\x{FE66} + \x{FE68}-\x{FE6B} + \x{FF01}-\x{FF60} + \x{FFE0}-\x{FFE6} + \x{20000}-\x{2A6D6} + ]$/xo; } __END__ diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c index 6fb65da..255e4d7 100644 --- a/ext/Encode/encengine.c +++ b/ext/Encode/encengine.c @@ -93,7 +93,7 @@ we add a flag to re-add the removed byte to the source we could handle int do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, - STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen) + STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen) { const U8 *s = src; const U8 *send = s + *slen; @@ -102,55 +102,55 @@ do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, U8 *dend = d + dlen, *dlast = d; int code = 0; while (s < send) { - const encpage_t *e = enc; - U8 byte = *s; - while (byte > e->max) - e++; - if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { - const U8 *cend = s + (e->slen & 0x7f); - if (cend <= send) { - STRLEN n; - if ((n = e->dlen)) { - const U8 *out = e->seq + n * (byte - e->min); - U8 *oend = d + n; - if (dst) { - if (oend <= dend) { - while (d < oend) - *d++ = *out++; - } - else { - /* Out of space */ - code = ENCODE_NOSPACE; - break; - } - } - else - d = oend; - } - enc = e->next; - s++; - if (s == cend) { - if (approx && (e->slen & 0x80)) - code = ENCODE_FALLBACK; - last = s; - if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) { - code = ENCODE_FOUND_TERM; - break; - } - dlast = d; - } - } - else { - /* partial source character */ - code = ENCODE_PARTIAL; - break; - } - } - else { - /* Cannot represent */ - code = ENCODE_NOREP; - break; - } + const encpage_t *e = enc; + U8 byte = *s; + while (byte > e->max) + e++; + if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { + const U8 *cend = s + (e->slen & 0x7f); + if (cend <= send) { + STRLEN n; + if ((n = e->dlen)) { + const U8 *out = e->seq + n * (byte - e->min); + U8 *oend = d + n; + if (dst) { + if (oend <= dend) { + while (d < oend) + *d++ = *out++; + } + else { + /* Out of space */ + code = ENCODE_NOSPACE; + break; + } + } + else + d = oend; + } + enc = e->next; + s++; + if (s == cend) { + if (approx && (e->slen & 0x80)) + code = ENCODE_FALLBACK; + last = s; + if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) { + code = ENCODE_FOUND_TERM; + break; + } + dlast = d; + } + } + else { + /* partial source character */ + code = ENCODE_PARTIAL; + break; + } + } + else { + /* Cannot represent */ + code = ENCODE_NOREP; + break; + } } *slen = last - src; *dout = d - dst; diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index 4db0401..bf0fc69 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,6 +1,6 @@ -# $Id: encoding.pm,v 2.2 2005/09/08 14:17:17 dankogai Exp $ +# $Id: encoding.pm,v 2.3 2006/05/03 18:24:10 dankogai Exp $ package encoding; -our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode; use strict; @@ -8,79 +8,93 @@ use strict; sub DEBUG () { 0 } BEGIN { - if (ord("A") == 193) { - require Carp; - Carp::croak("encoding: pragma does not support EBCDIC platforms"); + if ( ord("A") == 193 ) { + require Carp; + Carp::croak("encoding: pragma does not support EBCDIC platforms"); } } our $HAS_PERLIO = 0; eval { require PerlIO::encoding }; -unless ($@){ - $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02); +unless ($@) { + $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ); } -sub _exception{ +sub _exception { my $name = shift; - $] > 5.008 and return 0; # 5.8.1 or higher then no - my %utfs = map {$_=>1} - qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE - UTF-32 UTF-32BE UTF-32LE); - $utfs{$name} or return 0; # UTFs or no - require Config; Config->import(); our %Config; - return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no + $] > 5.008 and return 0; # 5.8.1 or higher then no + my %utfs = map { $_ => 1 } + qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE + UTF-32 UTF-32BE UTF-32LE); + $utfs{$name} or return 0; # UTFs or no + require Config; + Config->import(); + our %Config; + return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no } -sub in_locale { $^H & ($locale::hint_bits || 0)} +sub in_locale { $^H & ( $locale::hint_bits || 0 ) } sub _get_locale_encoding { my $locale_encoding; # I18N::Langinfo isn't available everywhere eval { - require I18N::Langinfo; - I18N::Langinfo->import(qw(langinfo CODESET)); - $locale_encoding = langinfo(CODESET()); + require I18N::Langinfo; + I18N::Langinfo->import(qw(langinfo CODESET)); + $locale_encoding = langinfo( CODESET() ); }; - + my $country_language; no warnings 'uninitialized'; - if (not $locale_encoding && in_locale()) { - if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) { - ($country_language, $locale_encoding) = ($1, $2); - } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) { - ($country_language, $locale_encoding) = ($1, $2); - } - # LANGUAGE affects only LC_MESSAGES only on glibc - } elsif (not $locale_encoding) { - if ($ENV{LC_ALL} =~ /\butf-?8\b/i || - $ENV{LANG} =~ /\butf-?8\b/i) { - $locale_encoding = 'utf8'; - } - # Could do more heuristics based on the country and language - # parts of LC_ALL and LANG (the parts before the dot (if any)), - # since we have Locale::Country and Locale::Language available. - # TODO: get a database of Language -> Encoding mappings - # (the Estonian database at http://www.eki.ee/letter/ - # would be excellent!) --jhi + if ( not $locale_encoding && in_locale() ) { + if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/ ) { + ( $country_language, $locale_encoding ) = ( $1, $2 ); + } + elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.]+)$/ ) { + ( $country_language, $locale_encoding ) = ( $1, $2 ); + } + + # LANGUAGE affects only LC_MESSAGES only on glibc + } + elsif ( not $locale_encoding ) { + if ( $ENV{LC_ALL} =~ /\butf-?8\b/i + || $ENV{LANG} =~ /\butf-?8\b/i ) + { + $locale_encoding = 'utf8'; + } + + # Could do more heuristics based on the country and language + # parts of LC_ALL and LANG (the parts before the dot (if any)), + # since we have Locale::Country and Locale::Language available. + # TODO: get a database of Language -> Encoding mappings + # (the Estonian database at http://www.eki.ee/letter/ + # would be excellent!) --jhi } - if (defined $locale_encoding && - lc($locale_encoding) eq 'euc' && - defined $country_language) { - if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) { - $locale_encoding = 'euc-jp'; - } elsif ($country_language =~ /^ko_KR|korean?$/i) { - $locale_encoding = 'euc-kr'; - } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) { - $locale_encoding = 'euc-cn'; - } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) { - $locale_encoding = 'euc-tw'; - } else { - require Carp; - Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous"); - } + if ( defined $locale_encoding + && lc($locale_encoding) eq 'euc' + && defined $country_language ) + { + if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { + $locale_encoding = 'euc-jp'; + } + elsif ( $country_language =~ /^ko_KR|korean?$/i ) { + $locale_encoding = 'euc-kr'; + } + elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)?$/i ) { + $locale_encoding = 'euc-cn'; + } + elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { + $locale_encoding = 'euc-tw'; + } + else { + require Carp; + Carp::croak( + "encoding: Locale encoding '$locale_encoding' too ambiguous" + ); + } } return $locale_encoding; @@ -89,82 +103,89 @@ sub _get_locale_encoding { sub import { my $class = shift; my $name = shift; - if ($name eq ':_get_locale_encoding') { # used by lib/open.pm - my $caller = caller(); + if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm + my $caller = caller(); { - no strict 'refs'; - *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; - } - return; + no strict 'refs'; + *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; + } + return; } $name = _get_locale_encoding() if $name eq ':locale'; my %arg = @_; $name = $ENV{PERL_ENCODING} unless defined $name; my $enc = find_encoding($name); - unless (defined $enc) { - require Carp; - Carp::croak("encoding: Unknown encoding '$name'"); + unless ( defined $enc ) { + require Carp; + Carp::croak("encoding: Unknown encoding '$name'"); + } + $name = $enc->name; # canonize + unless ( $arg{Filter} ) { + DEBUG and warn "_exception($name) = ", _exception($name); + _exception($name) or ${^ENCODING} = $enc; + $HAS_PERLIO or return 1; } - $name = $enc->name; # canonize - unless ($arg{Filter}) { - DEBUG and warn "_exception($name) = ", _exception($name); - _exception($name) or ${^ENCODING} = $enc; - $HAS_PERLIO or return 1; - }else{ - defined(${^ENCODING}) and undef ${^ENCODING}; - # implicitly 'use utf8' - require utf8; # to fetch $utf8::hint_bits; - $^H |= $utf8::hint_bits; - eval { - require Filter::Util::Call ; - Filter::Util::Call->import ; - filter_add(sub{ - my $status = filter_read(); - if ($status > 0){ - $_ = $enc->decode($_, 1); - DEBUG and warn $_; - } - $status ; - }); - }; + else { + defined( ${^ENCODING} ) and undef ${^ENCODING}; + + # implicitly 'use utf8' + require utf8; # to fetch $utf8::hint_bits; + $^H |= $utf8::hint_bits; + eval { + require Filter::Util::Call; + Filter::Util::Call->import; + filter_add( + sub { + my $status = filter_read(); + if ( $status > 0 ) { + $_ = $enc->decode( $_, 1 ); + DEBUG and warn $_; + } + $status; + } + ); + }; $@ eq '' and DEBUG and warn "Filter installed"; } defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; - for my $h (qw(STDIN STDOUT)){ - if ($arg{$h}){ - unless (defined find_encoding($arg{$h})) { - require Carp; - Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'"); - } - eval { binmode($h, ":raw :encoding($arg{$h})") }; - }else{ - unless (exists $arg{$h}){ - eval { - no warnings 'uninitialized'; - binmode($h, ":raw :encoding($name)"); - }; - } - } - if ($@){ - require Carp; - Carp::croak($@); - } + for my $h (qw(STDIN STDOUT)) { + if ( $arg{$h} ) { + unless ( defined find_encoding( $arg{$h} ) ) { + require Carp; + Carp::croak( + "encoding: Unknown encoding for $h, '$arg{$h}'"); + } + eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; + } + else { + unless ( exists $arg{$h} ) { + eval { + no warnings 'uninitialized'; + binmode( $h, ":raw :encoding($name)" ); + }; + } + } + if ($@) { + require Carp; + Carp::croak($@); + } } - return 1; # I doubt if we need it, though + return 1; # I doubt if we need it, though } -sub unimport{ +sub unimport { no warnings; undef ${^ENCODING}; - if ($HAS_PERLIO){ - binmode(STDIN, ":raw"); - binmode(STDOUT, ":raw"); - }else{ - binmode(STDIN); - binmode(STDOUT); + if ($HAS_PERLIO) { + binmode( STDIN, ":raw" ); + binmode( STDOUT, ":raw" ); + } + else { + binmode(STDIN); + binmode(STDOUT); } - if ($INC{"Filter/Util/Call.pm"}){ - eval { filter_del() }; + if ( $INC{"Filter/Util/Call.pm"} ) { + eval { filter_del() }; } } @@ -430,16 +451,16 @@ other modules are loaded. i.e. Notice that only literals (string or regular expression) having only legacy code points are affected: if you mix data like this - \xDF\x{100} + \xDF\x{100} the data is assumed to be in (Latin 1 and) Unicode, not in your native encoding. In other words, this will match in "greek": - "\xDF" =~ /\x{3af}/ + "\xDF" =~ /\x{3af}/ but this will not - "\xDF\x{100}" =~ /\x{3af}\x{100}/ + "\xDF\x{100}" =~ /\x{3af}\x{100}/ since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on the left will B be upgraded to C<\x{3af}> (Unicode GREEK SMALL diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index c0bbf69..2a4898b 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -2,141 +2,155 @@ package Encode::Alias; use strict; no warnings 'redefine'; use Encode; -our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; sub DEBUG () { 0 } use base qw(Exporter); # Public, encouraged API is exported by default -our @EXPORT = - qw ( - define_alias - find_alias - ); +our @EXPORT = + qw ( + define_alias + find_alias +); -our @Alias; # ordered matching list -our %Alias; # cached known aliases +our @Alias; # ordered matching list +our %Alias; # cached known aliases -sub find_alias{ +sub find_alias { my $class = shift; - my $find = shift; - unless (exists $Alias{$find}) { - $Alias{$find} = undef; # Recursion guard - for (my $i=0; $i < @Alias; $i += 2){ - my $alias = $Alias[$i]; - my $val = $Alias[$i+1]; - my $new; - if (ref($alias) eq 'Regexp' && $find =~ $alias){ - DEBUG and warn "eval $val"; - $new = eval $val; - DEBUG and $@ and warn "$val, $@"; - }elsif (ref($alias) eq 'CODE'){ - DEBUG and warn "$alias", "->", "($find)"; - $new = $alias->($find); - }elsif (lc($find) eq lc($alias)){ - $new = $val; - } - if (defined($new)){ - next if $new eq $find; # avoid (direct) recursion on bugs - DEBUG and warn "$alias, $new"; - my $enc = (ref($new)) ? $new : Encode::find_encoding($new); - if ($enc){ - $Alias{$find} = $enc; - last; - } - } - } - # case insensitive search when canonical is not in all lowercase - # RT ticket #7835 - unless ($Alias{$find}){ - my $lcfind = lc($find); - for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){ - $lcfind eq lc($name) or next; - $Alias{$find} = Encode::find_encoding($name); - DEBUG and warn "$find => $name"; - } - } + my $find = shift; + unless ( exists $Alias{$find} ) { + $Alias{$find} = undef; # Recursion guard + for ( my $i = 0 ; $i < @Alias ; $i += 2 ) { + my $alias = $Alias[$i]; + my $val = $Alias[ $i + 1 ]; + my $new; + if ( ref($alias) eq 'Regexp' && $find =~ $alias ) { + DEBUG and warn "eval $val"; + $new = eval $val; + DEBUG and $@ and warn "$val, $@"; + } + elsif ( ref($alias) eq 'CODE' ) { + DEBUG and warn "$alias", "->", "($find)"; + $new = $alias->($find); + } + elsif ( lc($find) eq lc($alias) ) { + $new = $val; + } + if ( defined($new) ) { + next if $new eq $find; # avoid (direct) recursion on bugs + DEBUG and warn "$alias, $new"; + my $enc = + ( ref($new) ) ? $new : Encode::find_encoding($new); + if ($enc) { + $Alias{$find} = $enc; + last; + } + } + } + + # case insensitive search when canonical is not in all lowercase + # RT ticket #7835 + unless ( $Alias{$find} ) { + my $lcfind = lc($find); + for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule ) + { + $lcfind eq lc($name) or next; + $Alias{$find} = Encode::find_encoding($name); + DEBUG and warn "$find => $name"; + } + } } - if (DEBUG){ - my $name; - if (my $e = $Alias{$find}){ - $name = $e->name; - }else{ - $name = ""; - } - warn "find_alias($class, $find)->name = $name"; + if (DEBUG) { + my $name; + if ( my $e = $Alias{$find} ) { + $name = $e->name; + } + else { + $name = ""; + } + warn "find_alias($class, $find)->name = $name"; } return $Alias{$find}; } -sub define_alias{ - while (@_){ - my ($alias,$name) = splice(@_,0,2); - unshift(@Alias, $alias => $name); # newer one has precedence - if (ref($alias)){ - # clear %Alias cache to allow overrides - my @a = keys %Alias; - for my $k (@a){ - if (ref($alias) eq 'Regexp' && $k =~ $alias){ - DEBUG and warn "delete \$Alias\{$k\}"; - delete $Alias{$k}; - } - elsif (ref($alias) eq 'CODE'){ - DEBUG and warn "delete \$Alias\{$k\}"; - delete $Alias{$alias->($name)}; - } - } - }else{ - DEBUG and warn "delete \$Alias\{$alias\}"; - delete $Alias{$alias}; - } +sub define_alias { + while (@_) { + my ( $alias, $name ) = splice( @_, 0, 2 ); + unshift( @Alias, $alias => $name ); # newer one has precedence + if ( ref($alias) ) { + + # clear %Alias cache to allow overrides + my @a = keys %Alias; + for my $k (@a) { + if ( ref($alias) eq 'Regexp' && $k =~ $alias ) { + DEBUG and warn "delete \$Alias\{$k\}"; + delete $Alias{$k}; + } + elsif ( ref($alias) eq 'CODE' ) { + DEBUG and warn "delete \$Alias\{$k\}"; + delete $Alias{ $alias->($name) }; + } + } + } + else { + DEBUG and warn "delete \$Alias\{$alias\}"; + delete $Alias{$alias}; + } } } # Allow latin-1 style names as well # 0 1 2 3 4 5 6 7 8 9 10 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); + # Allow winlatin1 style names as well -our %Winlatin2cp = ( - 'latin1' => 1252, - 'latin2' => 1250, - 'cyrillic' => 1251, - 'greek' => 1253, - 'turkish' => 1254, - 'hebrew' => 1255, - 'arabic' => 1256, - 'baltic' => 1257, - 'vietnamese' => 1258, - ); +our %Winlatin2cp = ( + 'latin1' => 1252, + 'latin2' => 1250, + 'cyrillic' => 1251, + 'greek' => 1253, + 'turkish' => 1254, + 'hebrew' => 1255, + 'arabic' => 1256, + 'baltic' => 1257, + 'vietnamese' => 1258, +); init_aliases(); -sub undef_aliases{ +sub undef_aliases { @Alias = (); %Alias = (); } -sub init_aliases -{ +sub init_aliases { undef_aliases(); + # Try all-lower-case version should all else fails define_alias( qr/^(.*)$/ => '"\L$1"' ); # UTF/UCS stuff - define_alias( qr/^UTF-?7$/i => '"UTF-7"'); - define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); - define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', - qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', - qr/^iso-10646-1$/i => '"UCS-2BE"' ); - define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', - qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', - qr/^UTF-?(16|32)$/i => '"UTF-$1"', - ); + define_alias( qr/^UTF-?7$/i => '"UTF-7"' ); + define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); + define_alias( + qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', + qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', + qr/^iso-10646-1$/i => '"UCS-2BE"' + ); + define_alias( + qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', + qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', + qr/^UTF-?(16|32)$/i => '"UTF-$1"', + ); + # ASCII - define_alias(qr/^(?:US-?)ascii$/i => '"ascii"'); - define_alias('C' => 'ascii'); - define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"'); + define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' ); + define_alias( 'C' => 'ascii' ); + define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' ); + # Allow variants of iso-8859-1 etc. define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); @@ -144,7 +158,9 @@ sub init_aliases define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); # More HP stuff. - define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' ); + define_alias( + qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => + '"${1}8"' ); # The Official name of ASCII. define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); @@ -154,27 +170,32 @@ sub init_aliases # has been redefined as the euro symbol.) define_alias( qr/^(.+)\@euro$/i => '"$1"' ); - define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i - => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' ); + define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => +'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' + ); - define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| - hebrew|arabic|baltic|vietnamese)$/ix => - '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); + define_alias( + qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| + hebrew|arabic|baltic|vietnamese)$/ix => + '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' + ); # Common names for non-latin preferred MIME names - define_alias( 'ascii' => 'US-ascii', - 'cyrillic' => 'iso-8859-5', - 'arabic' => 'iso-8859-6', - 'greek' => 'iso-8859-7', - 'hebrew' => 'iso-8859-8', - 'thai' => 'iso-8859-11', - 'tis620' => 'iso-8859-11', - ); + define_alias( + 'ascii' => 'US-ascii', + 'cyrillic' => 'iso-8859-5', + 'arabic' => 'iso-8859-6', + 'greek' => 'iso-8859-7', + 'hebrew' => 'iso-8859-8', + 'thai' => 'iso-8859-11', + 'tis620' => 'iso-8859-11', + ); # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. # And Microsoft has their own naming (again, surprisingly). - # And windows-* is registered in IANA! - define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"'); + # And windows-* is registered in IANA! + define_alias( + qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' ); # Sometimes seen with a leading zero. # define_alias( qr/\bcp037\b/i => '"cp37"'); @@ -182,46 +203,56 @@ sub init_aliases # Mac Mappings # predefined in *.ucm; unneeded # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); - define_alias( qr/^mac_(.*)$/i => '"mac$1"'); + define_alias( qr/^mac_(.*)$/i => '"mac$1"' ); + # Ououououou. gone. They are differente! # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); - + # Standardize on the dashed versions. define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); - unless ($Encode::ON_EBCDIC){ + unless ($Encode::ON_EBCDIC) { + # for Encode::CN - define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); - define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); - # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) - # CP936 doesn't have vendor-addon for GBK, so they're identical. - define_alias( qr/^gbk$/i => '"cp936"'); - # This fixes gb2312 vs. euc-cn confusion, practically - define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); - # for Encode::JP - define_alias( qr/\bjis$/i => '"7bit-jis"' ); - define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); - define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); - define_alias( qr/\bujis$/i => '"euc-jp"' ); - define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); - define_alias( qr/\bsjis$/i => '"shiftjis"' ); - define_alias( qr/\bwindows-31j$/i => '"cp932"' ); + define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); + define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); + + # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) + # CP936 doesn't have vendor-addon for GBK, so they're identical. + define_alias( qr/^gbk$/i => '"cp936"' ); + + # This fixes gb2312 vs. euc-cn confusion, practically + define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); + + # for Encode::JP + define_alias( qr/\bjis$/i => '"7bit-jis"' ); + define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); + define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); + define_alias( qr/\bujis$/i => '"euc-jp"' ); + define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); + define_alias( qr/\bsjis$/i => '"shiftjis"' ); + define_alias( qr/\bwindows-31j$/i => '"cp932"' ); + # for Encode::KR - define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); - define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); - # This fixes ksc5601 vs. euc-kr confusion, practically - define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); - define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); - define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); + define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); + define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); + + # This fixes ksc5601 vs. euc-kr confusion, practically + define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); + define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); + define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); + # for Encode::TW - define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); - define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); - define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); - define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); - define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); + define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); + define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); + define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); + define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); + define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); } + # utf8 is blessed :) - define_alias( qr/^UTF-8$/i => '"utf-8-strict"'); + define_alias( qr/^UTF-8$/i => '"utf-8-strict"' ); + # At last, Map white space and _ to '-' define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); } diff --git a/ext/Encode/lib/Encode/CJKConstants.pm b/ext/Encode/lib/Encode/CJKConstants.pm index 4ab40e7..411d3cd 100644 --- a/ext/Encode/lib/Encode/CJKConstants.pm +++ b/ext/Encode/lib/Encode/CJKConstants.pm @@ -1,13 +1,13 @@ # -# $Id: CJKConstants.pm,v 2.0 2004/05/16 20:55:16 dankogai Exp $ +# $Id: CJKConstants.pm,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # package Encode::CJKConstants; use strict; -our $RCSID = q$Id: CJKConstants.pm,v 2.0 2004/05/16 20:55:16 dankogai Exp $; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $RCSID = q$Id: CJKConstants.pm,v 2.1 2006/05/03 18:24:10 dankogai Exp $; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Carp; @@ -18,44 +18,43 @@ our @EXPORT_OK = qw(%CHARCODE %ESC %RE); our %EXPORT_TAGS = ( 'all' => [ @EXPORT_OK, @EXPORT ] ); my %_0208 = ( - 1978 => '\e\$\@', - 1983 => '\e\$B', - 1990 => '\e&\@\e\$B', - ); + 1978 => '\e\$\@', + 1983 => '\e\$B', + 1990 => '\e&\@\e\$B', +); our %CHARCODE = ( - UNDEF_EUC => "\xa2\xae", # ¢® in EUC - UNDEF_SJIS => "\x81\xac", # ¢® in SJIS - UNDEF_JIS => "\xa2\xf7", # ¢÷ -- used in unicode - UNDEF_UNICODE => "\x20\x20", # ¢÷ -- used in unicode - ); + UNDEF_EUC => "\xa2\xae", # ¢® in EUC + UNDEF_SJIS => "\x81\xac", # ¢® in SJIS + UNDEF_JIS => "\xa2\xf7", # ¢÷ -- used in unicode + UNDEF_UNICODE => "\x20\x20", # ¢÷ -- used in unicode +); -our %ESC = ( - GB_2312 => "\e\$A", - JIS_0208 => "\e\$B", - JIS_0212 => "\e\$(D", - KSC_5601 => "\e\$(C", - ASC => "\e\(B", - KANA => "\e\(I", - '2022_KR' => "\e\$)C", - ); +our %ESC = ( + GB_2312 => "\e\$A", + JIS_0208 => "\e\$B", + JIS_0212 => "\e\$(D", + KSC_5601 => "\e\$(C", + ASC => "\e\(B", + KANA => "\e\(I", + '2022_KR' => "\e\$)C", +); -our %RE = - ( - ASCII => '[\x00-\x7f]', - BIN => '[\x00-\x06\x7f\xff]', - EUC_0212 => '\x8f[\xa1-\xfe][\xa1-\xfe]', - EUC_C => '[\xa1-\xfe][\xa1-\xfe]', - EUC_KANA => '\x8e[\xa1-\xdf]', - JIS_0208 => "$_0208{1978}|$_0208{1983}|$_0208{1990}", - JIS_0212 => "\e" . '\$\(D', - ISO_ASC => "\e" . '\([BJ]', - JIS_KANA => "\e" . '\(I', - '2022_KR' => "\e" . '\$\)C', - SJIS_C => '[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]', - SJIS_KANA => '[\xa1-\xdf]', - UTF8 => '[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]' - ); +our %RE = ( + ASCII => '[\x00-\x7f]', + BIN => '[\x00-\x06\x7f\xff]', + EUC_0212 => '\x8f[\xa1-\xfe][\xa1-\xfe]', + EUC_C => '[\xa1-\xfe][\xa1-\xfe]', + EUC_KANA => '\x8e[\xa1-\xdf]', + JIS_0208 => "$_0208{1978}|$_0208{1983}|$_0208{1990}", + JIS_0212 => "\e" . '\$\(D', + ISO_ASC => "\e" . '\([BJ]', + JIS_KANA => "\e" . '\(I', + '2022_KR' => "\e" . '\$\)C', + SJIS_C => '[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]', + SJIS_KANA => '[\xa1-\xdf]', + UTF8 => '[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]' +); 1; @@ -64,3 +63,4 @@ our %RE = Encode::CJKConstants.pm -- Internally used by Encode::??::ISO_2022_* =cut + diff --git a/ext/Encode/lib/Encode/CN/HZ.pm b/ext/Encode/lib/Encode/CN/HZ.pm index 94b372c..d178800 100644 --- a/ext/Encode/lib/Encode/CN/HZ.pm +++ b/ext/Encode/lib/Encode/CN/HZ.pm @@ -3,7 +3,7 @@ package Encode::CN::HZ; use strict; use vars qw($VERSION); -$VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +$VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -15,63 +15,63 @@ __PACKAGE__->Define('hz'); # not ported for EBCDIC. Which should be used, "~" or "\x7E"? -sub needs_lines { 1 } +sub needs_lines { 1 } -sub decode ($$;$) -{ - my ($obj,$str,$chk) = @_; +sub decode ($$;$) { + my ( $obj, $str, $chk ) = @_; - my $GB = Encode::find_encoding('gb2312-raw'); + my $GB = Encode::find_encoding('gb2312-raw'); my $ret = ''; - my $in_ascii = 1; # default mode is ASCII. - - while (length $str) { - if ($in_ascii) { # ASCII mode - if ($str =~ s/^([\x00-\x7D\x7F]+)//) { # no '~' => ASCII - $ret .= $1; - # EBCDIC should need ascii2native, but not ported. - } - elsif ($str =~ s/^\x7E\x7E//) { # escaped tilde - $ret .= '~'; - } - elsif ($str =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII - 1; # no-op - } - elsif ($str =~ s/^\x7E\x7B//) { # '~{' - $in_ascii = 0; # to GB - } - else { # encounters an invalid escape, \x80 or greater - last; - } - } - else { # GB mode; the byte ranges are as in RFC 1843. - no warnings 'uninitialized'; - if ($str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) { - $ret .= $GB->decode($1, $chk); - } - elsif ($str =~ s/^\x7E\x7D//) { # '~}' - $in_ascii = 1; - } - else { # invalid - last; - } - } + my $in_ascii = 1; # default mode is ASCII. + + while ( length $str ) { + if ($in_ascii) { # ASCII mode + if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII + $ret .= $1; + + # EBCDIC should need ascii2native, but not ported. + } + elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde + $ret .= '~'; + } + elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII + 1; # no-op + } + elsif ( $str =~ s/^\x7E\x7B// ) { # '~{' + $in_ascii = 0; # to GB + } + else { # encounters an invalid escape, \x80 or greater + last; + } + } + else { # GB mode; the byte ranges are as in RFC 1843. + no warnings 'uninitialized'; + if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) { + $ret .= $GB->decode( $1, $chk ); + } + elsif ( $str =~ s/^\x7E\x7D// ) { # '~}' + $in_ascii = 1; + } + else { # invalid + last; + } + } } - $_[1] = '' if $chk; # needs_lines guarantees no partial character + $_[1] = '' if $chk; # needs_lines guarantees no partial character return $ret; } sub cat_decode { - my ($obj, undef, $src, $pos, $trm, $chk) = @_; - my ($rdst, $rsrc, $rpos) = \@_[1..3]; + my ( $obj, undef, $src, $pos, $trm, $chk ) = @_; + my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ]; - my $GB = Encode::find_encoding('gb2312-raw'); + my $GB = Encode::find_encoding('gb2312-raw'); my $ret = ''; - my $in_ascii = 1; # default mode is ASCII. + my $in_ascii = 1; # default mode is ASCII. my $ini_pos = pos($$rsrc); - substr($src, 0, $pos) = ''; + substr( $src, 0, $pos ) = ''; my $ini_len = bytes::length($src); @@ -79,110 +79,108 @@ sub cat_decode { # XXX: Is better C<$src =~ s/^\x7E// or die if ...>? $src =~ s/^\x7E// if $trm eq "\x7E"; - while (length $src) { - my $now; - if ($in_ascii) { # ASCII mode - if ($src =~ s/^([\x00-\x7D\x7F])//) { # no '~' => ASCII - $now = $1; - } - elsif ($src =~ s/^\x7E\x7E//) { # escaped tilde - $now = '~'; - } - elsif ($src =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII - next; - } - elsif ($src =~ s/^\x7E\x7B//) { # '~{' - $in_ascii = 0; # to GB - next; - } - else { # encounters an invalid escape, \x80 or greater - last; - } - } - else { # GB mode; the byte ranges are as in RFC 1843. - if ($src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)//) { - $now = $GB->decode($1, $chk); - } - elsif ($src =~ s/^\x7E\x7D//) { # '~}' - $in_ascii = 1; - next; - } - else { # invalid - last; - } - } - - next if ! defined $now; - - $ret .= $now; - - if ($now eq $trm) { - $$rdst .= $ret; - $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); - pos($$rsrc) = $ini_pos; - return 1; - } + while ( length $src ) { + my $now; + if ($in_ascii) { # ASCII mode + if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII + $now = $1; + } + elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde + $now = '~'; + } + elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII + next; + } + elsif ( $src =~ s/^\x7E\x7B// ) { # '~{' + $in_ascii = 0; # to GB + next; + } + else { # encounters an invalid escape, \x80 or greater + last; + } + } + else { # GB mode; the byte ranges are as in RFC 1843. + if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) { + $now = $GB->decode( $1, $chk ); + } + elsif ( $src =~ s/^\x7E\x7D// ) { # '~}' + $in_ascii = 1; + next; + } + else { # invalid + last; + } + } + + next if !defined $now; + + $ret .= $now; + + if ( $now eq $trm ) { + $$rdst .= $ret; + $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); + pos($$rsrc) = $ini_pos; + return 1; + } } $$rdst .= $ret; $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); pos($$rsrc) = $ini_pos; - return ''; # terminator not found + return ''; # terminator not found } +sub encode($$;$) { + my ( $obj, $str, $chk ) = @_; -sub encode($$;$) -{ - my ($obj,$str,$chk) = @_; - - my $GB = Encode::find_encoding('gb2312-raw'); + my $GB = Encode::find_encoding('gb2312-raw'); my $ret = ''; - my $in_ascii = 1; # default mode is ASCII. - - no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk. - - while (length $str) { - if ($str =~ s/^([[:ascii:]]+)//) { - my $tmp = $1; - $tmp =~ s/~/~~/g; # escapes tildes - if (! $in_ascii) { - $ret .= "\x7E\x7D"; # '~}' - $in_ascii = 1; - } - $ret .= pack 'a*', $tmp; # remove UTF8 flag. - } - elsif ($str =~ s/(.)//) { - my $s = $1; - my $tmp = $GB->encode($s, $chk); - last if !defined $tmp; - if (length $tmp == 2) { # maybe a valid GB char (XXX) - if ($in_ascii) { - $ret .= "\x7E\x7B"; # '~{' - $in_ascii = 0; - } - $ret .= $tmp; - } - elsif (length $tmp) { # maybe FALLBACK in ASCII (XXX) - if (!$in_ascii) { - $ret .= "\x7E\x7D"; # '~}' - $in_ascii = 1; - } - $ret .= $tmp; - } - } - else { # if $str is malformed UTF8 *and* if length $str != 0. - last; - } + my $in_ascii = 1; # default mode is ASCII. + + no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk. + + while ( length $str ) { + if ( $str =~ s/^([[:ascii:]]+)// ) { + my $tmp = $1; + $tmp =~ s/~/~~/g; # escapes tildes + if ( !$in_ascii ) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; + } + $ret .= pack 'a*', $tmp; # remove UTF8 flag. + } + elsif ( $str =~ s/(.)// ) { + my $s = $1; + my $tmp = $GB->encode( $s, $chk ); + last if !defined $tmp; + if ( length $tmp == 2 ) { # maybe a valid GB char (XXX) + if ($in_ascii) { + $ret .= "\x7E\x7B"; # '~{' + $in_ascii = 0; + } + $ret .= $tmp; + } + elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX) + if ( !$in_ascii ) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; + } + $ret .= $tmp; + } + } + else { # if $str is malformed UTF8 *and* if length $str != 0. + last; + } } $_[1] = $str if $chk; - # The state at the end of the chunk is discarded, even if in GB mode. - # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{". - # Parhaps it is harmless, but further investigations may be required... + # The state at the end of the chunk is discarded, even if in GB mode. + # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{". + # Parhaps it is harmless, but further investigations may be required... - if (! $in_ascii) { - $ret .= "\x7E\x7D"; # '~}' - $in_ascii = 1; + if ( !$in_ascii ) { + $ret .= "\x7E\x7D"; # '~}' + $in_ascii = 1; } return $ret; } diff --git a/ext/Encode/lib/Encode/Config.pm b/ext/Encode/lib/Encode/Config.pm index d69b92d..9c490ee 100644 --- a/ext/Encode/lib/Encode/Config.pm +++ b/ext/Encode/lib/Encode/Config.pm @@ -2,157 +2,158 @@ # Demand-load module list # package Encode::Config; -our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use strict; -our %ExtModule = - ( - # Encode::Byte - #iso-8859-1 is in Encode.pm itself - 'iso-8859-2' => 'Encode::Byte', - 'iso-8859-3' => 'Encode::Byte', - 'iso-8859-4' => 'Encode::Byte', - 'iso-8859-5' => 'Encode::Byte', - 'iso-8859-6' => 'Encode::Byte', - 'iso-8859-7' => 'Encode::Byte', - 'iso-8859-8' => 'Encode::Byte', - 'iso-8859-9' => 'Encode::Byte', - 'iso-8859-10' => 'Encode::Byte', - 'iso-8859-11' => 'Encode::Byte', - 'iso-8859-13' => 'Encode::Byte', - 'iso-8859-14' => 'Encode::Byte', - 'iso-8859-15' => 'Encode::Byte', - 'iso-8859-16' => 'Encode::Byte', - 'koi8-f' => 'Encode::Byte', - 'koi8-r' => 'Encode::Byte', - 'koi8-u' => 'Encode::Byte', - 'viscii' => 'Encode::Byte', - 'cp424' => 'Encode::Byte', - 'cp437' => 'Encode::Byte', - 'cp737' => 'Encode::Byte', - 'cp775' => 'Encode::Byte', - 'cp850' => 'Encode::Byte', - 'cp852' => 'Encode::Byte', - 'cp855' => 'Encode::Byte', - 'cp856' => 'Encode::Byte', - 'cp857' => 'Encode::Byte', - 'cp860' => 'Encode::Byte', - 'cp861' => 'Encode::Byte', - 'cp862' => 'Encode::Byte', - 'cp863' => 'Encode::Byte', - 'cp864' => 'Encode::Byte', - 'cp865' => 'Encode::Byte', - 'cp866' => 'Encode::Byte', - 'cp869' => 'Encode::Byte', - 'cp874' => 'Encode::Byte', - 'cp1006' => 'Encode::Byte', - 'cp1250' => 'Encode::Byte', - 'cp1251' => 'Encode::Byte', - 'cp1252' => 'Encode::Byte', - 'cp1253' => 'Encode::Byte', - 'cp1254' => 'Encode::Byte', - 'cp1255' => 'Encode::Byte', - 'cp1256' => 'Encode::Byte', - 'cp1257' => 'Encode::Byte', - 'cp1258' => 'Encode::Byte', - 'AdobeStandardEncoding' => 'Encode::Byte', - 'MacArabic' => 'Encode::Byte', - 'MacCentralEurRoman' => 'Encode::Byte', - 'MacCroatian' => 'Encode::Byte', - 'MacCyrillic' => 'Encode::Byte', - 'MacFarsi' => 'Encode::Byte', - 'MacGreek' => 'Encode::Byte', - 'MacHebrew' => 'Encode::Byte', - 'MacIcelandic' => 'Encode::Byte', - 'MacRoman' => 'Encode::Byte', - 'MacRomanian' => 'Encode::Byte', - 'MacRumanian' => 'Encode::Byte', - 'MacSami' => 'Encode::Byte', - 'MacThai' => 'Encode::Byte', - 'MacTurkish' => 'Encode::Byte', - 'MacUkrainian' => 'Encode::Byte', - 'nextstep' => 'Encode::Byte', - 'hp-roman8' => 'Encode::Byte', - 'gsm0338' => 'Encode::Byte', - # Encode::EBCDIC - 'cp37' => 'Encode::EBCDIC', - 'cp500' => 'Encode::EBCDIC', - 'cp875' => 'Encode::EBCDIC', - 'cp1026' => 'Encode::EBCDIC', - 'cp1047' => 'Encode::EBCDIC', - 'posix-bc' => 'Encode::EBCDIC', - # Encode::Symbol - 'dingbats' => 'Encode::Symbol', - 'symbol' => 'Encode::Symbol', - 'AdobeSymbol' => 'Encode::Symbol', - 'AdobeZdingbat' => 'Encode::Symbol', - 'MacDingbats' => 'Encode::Symbol', - 'MacSymbol' => 'Encode::Symbol', - # Encode::Unicode - 'UCS-2BE' => 'Encode::Unicode', - 'UCS-2LE' => 'Encode::Unicode', - 'UTF-16' => 'Encode::Unicode', - 'UTF-16BE' => 'Encode::Unicode', - 'UTF-16LE' => 'Encode::Unicode', - 'UTF-32' => 'Encode::Unicode', - 'UTF-32BE' => 'Encode::Unicode', - 'UTF-32LE' => 'Encode::Unicode', - 'UTF-7' => 'Encode::Unicode::UTF7', - ); +our %ExtModule = ( + + # Encode::Byte + #iso-8859-1 is in Encode.pm itself + 'iso-8859-2' => 'Encode::Byte', + 'iso-8859-3' => 'Encode::Byte', + 'iso-8859-4' => 'Encode::Byte', + 'iso-8859-5' => 'Encode::Byte', + 'iso-8859-6' => 'Encode::Byte', + 'iso-8859-7' => 'Encode::Byte', + 'iso-8859-8' => 'Encode::Byte', + 'iso-8859-9' => 'Encode::Byte', + 'iso-8859-10' => 'Encode::Byte', + 'iso-8859-11' => 'Encode::Byte', + 'iso-8859-13' => 'Encode::Byte', + 'iso-8859-14' => 'Encode::Byte', + 'iso-8859-15' => 'Encode::Byte', + 'iso-8859-16' => 'Encode::Byte', + 'koi8-f' => 'Encode::Byte', + 'koi8-r' => 'Encode::Byte', + 'koi8-u' => 'Encode::Byte', + 'viscii' => 'Encode::Byte', + 'cp424' => 'Encode::Byte', + 'cp437' => 'Encode::Byte', + 'cp737' => 'Encode::Byte', + 'cp775' => 'Encode::Byte', + 'cp850' => 'Encode::Byte', + 'cp852' => 'Encode::Byte', + 'cp855' => 'Encode::Byte', + 'cp856' => 'Encode::Byte', + 'cp857' => 'Encode::Byte', + 'cp860' => 'Encode::Byte', + 'cp861' => 'Encode::Byte', + 'cp862' => 'Encode::Byte', + 'cp863' => 'Encode::Byte', + 'cp864' => 'Encode::Byte', + 'cp865' => 'Encode::Byte', + 'cp866' => 'Encode::Byte', + 'cp869' => 'Encode::Byte', + 'cp874' => 'Encode::Byte', + 'cp1006' => 'Encode::Byte', + 'cp1250' => 'Encode::Byte', + 'cp1251' => 'Encode::Byte', + 'cp1252' => 'Encode::Byte', + 'cp1253' => 'Encode::Byte', + 'cp1254' => 'Encode::Byte', + 'cp1255' => 'Encode::Byte', + 'cp1256' => 'Encode::Byte', + 'cp1257' => 'Encode::Byte', + 'cp1258' => 'Encode::Byte', + 'AdobeStandardEncoding' => 'Encode::Byte', + 'MacArabic' => 'Encode::Byte', + 'MacCentralEurRoman' => 'Encode::Byte', + 'MacCroatian' => 'Encode::Byte', + 'MacCyrillic' => 'Encode::Byte', + 'MacFarsi' => 'Encode::Byte', + 'MacGreek' => 'Encode::Byte', + 'MacHebrew' => 'Encode::Byte', + 'MacIcelandic' => 'Encode::Byte', + 'MacRoman' => 'Encode::Byte', + 'MacRomanian' => 'Encode::Byte', + 'MacRumanian' => 'Encode::Byte', + 'MacSami' => 'Encode::Byte', + 'MacThai' => 'Encode::Byte', + 'MacTurkish' => 'Encode::Byte', + 'MacUkrainian' => 'Encode::Byte', + 'nextstep' => 'Encode::Byte', + 'hp-roman8' => 'Encode::Byte', + 'gsm0338' => 'Encode::Byte', + + # Encode::EBCDIC + 'cp37' => 'Encode::EBCDIC', + 'cp500' => 'Encode::EBCDIC', + 'cp875' => 'Encode::EBCDIC', + 'cp1026' => 'Encode::EBCDIC', + 'cp1047' => 'Encode::EBCDIC', + 'posix-bc' => 'Encode::EBCDIC', + + # Encode::Symbol + 'dingbats' => 'Encode::Symbol', + 'symbol' => 'Encode::Symbol', + 'AdobeSymbol' => 'Encode::Symbol', + 'AdobeZdingbat' => 'Encode::Symbol', + 'MacDingbats' => 'Encode::Symbol', + 'MacSymbol' => 'Encode::Symbol', + + # Encode::Unicode + 'UCS-2BE' => 'Encode::Unicode', + 'UCS-2LE' => 'Encode::Unicode', + 'UTF-16' => 'Encode::Unicode', + 'UTF-16BE' => 'Encode::Unicode', + 'UTF-16LE' => 'Encode::Unicode', + 'UTF-32' => 'Encode::Unicode', + 'UTF-32BE' => 'Encode::Unicode', + 'UTF-32LE' => 'Encode::Unicode', + 'UTF-7' => 'Encode::Unicode::UTF7', +); + +unless ( ord("A") == 193 ) { + %ExtModule = ( + %ExtModule, + 'euc-cn' => 'Encode::CN', + 'gb12345-raw' => 'Encode::CN', + 'gb2312-raw' => 'Encode::CN', + 'hz' => 'Encode::CN', + 'iso-ir-165' => 'Encode::CN', + 'cp936' => 'Encode::CN', + 'MacChineseSimp' => 'Encode::CN', -unless (ord("A") == 193){ - %ExtModule = - ( - %ExtModule, - 'euc-cn' => 'Encode::CN', - 'gb12345-raw' => 'Encode::CN', - 'gb2312-raw' => 'Encode::CN', - 'hz' => 'Encode::CN', - 'iso-ir-165' => 'Encode::CN', - 'cp936' => 'Encode::CN', - 'MacChineseSimp' => 'Encode::CN', - - '7bit-jis' => 'Encode::JP', - 'euc-jp' => 'Encode::JP', - 'iso-2022-jp' => 'Encode::JP', - 'iso-2022-jp-1' => 'Encode::JP', - 'jis0201-raw' => 'Encode::JP', - 'jis0208-raw' => 'Encode::JP', - 'jis0212-raw' => 'Encode::JP', - 'cp932' => 'Encode::JP', - 'MacJapanese' => 'Encode::JP', - 'shiftjis' => 'Encode::JP', - - - 'euc-kr' => 'Encode::KR', - 'iso-2022-kr' => 'Encode::KR', - 'johab' => 'Encode::KR', - 'ksc5601-raw' => 'Encode::KR', - 'cp949' => 'Encode::KR', - 'MacKorean' => 'Encode::KR', - - 'big5-eten' => 'Encode::TW', - 'big5-hkscs' => 'Encode::TW', - 'cp950' => 'Encode::TW', - 'MacChineseTrad' => 'Encode::TW', - - #'big5plus' => 'Encode::HanExtra', - #'euc-tw' => 'Encode::HanExtra', - #'gb18030' => 'Encode::HanExtra', - - 'MIME-Header' => 'Encode::MIME::Header', - 'MIME-B' => 'Encode::MIME::Header', - 'MIME-Q' => 'Encode::MIME::Header', - - 'MIME-Header-ISO_2022_JP' => 'Encode::MIME::Header::ISO_2022_JP', - ); + '7bit-jis' => 'Encode::JP', + 'euc-jp' => 'Encode::JP', + 'iso-2022-jp' => 'Encode::JP', + 'iso-2022-jp-1' => 'Encode::JP', + 'jis0201-raw' => 'Encode::JP', + 'jis0208-raw' => 'Encode::JP', + 'jis0212-raw' => 'Encode::JP', + 'cp932' => 'Encode::JP', + 'MacJapanese' => 'Encode::JP', + 'shiftjis' => 'Encode::JP', + + 'euc-kr' => 'Encode::KR', + 'iso-2022-kr' => 'Encode::KR', + 'johab' => 'Encode::KR', + 'ksc5601-raw' => 'Encode::KR', + 'cp949' => 'Encode::KR', + 'MacKorean' => 'Encode::KR', + + 'big5-eten' => 'Encode::TW', + 'big5-hkscs' => 'Encode::TW', + 'cp950' => 'Encode::TW', + 'MacChineseTrad' => 'Encode::TW', + + #'big5plus' => 'Encode::HanExtra', + #'euc-tw' => 'Encode::HanExtra', + #'gb18030' => 'Encode::HanExtra', + + 'MIME-Header' => 'Encode::MIME::Header', + 'MIME-B' => 'Encode::MIME::Header', + 'MIME-Q' => 'Encode::MIME::Header', + + 'MIME-Header-ISO_2022_JP' => 'Encode::MIME::Header::ISO_2022_JP', + ); } # # Why not export ? to keep ConfigLocal Happy! # -while (my ($enc,$mod) = each %ExtModule){ +while ( my ( $enc, $mod ) = each %ExtModule ) { $Encode::ExtModule{$enc} = $mod; } diff --git a/ext/Encode/lib/Encode/Encoder.pm b/ext/Encode/lib/Encode/Encoder.pm index fe2a2b9..f7194f8 100644 --- a/ext/Encode/lib/Encode/Encoder.pm +++ b/ext/Encode/lib/Encode/Encoder.pm @@ -1,13 +1,13 @@ # -# $Id: Encoder.pm,v 2.0 2004/05/16 20:55:17 dankogai Exp $ +# $Id: Encoder.pm,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # package Encode::Encoder; use strict; use warnings; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; require Exporter; -our @ISA = qw(Exporter); +our @ISA = qw(Exporter); our @EXPORT_OK = qw ( encoder ); our $AUTOLOAD; @@ -15,83 +15,87 @@ sub DEBUG () { 0 } use Encode qw(encode decode find_encoding from_to); use Carp; -sub new{ - my ($class, $data, $encname) = @_; - unless($encname){ - $encname = Encode::is_utf8($data) ? 'utf8' : ''; - }else{ - my $obj = find_encoding($encname) - or croak __PACKAGE__, ": unknown encoding: $encname"; - $encname = $obj->name; +sub new { + my ( $class, $data, $encname ) = @_; + unless ($encname) { + $encname = Encode::is_utf8($data) ? 'utf8' : ''; + } + else { + my $obj = find_encoding($encname) + or croak __PACKAGE__, ": unknown encoding: $encname"; + $encname = $obj->name; } my $self = { - data => $data, - encoding => $encname, - }; + data => $data, + encoding => $encname, + }; bless $self => $class; } -sub encoder{ __PACKAGE__->new(@_) } +sub encoder { __PACKAGE__->new(@_) } -sub data{ - my ($self, $data) = @_; - if (defined $data){ - $self->{data} = $data; - return $data; - }else{ - return $self->{data}; +sub data { + my ( $self, $data ) = @_; + if ( defined $data ) { + $self->{data} = $data; + return $data; + } + else { + return $self->{data}; } } -sub encoding{ - my ($self, $encname) = @_; - if ($encname){ - my $obj = find_encoding($encname) - or confess __PACKAGE__, ": unknown encoding: $encname"; - $self->{encoding} = $obj->name; - return $self; - }else{ - return $self->{encoding} +sub encoding { + my ( $self, $encname ) = @_; + if ($encname) { + my $obj = find_encoding($encname) + or confess __PACKAGE__, ": unknown encoding: $encname"; + $self->{encoding} = $obj->name; + return $self; + } + else { + return $self->{encoding}; } } sub bytes { - my ($self, $encname) = @_; + my ( $self, $encname ) = @_; $encname ||= $self->{encoding}; - my $obj = find_encoding($encname) - or confess __PACKAGE__, ": unknown encoding: $encname"; - $self->{data} = $obj->decode($self->{data}, 1); - $self->{encoding} = '' ; + my $obj = find_encoding($encname) + or confess __PACKAGE__, ": unknown encoding: $encname"; + $self->{data} = $obj->decode( $self->{data}, 1 ); + $self->{encoding} = ''; return $self; } -sub DESTROY{ # defined so it won't autoload. +sub DESTROY { # defined so it won't autoload. DEBUG and warn shift; } sub AUTOLOAD { my $self = shift; my $type = ref($self) - or confess "$self is not an object"; + or confess "$self is not an object"; my $myname = $AUTOLOAD; - $myname =~ s/.*://; # strip fully-qualified portion - my $obj = find_encoding($myname) - or confess __PACKAGE__, ": unknown encoding: $myname"; + $myname =~ s/.*://; # strip fully-qualified portion + my $obj = find_encoding($myname) + or confess __PACKAGE__, ": unknown encoding: $myname"; DEBUG and warn $self->{encoding}, " => ", $obj->name; - if ($self->{encoding}){ - from_to($self->{data}, $self->{encoding}, $obj->name, 1); - }else{ - $self->{data} = $obj->encode($self->{data}, 1); + if ( $self->{encoding} ) { + from_to( $self->{data}, $self->{encoding}, $obj->name, 1 ); + } + else { + $self->{data} = $obj->encode( $self->{data}, 1 ); } $self->{encoding} = $obj->name; return $self; } -use overload - q("") => sub { $_[0]->{data} }, - q(0+) => sub { use bytes (); bytes::length($_[0]->{data}) }, - fallback => 1, - ; +use overload + q("") => sub { $_[0]->{data} }, + q(0+) => sub { use bytes(); bytes::length( $_[0]->{data} ) }, + fallback => 1, + ; 1; __END__ diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm index 06af9fb..47c9308 100644 --- a/ext/Encode/lib/Encode/Encoding.pm +++ b/ext/Encode/lib/Encode/Encoding.pm @@ -1,47 +1,49 @@ package Encode::Encoding; + # Base class for classes which implement encodings use strict; -our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; require Encode; sub DEBUG { 0 } -sub Define -{ - my $obj = shift; + +sub Define { + my $obj = shift; my $canonical = shift; - $obj = bless { Name => $canonical },$obj unless ref $obj; + $obj = bless { Name => $canonical }, $obj unless ref $obj; + # warn "$canonical => $obj\n"; - Encode::define_encoding($obj, $canonical, @_); + Encode::define_encoding( $obj, $canonical, @_ ); } -sub name { return shift->{'Name'} } +sub name { return shift->{'Name'} } # sub renew { return $_[0] } sub renew { my $self = shift; - my $clone = bless { %$self } => ref($self); - $clone->{renewed}++; # so the caller can see it + my $clone = bless {%$self} => ref($self); + $clone->{renewed}++; # so the caller can see it DEBUG and warn $clone->{renewed}; return $clone; } -sub renewed{ return $_[0]->{renewed} || 0 } +sub renewed { return $_[0]->{renewed} || 0 } *new_sequence = \&renew; -sub needs_lines { 0 }; +sub needs_lines { 0 } -sub perlio_ok { - eval{ require PerlIO::encoding }; +sub perlio_ok { + eval { require PerlIO::encoding }; return $@ ? 0 : 1; } # (Temporary|legacy) methods -sub toUnicode { shift->decode(@_) } -sub fromUnicode { shift->encode(@_) } +sub toUnicode { shift->decode(@_) } +sub fromUnicode { shift->encode(@_) } # # Needs to be overloaded or just croak @@ -51,17 +53,17 @@ sub encode { require Carp; my $obj = shift; my $class = ref($obj) ? ref($obj) : $obj; - Carp::croak($class . "->encode() not defined!"); + Carp::croak( $class . "->encode() not defined!" ); } -sub decode{ +sub decode { require Carp; my $obj = shift; my $class = ref($obj) ? ref($obj) : $obj; - Carp::croak($class . "->encode() not defined!"); + Carp::croak( $class . "->encode() not defined!" ); } -sub DESTROY {} +sub DESTROY { } 1; __END__ diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm index 5692cee..260616e 100644 --- a/ext/Encode/lib/Encode/Guess.pm +++ b/ext/Encode/lib/Encode/Guess.pm @@ -2,65 +2,64 @@ package Encode::Guess; use strict; use Encode qw(:fallbacks find_encoding); -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; my $Canon = 'Guess'; sub DEBUG () { 0 } our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); -$Encode::Encoding{$Canon} = - bless { - Name => $Canon, - Suspects => { %DEF_SUSPECTS }, - } => __PACKAGE__; +$Encode::Encoding{$Canon} = bless { + Name => $Canon, + Suspects => {%DEF_SUSPECTS}, +} => __PACKAGE__; use base qw(Encode::Encoding); sub needs_lines { 1 } -sub perlio_ok { 0 } +sub perlio_ok { 0 } -our @EXPORT = qw(guess_encoding); +our @EXPORT = qw(guess_encoding); our $NoUTFAutoGuess = 0; -our $UTF8_BOM = pack("C3", 0xef, 0xbb, 0xbf); +our $UTF8_BOM = pack( "C3", 0xef, 0xbb, 0xbf ); -sub import { # Exporter not used so we do it on our own +sub import { # Exporter not used so we do it on our own my $callpkg = caller; - for my $item (@EXPORT){ - no strict 'refs'; - *{"$callpkg\::$item"} = \&{"$item"}; + for my $item (@EXPORT) { + no strict 'refs'; + *{"$callpkg\::$item"} = \&{"$item"}; } set_suspects(@_); } -sub set_suspects{ +sub set_suspects { my $class = shift; my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; - $self->{Suspects} = { %DEF_SUSPECTS }; + $self->{Suspects} = {%DEF_SUSPECTS}; $self->add_suspects(@_); } -sub add_suspects{ +sub add_suspects { my $class = shift; my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; - for my $c (@_){ - my $e = find_encoding($c) or die "Unknown encoding: $c"; - $self->{Suspects}{$e->name} = $e; - DEBUG and warn "Added: ", $e->name; + for my $c (@_) { + my $e = find_encoding($c) or die "Unknown encoding: $c"; + $self->{Suspects}{ $e->name } = $e; + DEBUG and warn "Added: ", $e->name; } } -sub decode($$;$){ - my ($obj, $octet, $chk) = @_; - my $guessed = guess($obj, $octet); - unless (ref($guessed)){ - require Carp; - Carp::croak($guessed); +sub decode($$;$) { + my ( $obj, $octet, $chk ) = @_; + my $guessed = guess( $obj, $octet ); + unless ( ref($guessed) ) { + require Carp; + Carp::croak($guessed); } - my $utf8 = $guessed->decode($octet, $chk); + my $utf8 = $guessed->decode( $octet, $chk ); $_[1] = $octet if $chk; return $utf8; } -sub guess_encoding{ - guess($Encode::Encoding{$Canon}, @_); +sub guess_encoding { + guess( $Encode::Encoding{$Canon}, @_ ); } sub guess { @@ -73,90 +72,95 @@ sub guess { # cheat 0: utf8 flag; if ( Encode::is_utf8($octet) ) { - return find_encoding('utf8') unless $NoUTFAutoGuess; - Encode::_utf8_off($octet); + return find_encoding('utf8') unless $NoUTFAutoGuess; + Encode::_utf8_off($octet); } + # cheat 1: BOM use Encode::Unicode; unless ($NoUTFAutoGuess) { - my $BOM = pack('C3', unpack("C3", $octet)); - return find_encoding('utf8') - if (defined $BOM and $BOM eq $UTF8_BOM); - $BOM = unpack('N', $octet); - return find_encoding('UTF-32') - if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); - $BOM = unpack('n', $octet); - return find_encoding('UTF-16') - if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); - if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) - my $utf; - my ($be, $le) = (0, 0); - if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed - $utf = "UTF-32"; - for my $char (unpack('N*', $octet)){ - $char & 0x0000ffff and $be++; - $char & 0xffff0000 and $le++; - } - }else{ # UTF-16(BE|LE) assumed - $utf = "UTF-16"; - for my $char (unpack('n*', $octet)){ - $char & 0x00ff and $be++; - $char & 0xff00 and $le++; - } - } - DEBUG and warn "$utf, be == $be, le == $le"; - $be == $le - and return - "Encodings ambiguous between $utf BE and LE ($be, $le)"; - $utf .= ($be > $le) ? 'BE' : 'LE'; - return find_encoding($utf); - } + my $BOM = pack( 'C3', unpack( "C3", $octet ) ); + return find_encoding('utf8') + if ( defined $BOM and $BOM eq $UTF8_BOM ); + $BOM = unpack( 'N', $octet ); + return find_encoding('UTF-32') + if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) ); + $BOM = unpack( 'n', $octet ); + return find_encoding('UTF-16') + if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) ); + if ( $octet =~ /\x00/o ) + { # if \x00 found, we assume UTF-(16|32)(BE|LE) + my $utf; + my ( $be, $le ) = ( 0, 0 ); + if ( $octet =~ /\x00\x00/o ) { # UTF-32(BE|LE) assumed + $utf = "UTF-32"; + for my $char ( unpack( 'N*', $octet ) ) { + $char & 0x0000ffff and $be++; + $char & 0xffff0000 and $le++; + } + } + else { # UTF-16(BE|LE) assumed + $utf = "UTF-16"; + for my $char ( unpack( 'n*', $octet ) ) { + $char & 0x00ff and $be++; + $char & 0xff00 and $le++; + } + } + DEBUG and warn "$utf, be == $be, le == $le"; + $be == $le + and return + "Encodings ambiguous between $utf BE and LE ($be, $le)"; + $utf .= ( $be > $le ) ? 'BE' : 'LE'; + return find_encoding($utf); + } } - my %try = %{$obj->{Suspects}}; - for my $c (@_){ - my $e = find_encoding($c) or die "Unknown encoding: $c"; - $try{$e->name} = $e; - DEBUG and warn "Added: ", $e->name; + my %try = %{ $obj->{Suspects} }; + for my $c (@_) { + my $e = find_encoding($c) or die "Unknown encoding: $c"; + $try{ $e->name } = $e; + DEBUG and warn "Added: ", $e->name; } my $nline = 1; - for my $line (split /\r\n?|\n/, $octet){ - # cheat 2 -- \e in the string - if ($line =~ /\e/o){ - my @keys = keys %try; - delete @try{qw/utf8 ascii/}; - for my $k (@keys){ - ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; - } - } - my %ok = %try; - # warn join(",", keys %try); - for my $k (keys %try){ - my $scratch = $line; - $try{$k}->decode($scratch, FB_QUIET); - if ($scratch eq ''){ - DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); - }else{ - use bytes (); - DEBUG and - warn sprintf("%4d:%-24s not ok; %d bytes left\n", - $nline, $k, bytes::length($scratch)); - delete $ok{$k}; - } - } - %ok or return "No appropriate encodings found!"; - if (scalar(keys(%ok)) == 1){ - my ($retval) = values(%ok); - return $retval; - } - %try = %ok; $nline++; + for my $line ( split /\r\n?|\n/, $octet ) { + + # cheat 2 -- \e in the string + if ( $line =~ /\e/o ) { + my @keys = keys %try; + delete @try{qw/utf8 ascii/}; + for my $k (@keys) { + ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k}; + } + } + my %ok = %try; + + # warn join(",", keys %try); + for my $k ( keys %try ) { + my $scratch = $line; + $try{$k}->decode( $scratch, FB_QUIET ); + if ( $scratch eq '' ) { + DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k ); + } + else { + use bytes (); + DEBUG + and warn sprintf( "%4d:%-24s not ok; %d bytes left\n", + $nline, $k, bytes::length($scratch) ); + delete $ok{$k}; + } + } + %ok or return "No appropriate encodings found!"; + if ( scalar( keys(%ok) ) == 1 ) { + my ($retval) = values(%ok); + return $retval; + } + %try = %ok; + $nline++; } - $try{ascii} or - return "Encodings too ambiguous: ", join(" or ", keys %try); + $try{ascii} + or return "Encodings too ambiguous: ", join( " or ", keys %try ); return $try{ascii}; } - - 1; __END__ diff --git a/ext/Encode/lib/Encode/JP/H2Z.pm b/ext/Encode/lib/Encode/JP/H2Z.pm index 0c84c62..36a074a 100644 --- a/ext/Encode/lib/Encode/JP/H2Z.pm +++ b/ext/Encode/lib/Encode/JP/H2Z.pm @@ -1,114 +1,114 @@ # -# $Id: H2Z.pm,v 2.0 2004/05/16 20:55:17 dankogai Exp $ +# $Id: H2Z.pm,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # package Encode::JP::H2Z; use strict; -our $RCSID = q$Id: H2Z.pm,v 2.0 2004/05/16 20:55:17 dankogai Exp $; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $RCSID = q$Id: H2Z.pm,v 2.1 2006/05/03 18:24:10 dankogai Exp $; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode::CJKConstants qw(:all); use vars qw(%_D2Z $_PAT_D2Z - %_Z2D $_PAT_Z2D - %_H2Z $_PAT_H2Z - %_Z2H $_PAT_Z2H); + %_Z2D $_PAT_Z2D + %_H2Z $_PAT_H2Z + %_Z2H $_PAT_Z2H); %_H2Z = ( - "\x8e\xa1" => "\xa1\xa3", #¡£ - "\x8e\xa2" => "\xa1\xd6", #¡Ö - "\x8e\xa3" => "\xa1\xd7", #¡× - "\x8e\xa4" => "\xa1\xa2", #¡¢ - "\x8e\xa5" => "\xa1\xa6", #¡¦ - "\x8e\xa6" => "\xa5\xf2", #¥ò - "\x8e\xa7" => "\xa5\xa1", #¥¡ - "\x8e\xa8" => "\xa5\xa3", #¥£ - "\x8e\xa9" => "\xa5\xa5", #¥¥ - "\x8e\xaa" => "\xa5\xa7", #¥§ - "\x8e\xab" => "\xa5\xa9", #¥© - "\x8e\xac" => "\xa5\xe3", #¥ã - "\x8e\xad" => "\xa5\xe5", #¥å - "\x8e\xae" => "\xa5\xe7", #¥ç - "\x8e\xaf" => "\xa5\xc3", #¥Ã - "\x8e\xb0" => "\xa1\xbc", #¡¼ - "\x8e\xb1" => "\xa5\xa2", #¥¢ - "\x8e\xb2" => "\xa5\xa4", #¥¤ - "\x8e\xb3" => "\xa5\xa6", #¥¦ - "\x8e\xb4" => "\xa5\xa8", #¥¨ - "\x8e\xb5" => "\xa5\xaa", #¥ª - "\x8e\xb6" => "\xa5\xab", #¥« - "\x8e\xb7" => "\xa5\xad", #¥­ - "\x8e\xb8" => "\xa5\xaf", #¥¯ - "\x8e\xb9" => "\xa5\xb1", #¥± - "\x8e\xba" => "\xa5\xb3", #¥³ - "\x8e\xbb" => "\xa5\xb5", #¥µ - "\x8e\xbc" => "\xa5\xb7", #¥· - "\x8e\xbd" => "\xa5\xb9", #¥¹ - "\x8e\xbe" => "\xa5\xbb", #¥» - "\x8e\xbf" => "\xa5\xbd", #¥½ - "\x8e\xc0" => "\xa5\xbf", #¥¿ - "\x8e\xc1" => "\xa5\xc1", #¥Á - "\x8e\xc2" => "\xa5\xc4", #¥Ä - "\x8e\xc3" => "\xa5\xc6", #¥Æ - "\x8e\xc4" => "\xa5\xc8", #¥È - "\x8e\xc5" => "\xa5\xca", #¥Ê - "\x8e\xc6" => "\xa5\xcb", #¥Ë - "\x8e\xc7" => "\xa5\xcc", #¥Ì - "\x8e\xc8" => "\xa5\xcd", #¥Í - "\x8e\xc9" => "\xa5\xce", #¥Î - "\x8e\xca" => "\xa5\xcf", #¥Ï - "\x8e\xcb" => "\xa5\xd2", #¥Ò - "\x8e\xcc" => "\xa5\xd5", #¥Õ - "\x8e\xcd" => "\xa5\xd8", #¥Ø - "\x8e\xce" => "\xa5\xdb", #¥Û - "\x8e\xcf" => "\xa5\xde", #¥Þ - "\x8e\xd0" => "\xa5\xdf", #¥ß - "\x8e\xd1" => "\xa5\xe0", #¥à - "\x8e\xd2" => "\xa5\xe1", #¥á - "\x8e\xd3" => "\xa5\xe2", #¥â - "\x8e\xd4" => "\xa5\xe4", #¥ä - "\x8e\xd5" => "\xa5\xe6", #¥æ - "\x8e\xd6" => "\xa5\xe8", #¥è - "\x8e\xd7" => "\xa5\xe9", #¥é - "\x8e\xd8" => "\xa5\xea", #¥ê - "\x8e\xd9" => "\xa5\xeb", #¥ë - "\x8e\xda" => "\xa5\xec", #¥ì - "\x8e\xdb" => "\xa5\xed", #¥í - "\x8e\xdc" => "\xa5\xef", #¥ï - "\x8e\xdd" => "\xa5\xf3", #¥ó - "\x8e\xde" => "\xa1\xab", #¡« - "\x8e\xdf" => "\xa1\xac", #¡¬ + "\x8e\xa1" => "\xa1\xa3", #¡£ + "\x8e\xa2" => "\xa1\xd6", #¡Ö + "\x8e\xa3" => "\xa1\xd7", #¡× + "\x8e\xa4" => "\xa1\xa2", #¡¢ + "\x8e\xa5" => "\xa1\xa6", #¡¦ + "\x8e\xa6" => "\xa5\xf2", #¥ò + "\x8e\xa7" => "\xa5\xa1", #¥¡ + "\x8e\xa8" => "\xa5\xa3", #¥£ + "\x8e\xa9" => "\xa5\xa5", #¥¥ + "\x8e\xaa" => "\xa5\xa7", #¥§ + "\x8e\xab" => "\xa5\xa9", #¥© + "\x8e\xac" => "\xa5\xe3", #¥ã + "\x8e\xad" => "\xa5\xe5", #¥å + "\x8e\xae" => "\xa5\xe7", #¥ç + "\x8e\xaf" => "\xa5\xc3", #¥Ã + "\x8e\xb0" => "\xa1\xbc", #¡¼ + "\x8e\xb1" => "\xa5\xa2", #¥¢ + "\x8e\xb2" => "\xa5\xa4", #¥¤ + "\x8e\xb3" => "\xa5\xa6", #¥¦ + "\x8e\xb4" => "\xa5\xa8", #¥¨ + "\x8e\xb5" => "\xa5\xaa", #¥ª + "\x8e\xb6" => "\xa5\xab", #¥« + "\x8e\xb7" => "\xa5\xad", #¥­ + "\x8e\xb8" => "\xa5\xaf", #¥¯ + "\x8e\xb9" => "\xa5\xb1", #¥± + "\x8e\xba" => "\xa5\xb3", #¥³ + "\x8e\xbb" => "\xa5\xb5", #¥µ + "\x8e\xbc" => "\xa5\xb7", #¥· + "\x8e\xbd" => "\xa5\xb9", #¥¹ + "\x8e\xbe" => "\xa5\xbb", #¥» + "\x8e\xbf" => "\xa5\xbd", #¥½ + "\x8e\xc0" => "\xa5\xbf", #¥¿ + "\x8e\xc1" => "\xa5\xc1", #¥Á + "\x8e\xc2" => "\xa5\xc4", #¥Ä + "\x8e\xc3" => "\xa5\xc6", #¥Æ + "\x8e\xc4" => "\xa5\xc8", #¥È + "\x8e\xc5" => "\xa5\xca", #¥Ê + "\x8e\xc6" => "\xa5\xcb", #¥Ë + "\x8e\xc7" => "\xa5\xcc", #¥Ì + "\x8e\xc8" => "\xa5\xcd", #¥Í + "\x8e\xc9" => "\xa5\xce", #¥Î + "\x8e\xca" => "\xa5\xcf", #¥Ï + "\x8e\xcb" => "\xa5\xd2", #¥Ò + "\x8e\xcc" => "\xa5\xd5", #¥Õ + "\x8e\xcd" => "\xa5\xd8", #¥Ø + "\x8e\xce" => "\xa5\xdb", #¥Û + "\x8e\xcf" => "\xa5\xde", #¥Þ + "\x8e\xd0" => "\xa5\xdf", #¥ß + "\x8e\xd1" => "\xa5\xe0", #¥à + "\x8e\xd2" => "\xa5\xe1", #¥á + "\x8e\xd3" => "\xa5\xe2", #¥â + "\x8e\xd4" => "\xa5\xe4", #¥ä + "\x8e\xd5" => "\xa5\xe6", #¥æ + "\x8e\xd6" => "\xa5\xe8", #¥è + "\x8e\xd7" => "\xa5\xe9", #¥é + "\x8e\xd8" => "\xa5\xea", #¥ê + "\x8e\xd9" => "\xa5\xeb", #¥ë + "\x8e\xda" => "\xa5\xec", #¥ì + "\x8e\xdb" => "\xa5\xed", #¥í + "\x8e\xdc" => "\xa5\xef", #¥ï + "\x8e\xdd" => "\xa5\xf3", #¥ó + "\x8e\xde" => "\xa1\xab", #¡« + "\x8e\xdf" => "\xa1\xac", #¡¬ ); %_D2Z = ( - "\x8e\xb6\x8e\xde" => "\xa5\xac", #¥¬ - "\x8e\xb7\x8e\xde" => "\xa5\xae", #¥® - "\x8e\xb8\x8e\xde" => "\xa5\xb0", #¥° - "\x8e\xb9\x8e\xde" => "\xa5\xb2", #¥² - "\x8e\xba\x8e\xde" => "\xa5\xb4", #¥´ - "\x8e\xbb\x8e\xde" => "\xa5\xb6", #¥¶ - "\x8e\xbc\x8e\xde" => "\xa5\xb8", #¥¸ - "\x8e\xbd\x8e\xde" => "\xa5\xba", #¥º - "\x8e\xbe\x8e\xde" => "\xa5\xbc", #¥¼ - "\x8e\xbf\x8e\xde" => "\xa5\xbe", #¥¾ - "\x8e\xc0\x8e\xde" => "\xa5\xc0", #¥À - "\x8e\xc1\x8e\xde" => "\xa5\xc2", #¥Â - "\x8e\xc2\x8e\xde" => "\xa5\xc5", #¥Å - "\x8e\xc3\x8e\xde" => "\xa5\xc7", #¥Ç - "\x8e\xc4\x8e\xde" => "\xa5\xc9", #¥É - "\x8e\xca\x8e\xde" => "\xa5\xd0", #¥Ð - "\x8e\xcb\x8e\xde" => "\xa5\xd3", #¥Ó - "\x8e\xcc\x8e\xde" => "\xa5\xd6", #¥Ö - "\x8e\xcd\x8e\xde" => "\xa5\xd9", #¥Ù - "\x8e\xce\x8e\xde" => "\xa5\xdc", #¥Ü - "\x8e\xca\x8e\xdf" => "\xa5\xd1", #¥Ñ - "\x8e\xcb\x8e\xdf" => "\xa5\xd4", #¥Ô - "\x8e\xcc\x8e\xdf" => "\xa5\xd7", #¥× - "\x8e\xcd\x8e\xdf" => "\xa5\xda", #¥Ú - "\x8e\xce\x8e\xdf" => "\xa5\xdd", #¥Ý - "\x8e\xb3\x8e\xde" => "\xa5\xf4", #¥ô + "\x8e\xb6\x8e\xde" => "\xa5\xac", #¥¬ + "\x8e\xb7\x8e\xde" => "\xa5\xae", #¥® + "\x8e\xb8\x8e\xde" => "\xa5\xb0", #¥° + "\x8e\xb9\x8e\xde" => "\xa5\xb2", #¥² + "\x8e\xba\x8e\xde" => "\xa5\xb4", #¥´ + "\x8e\xbb\x8e\xde" => "\xa5\xb6", #¥¶ + "\x8e\xbc\x8e\xde" => "\xa5\xb8", #¥¸ + "\x8e\xbd\x8e\xde" => "\xa5\xba", #¥º + "\x8e\xbe\x8e\xde" => "\xa5\xbc", #¥¼ + "\x8e\xbf\x8e\xde" => "\xa5\xbe", #¥¾ + "\x8e\xc0\x8e\xde" => "\xa5\xc0", #¥À + "\x8e\xc1\x8e\xde" => "\xa5\xc2", #¥Â + "\x8e\xc2\x8e\xde" => "\xa5\xc5", #¥Å + "\x8e\xc3\x8e\xde" => "\xa5\xc7", #¥Ç + "\x8e\xc4\x8e\xde" => "\xa5\xc9", #¥É + "\x8e\xca\x8e\xde" => "\xa5\xd0", #¥Ð + "\x8e\xcb\x8e\xde" => "\xa5\xd3", #¥Ó + "\x8e\xcc\x8e\xde" => "\xa5\xd6", #¥Ö + "\x8e\xcd\x8e\xde" => "\xa5\xd9", #¥Ù + "\x8e\xce\x8e\xde" => "\xa5\xdc", #¥Ü + "\x8e\xca\x8e\xdf" => "\xa5\xd1", #¥Ñ + "\x8e\xcb\x8e\xdf" => "\xa5\xd4", #¥Ô + "\x8e\xcc\x8e\xdf" => "\xa5\xd7", #¥× + "\x8e\xcd\x8e\xdf" => "\xa5\xda", #¥Ú + "\x8e\xce\x8e\xdf" => "\xa5\xdd", #¥Ý + "\x8e\xb3\x8e\xde" => "\xa5\xf4", #¥ô ); # init only once; @@ -124,42 +124,43 @@ use vars qw(%_D2Z $_PAT_D2Z sub h2z { no warnings qw(uninitialized); - my $r_str = shift; + my $r_str = shift; my ($keep_dakuten) = @_; - my $n = 0; - unless ($keep_dakuten){ - $n = ( - $$r_str =~ s( - ($RE{EUC_KANA} - (?:\x8e[\xde\xdf])?) - ){ - my $str = $1; - $_D2Z{$str} || $_H2Z{$str} || - # in case dakuten and handakuten are side-by-side! - $_H2Z{substr($str,0,2)} . $_H2Z{substr($str,2,2)}; - }eogx - ); - }else{ - $n = ( - $$r_str =~ s( - ($RE{EUC_KANA}) - ){ - $_H2Z{$1}; - }eogx - ); + my $n = 0; + unless ($keep_dakuten) { + $n = ( + $$r_str =~ s( + ($RE{EUC_KANA} + (?:\x8e[\xde\xdf])?) + ){ + my $str = $1; + $_D2Z{$str} || $_H2Z{$str} || + # in case dakuten and handakuten are side-by-side! + $_H2Z{substr($str,0,2)} . $_H2Z{substr($str,2,2)}; + }eogx + ); + } + else { + $n = ( + $$r_str =~ s( + ($RE{EUC_KANA}) + ){ + $_H2Z{$1}; + }eogx + ); } $n; } sub z2h { my $r_str = shift; - my $n = ( - $$r_str =~ s( - ($RE{EUC_C}|$RE{EUC_0212}|$RE{EUC_KANA}) - ){ - $_Z2D{$1} || $_Z2H{$1} || $1; - }eogx - ); + my $n = ( + $$r_str =~ s( + ($RE{EUC_C}|$RE{EUC_0212}|$RE{EUC_KANA}) + ){ + $_Z2D{$1} || $_Z2H{$1} || $1; + }eogx + ); $n; } diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm index 28503ec..822461a 100644 --- a/ext/Encode/lib/Encode/JP/JIS7.pm +++ b/ext/Encode/lib/Encode/JP/JIS7.pm @@ -1,20 +1,19 @@ package Encode::JP::JIS7; use strict; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(:fallbacks); -for my $name ('7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1'){ - my $h2z = ($name eq '7bit-jis') ? 0 : 1; - my $jis0212 = ($name eq 'iso-2022-jp') ? 0 : 1; +for my $name ( '7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1' ) { + my $h2z = ( $name eq '7bit-jis' ) ? 0 : 1; + my $jis0212 = ( $name eq 'iso-2022-jp' ) ? 0 : 1; - $Encode::Encoding{$name} = - bless { - Name => $name, - h2z => $h2z, - jis0212 => $jis0212, - } => __PACKAGE__; + $Encode::Encoding{$name} = bless { + Name => $name, + h2z => $h2z, + jis0212 => $jis0212, + } => __PACKAGE__; } use base qw(Encode::Encoding); @@ -28,32 +27,31 @@ use Encode::CJKConstants qw(:all); # decode is identical for all 2022 variants # -sub decode($$;$) -{ - my ($obj, $str, $chk) = @_; +sub decode($$;$) { + my ( $obj, $str, $chk ) = @_; my $residue = ''; - if ($chk){ - $str =~ s/([^\x00-\x7f].*)$//so and $residue = $1; + if ($chk) { + $str =~ s/([^\x00-\x7f].*)$//so and $residue = $1; } - $residue .= jis_euc(\$str); + $residue .= jis_euc( \$str ); $_[1] = $residue if $chk; - return Encode::decode('euc-jp', $str, FB_PERLQQ); + return Encode::decode( 'euc-jp', $str, FB_PERLQQ ); } # # encode is different # -sub encode($$;$) -{ +sub encode($$;$) { require Encode::JP::H2Z; - my ($obj, $utf8, $chk) = @_; + my ( $obj, $utf8, $chk ) = @_; + # empty the input string in the stack so perlio is ok $_[1] = '' if $chk; - my ($h2z, $jis0212) = @$obj{qw(h2z jis0212)}; - my $octet = Encode::encode('euc-jp', $utf8, FB_PERLQQ) ; - $h2z and &Encode::JP::H2Z::h2z(\$octet); - euc_jis(\$octet, $jis0212); + my ( $h2z, $jis0212 ) = @$obj{qw(h2z jis0212)}; + my $octet = Encode::encode( 'euc-jp', $utf8, FB_PERLQQ ); + $h2z and &Encode::JP::H2Z::h2z( \$octet ); + euc_jis( \$octet, $jis0212 ); return $octet; } @@ -65,36 +63,38 @@ my $re_scan_jis_g = qr{ ($RE{ISO_ASC}) | ($RE{JIS_KANA}) | ) ([^\e]*) }x; -sub cat_decode { # ($obj, $dst, $src, $pos, $trm, $chk) - my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk - my ($rdst, $rsrc, $rpos) = \@_[1,2,3]; + +sub cat_decode { # ($obj, $dst, $src, $pos, $trm, $chk) + my ( $obj, undef, undef, $pos, $trm ) = @_; # currently ignores $chk + my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; local ${^ENCODING}; use bytes; my $opos = pos($$rsrc); pos($$rsrc) = $pos; - while ($$rsrc =~ /$re_scan_jis_g/gc) { - my ($esc, $esc_0212, $esc_asc, $esc_kana, $chunk) = - ($1, $2, $3, $4, $5); - - unless ($chunk) { $esc or last; next; } - - if ($esc && !$esc_asc) { - $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; - if ($esc_kana) { - $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; - } elsif ($esc_0212) { - $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; - } - $chunk = Encode::decode('euc-jp', $chunk, 0); - } - elsif ((my $npos = index($chunk, $trm)) >= 0) { - $$rdst .= substr($chunk, 0, $npos + length($trm)); - $$rpos += length($esc) + $npos + length($trm); - pos($$rsrc) = $opos; - return 1; - } - $$rdst .= $chunk; - $$rpos = pos($$rsrc); + while ( $$rsrc =~ /$re_scan_jis_g/gc ) { + my ( $esc, $esc_0212, $esc_asc, $esc_kana, $chunk ) = + ( $1, $2, $3, $4, $5 ); + + unless ($chunk) { $esc or last; next; } + + if ( $esc && !$esc_asc ) { + $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; + if ($esc_kana) { + $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; + } + elsif ($esc_0212) { + $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; + } + $chunk = Encode::decode( 'euc-jp', $chunk, 0 ); + } + elsif ( ( my $npos = index( $chunk, $trm ) ) >= 0 ) { + $$rdst .= substr( $chunk, 0, $npos + length($trm) ); + $$rpos += length($esc) + $npos + length($trm); + pos($$rsrc) = $opos; + return 1; + } + $$rdst .= $chunk; + $$rpos = pos($$rsrc); } $$rpos = pos($$rsrc); pos($$rsrc) = $opos; @@ -111,46 +111,45 @@ sub jis_euc { my $r_str = shift; $$r_str =~ s($re_scan_jis) { - my ($esc_0212, $esc_asc, $esc_kana, $chunk) = - ($1, $2, $3, $4); - if (!$esc_asc) { - $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; - if ($esc_kana) { - $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; - } - elsif ($esc_0212) { - $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; - } - } - $chunk; + my ($esc_0212, $esc_asc, $esc_kana, $chunk) = + ($1, $2, $3, $4); + if (!$esc_asc) { + $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; + if ($esc_kana) { + $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; + } + elsif ($esc_0212) { + $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; + } + } + $chunk; }geox; - my ($residue) = ($$r_str =~ s/(\e.*)$//so); + my ($residue) = ( $$r_str =~ s/(\e.*)$//so ); return $residue; } -sub euc_jis{ +sub euc_jis { no warnings qw(uninitialized); - my $r_str = shift; + my $r_str = shift; my $jis0212 = shift; $$r_str =~ s{ - ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) - }{ - my $chunk = $1; - my $esc = - ( $chunk =~ tr/\x8E//d ) ? $ESC{KANA} : - ( $chunk =~ tr/\x8F//d ) ? $ESC{JIS_0212} : - $ESC{JIS_0208}; - if ($esc eq $ESC{JIS_0212} && !$jis0212){ - # fallback to '?' - $chunk =~ tr/\xA1-\xFE/\x3F/; - }else{ - $chunk =~ tr/\xA1-\xFE/\x21-\x7E/; - } - $esc . $chunk . $ESC{ASC}; - }geox; - $$r_str =~ - s/\Q$ESC{ASC}\E - (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox; + ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) + }{ + my $chunk = $1; + my $esc = + ( $chunk =~ tr/\x8E//d ) ? $ESC{KANA} : + ( $chunk =~ tr/\x8F//d ) ? $ESC{JIS_0212} : + $ESC{JIS_0208}; + if ($esc eq $ESC{JIS_0212} && !$jis0212){ + # fallback to '?' + $chunk =~ tr/\xA1-\xFE/\x3F/; + }else{ + $chunk =~ tr/\xA1-\xFE/\x21-\x7E/; + } + $esc . $chunk . $ESC{ASC}; + }geox; + $$r_str =~ s/\Q$ESC{ASC}\E + (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox; $$r_str; } diff --git a/ext/Encode/lib/Encode/KR/2022_KR.pm b/ext/Encode/lib/Encode/KR/2022_KR.pm index 8b4052b..7388093 100644 --- a/ext/Encode/lib/Encode/KR/2022_KR.pm +++ b/ext/Encode/lib/Encode/KR/2022_KR.pm @@ -1,36 +1,36 @@ package Encode::KR::2022_KR; use strict; -our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(:fallbacks); use base qw(Encode::Encoding); __PACKAGE__->Define('iso-2022-kr'); -sub needs_lines { 1 } +sub needs_lines { 1 } -sub perlio_ok { - return 0; # for the time being +sub perlio_ok { + return 0; # for the time being } -sub decode -{ - my ($obj, $str, $chk) = @_; - my $res = $str; - my $residue = iso_euc(\$res); +sub decode { + my ( $obj, $str, $chk ) = @_; + my $res = $str; + my $residue = iso_euc( \$res ); + # This is for PerlIO $_[1] = $residue if $chk; - return Encode::decode('euc-kr', $res, FB_PERLQQ); + return Encode::decode( 'euc-kr', $res, FB_PERLQQ ); } -sub encode -{ - my ($obj, $utf8, $chk) = @_; +sub encode { + my ( $obj, $utf8, $chk ) = @_; + # empty the input string in the stack so perlio is ok $_[1] = '' if $chk; - my $octet = Encode::encode('euc-kr', $utf8, FB_PERLQQ) ; - euc_iso(\$octet); + my $octet = Encode::encode( 'euc-kr', $utf8, FB_PERLQQ ); + euc_iso( \$octet ); return $octet; } @@ -38,9 +38,9 @@ use Encode::CJKConstants qw(:all); # ISO<->EUC -sub iso_euc{ +sub iso_euc { my $r_str = shift; - $$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator + $$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator $$r_str =~ s{ # replace characters in GL \x0e # between SO(\x0e) and SI(\x0f) ([^\x0f]*) # with characters in GR @@ -51,15 +51,17 @@ sub iso_euc{ $out =~ tr/\x21-\x7e/\xa1-\xfe/; $out; }geox; - my ($residue) = ($$r_str =~ s/(\e.*)$//so); + my ($residue) = ( $$r_str =~ s/(\e.*)$//so ); return $residue; } -sub euc_iso{ +sub euc_iso { no warnings qw(uninitialized); my $r_str = shift; - substr($$r_str,0,0)=$ESC{'2022_KR'}; # put the designator at the beg. - $$r_str =~ s{ # move KS X 1001 characters in GR to GL + substr( $$r_str, 0, 0 ) = + $ESC{'2022_KR'}; # put the designator at the beg. + $$r_str =~ + s{ # move KS X 1001 characters in GR to GL ($RE{EUC_C}+) # and enclose them with SO and SI }{ my $str = $1; diff --git a/ext/Encode/lib/Encode/MIME/Header.pm b/ext/Encode/lib/Encode/MIME/Header.pm index 29fc858..6e4398e 100644 --- a/ext/Encode/lib/Encode/MIME/Header.pm +++ b/ext/Encode/lib/Encode/MIME/Header.pm @@ -1,180 +1,184 @@ package Encode::MIME::Header; use strict; + # use warnings; -our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(find_encoding encode_utf8 decode_utf8); use MIME::Base64; use Carp; -my %seed = - ( - decode_b => '1', # decodes 'B' encoding ? - decode_q => '1', # decodes 'Q' encoding ? - encode => 'B', # encode with 'B' or 'Q' ? - bpl => 75, # bytes per line - ); +my %seed = ( + decode_b => '1', # decodes 'B' encoding ? + decode_q => '1', # decodes 'Q' encoding ? + encode => 'B', # encode with 'B' or 'Q' ? + bpl => 75, # bytes per line +); $Encode::Encoding{'MIME-Header'} = - bless { - %seed, - Name => 'MIME-Header', - } => __PACKAGE__; - -$Encode::Encoding{'MIME-B'} = - bless { - %seed, - decode_q => 0, - Name => 'MIME-B', - } => __PACKAGE__; - -$Encode::Encoding{'MIME-Q'} = - bless { - %seed, - decode_q => 1, - encode => 'Q', - Name => 'MIME-Q', - } => __PACKAGE__; + bless { %seed, Name => 'MIME-Header', } => __PACKAGE__; + +$Encode::Encoding{'MIME-B'} = bless { + %seed, + decode_q => 0, + Name => 'MIME-B', +} => __PACKAGE__; + +$Encode::Encoding{'MIME-Q'} = bless { + %seed, + decode_q => 1, + encode => 'Q', + Name => 'MIME-Q', +} => __PACKAGE__; use base qw(Encode::Encoding); sub needs_lines { 1 } -sub perlio_ok{ 0 }; +sub perlio_ok { 0 } -sub decode($$;$){ +sub decode($$;$) { use utf8; - my ($obj, $str, $chk) = @_; + my ( $obj, $str, $chk ) = @_; + # zap spaces between encoded words $str =~ s/\?=\s+=\?/\?==\?/gos; + # multi-line header to single line $str =~ s/(:?\r|\n|\r\n)[ \t]//gos; - 1 while ($str =~ s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/); # Concat consecutive QP encoded mime headers - # Fixes breaking inside multi-byte characters + 1 while ( $str =~ + s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/ ) + ; # Concat consecutive QP encoded mime headers + # Fixes breaking inside multi-byte characters - $str =~ - s{ - =\? # begin encoded word - ([0-9A-Za-z\-_]+) # charset (encoding) + $str =~ s{ + =\? # begin encoded word + ([0-9A-Za-z\-_]+) # charset (encoding) (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) - \?([QqBb])\? # delimiter - (.*?) # Base64-encodede contents - \?= # end encoded word - }{ - if (uc($2) eq 'B'){ - $obj->{decode_b} or croak qq(MIME "B" unsupported); - decode_b($1, $3); - }elsif(uc($2) eq 'Q'){ - $obj->{decode_q} or croak qq(MIME "Q" unsupported); - decode_q($1, $3); - }else{ - croak qq(MIME "$2" encoding is nonexistent!); - } - }egox; + \?([QqBb])\? # delimiter + (.*?) # Base64-encodede contents + \?= # end encoded word + }{ + if (uc($2) eq 'B'){ + $obj->{decode_b} or croak qq(MIME "B" unsupported); + decode_b($1, $3); + }elsif(uc($2) eq 'Q'){ + $obj->{decode_q} or croak qq(MIME "Q" unsupported); + decode_q($1, $3); + }else{ + croak qq(MIME "$2" encoding is nonexistent!); + } + }egox; $_[1] = '' if $chk; return $str; } -sub decode_b{ - my $enc = shift; - my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); +sub decode_b { + my $enc = shift; + my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); my $db64 = decode_base64(shift); - return $d->name eq 'utf8' ? - Encode::decode_utf8($db64) : $d->decode($db64, Encode::FB_PERLQQ); + return $d->name eq 'utf8' + ? Encode::decode_utf8($db64) + : $d->decode( $db64, Encode::FB_PERLQQ ); } -sub decode_q{ - my ($enc, $q) = @_; +sub decode_q { + my ( $enc, $q ) = @_; my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); $q =~ s/_/ /go; $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; - return $d->name eq 'utf8' ? - Encode::decode_utf8($q) : $d->decode($q, Encode::FB_PERLQQ); + return $d->name eq 'utf8' + ? Encode::decode_utf8($q) + : $d->decode( $q, Encode::FB_PERLQQ ); } -my $especials = - join('|' => - map {quotemeta(chr($_))} - unpack("C*", qq{()<>@,;:\"\'/[]?.=})); +my $especials = + join( '|' => map { quotemeta( chr($_) ) } + unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) ); -my $re_encoded_word = - qr{ +my $re_encoded_word = qr{ (?: - =\? # begin encoded word - (?:[0-9A-Za-z\-_]+) # charset (encoding) + =\? # begin encoded word + (?:[0-9A-Za-z\-_]+) # charset (encoding) (?:\*\w+(?:-\w+)*)? # language (RFC 2231) - \?(?:[QqBb])\? # delimiter - (?:.*?) # Base64-encodede contents - \?= # end encoded word + \?(?:[QqBb])\? # delimiter + (?:.*?) # Base64-encodede contents + \?= # end encoded word ) }xo; my $re_especials = qr{$re_encoded_word|$especials}xo; -sub encode($$;$){ - my ($obj, $str, $chk) = @_; +sub encode($$;$) { + my ( $obj, $str, $chk ) = @_; my @line = (); - for my $line (split /\r|\n|\r\n/o, $str){ - my (@word, @subline); - for my $word (split /($re_especials)/o, $line){ - if ($word =~ /[^\x00-\x7f]/o or $word =~ /^$re_encoded_word$/o){ - push @word, $obj->_encode($word); - }else{ - push @word, $word; - } - } - my $subline = ''; - for my $word (@word){ - use bytes (); - if (bytes::length($subline) + bytes::length($word) > $obj->{bpl}){ - push @subline, $subline; - $subline = ''; - } - $subline .= $word; - } - $subline and push @subline, $subline; - push @line, join("\n " => @subline); + for my $line ( split /\r|\n|\r\n/o, $str ) { + my ( @word, @subline ); + for my $word ( split /($re_especials)/o, $line ) { + if ( $word =~ /[^\x00-\x7f]/o + or $word =~ /^$re_encoded_word$/o ) + { + push @word, $obj->_encode($word); + } + else { + push @word, $word; + } + } + my $subline = ''; + for my $word (@word) { + use bytes (); + if ( bytes::length($subline) + bytes::length($word) > + $obj->{bpl} ) + { + push @subline, $subline; + $subline = ''; + } + $subline .= $word; + } + $subline and push @subline, $subline; + push @line, join( "\n " => @subline ); } $_[1] = '' if $chk; - return join("\n", @line); + return join( "\n", @line ); } -use constant HEAD => '=?UTF-8?'; -use constant TAIL => '?='; +use constant HEAD => '=?UTF-8?'; +use constant TAIL => '?='; use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, }; -sub _encode{ - my ($o, $str) = @_; - my $enc = $o->{encode}; - my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL)); +sub _encode { + my ( $o, $str ) = @_; + my $enc = $o->{encode}; + my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) ); + # to coerce a floating-point arithmetics, the following contains # .0 in numbers -- dankogai - $llen *= $enc eq 'B' ? 3.0/4.0 : 1.0/3.0; + $llen *= $enc eq 'B' ? 3.0 / 4.0 : 1.0 / 3.0; my @result = (); - my $chunk = ''; - while(length(my $chr = substr($str, 0, 1, ''))){ - use bytes (); - if (bytes::length($chunk) + bytes::length($chr) > $llen){ - push @result, SINGLE->{$enc}($chunk); - $chunk = ''; - } - $chunk .= $chr; + my $chunk = ''; + while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) { + use bytes (); + if ( bytes::length($chunk) + bytes::length($chr) > $llen ) { + push @result, SINGLE->{$enc}($chunk); + $chunk = ''; + } + $chunk .= $chr; } $chunk and push @result, SINGLE->{$enc}($chunk); return @result; } -sub _encode_b{ - HEAD . 'B?' . encode_base64(encode_utf8(shift), '') . TAIL; +sub _encode_b { + HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL; } -sub _encode_q{ +sub _encode_q { my $chunk = shift; $chunk =~ s{ - ([^0-9A-Za-z]) - }{ - join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) - }egox; - return decode_utf8(HEAD . 'Q?' . $chunk . TAIL); + ([^0-9A-Za-z]) + }{ + join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) + }egox; + return decode_utf8( HEAD . 'Q?' . $chunk . TAIL ); } 1; diff --git a/ext/Encode/lib/Encode/MIME/Header/ISO_2022_JP.pm b/ext/Encode/lib/Encode/MIME/Header/ISO_2022_JP.pm index 5f637a3..6d3ea46 100644 --- a/ext/Encode/lib/Encode/MIME/Header/ISO_2022_JP.pm +++ b/ext/Encode/lib/Encode/MIME/Header/ISO_2022_JP.pm @@ -3,125 +3,127 @@ package Encode::MIME::Header::ISO_2022_JP; use strict; use base qw(Encode::MIME::Header); -$Encode::Encoding{'MIME-Header-ISO_2022_JP'} - = bless {encode => 'B', bpl => 76, Name => 'MIME-Header-ISO_2022_JP'} - => __PACKAGE__; +$Encode::Encoding{'MIME-Header-ISO_2022_JP'} = + bless { encode => 'B', bpl => 76, Name => 'MIME-Header-ISO_2022_JP' } => + __PACKAGE__; -use constant HEAD => '=?ISO-2022-JP?B?'; -use constant TAIL => '?='; +use constant HEAD => '=?ISO-2022-JP?B?'; +use constant TAIL => '?='; use Encode::CJKConstants qw(%RE); -our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; - +our $VERSION = do { my @r = ( q$Revision: 1.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; # I owe the below codes totally to # Jcode by Dan Kogai & http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64 sub encode { - my $self = shift; - my $str = shift; + my $self = shift; + my $str = shift; - utf8::encode($str) if( Encode::is_utf8($str) ); - Encode::from_to($str, 'utf8', 'euc-jp'); + utf8::encode($str) if ( Encode::is_utf8($str) ); + Encode::from_to( $str, 'utf8', 'euc-jp' ); - my($trailing_crlf) = ($str =~ /(\n|\r|\x0d\x0a)$/o); + my ($trailing_crlf) = ( $str =~ /(\n|\r|\x0d\x0a)$/o ); - $str = _mime_unstructured_header($str, $self->{bpl}); + $str = _mime_unstructured_header( $str, $self->{bpl} ); - not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o; + not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o; - return $str; + return $str; } - sub _mime_unstructured_header { - my ($oldheader, $bpl) = @_; - my $crlf = $oldheader =~ /\n$/; - my($header, @words, @wordstmp, $i) = (''); - - $oldheader =~ s/\s+$//; - - @wordstmp = split /\s+/, $oldheader; - - for ($i = 0; $i < $#wordstmp; $i++){ - if( $wordstmp[$i] !~ /^[\x21-\x7E]+$/ and $wordstmp[$i + 1] !~ /^[\x21-\x7E]+$/){ - $wordstmp[$i + 1] = "$wordstmp[$i] $wordstmp[$i + 1]"; - } - else{ - push(@words, $wordstmp[$i]); - } - } - - push(@words, $wordstmp[-1]); - - for my $word (@words){ - if ($word =~ /^[\x21-\x7E]+$/) { - $header =~ /(?:.*\n)*(.*)/; - if (length($1) + length($word) > $bpl) { - $header .= "\n $word"; - } - else{ - $header .= $word; - } - } - else{ - $header = _add_encoded_word($word, $header, $bpl); - } - - $header =~ /(?:.*\n)*(.*)/; - - if(length($1) == $bpl){ - $header .= "\n "; - } - else { - $header .= ' '; - } - } - - $header =~ s/\n? $//mg; - - $crlf ? "$header\n" : $header; + my ( $oldheader, $bpl ) = @_; + my $crlf = $oldheader =~ /\n$/; + my ( $header, @words, @wordstmp, $i ) = (''); + + $oldheader =~ s/\s+$//; + + @wordstmp = split /\s+/, $oldheader; + + for ( $i = 0 ; $i < $#wordstmp ; $i++ ) { + if ( $wordstmp[$i] !~ /^[\x21-\x7E]+$/ + and $wordstmp[ $i + 1 ] !~ /^[\x21-\x7E]+$/ ) + { + $wordstmp[ $i + 1 ] = "$wordstmp[$i] $wordstmp[$i + 1]"; + } + else { + push( @words, $wordstmp[$i] ); + } + } + + push( @words, $wordstmp[-1] ); + + for my $word (@words) { + if ( $word =~ /^[\x21-\x7E]+$/ ) { + $header =~ /(?:.*\n)*(.*)/; + if ( length($1) + length($word) > $bpl ) { + $header .= "\n $word"; + } + else { + $header .= $word; + } + } + else { + $header = _add_encoded_word( $word, $header, $bpl ); + } + + $header =~ /(?:.*\n)*(.*)/; + + if ( length($1) == $bpl ) { + $header .= "\n "; + } + else { + $header .= ' '; + } + } + + $header =~ s/\n? $//mg; + + $crlf ? "$header\n" : $header; } - sub _add_encoded_word { - my($str, $line, $bpl) = @_; - my $result = ''; - - while( length($str) ){ - my $target = $str; - $str = ''; - - if(length($line) + 22 + ($target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o) * 8 > $bpl){ - $line =~ s/[ \t\n\r]*$/\n/; - $result .= $line; - $line = ' '; - } - - while(1){ - my $iso_2022_jp = $target; - Encode::from_to($iso_2022_jp, 'euc-jp', 'iso-2022-jp'); - - my $encoded - = HEAD . MIME::Base64::encode_base64($iso_2022_jp, '') . TAIL; - - if(length($encoded) + length($line) > $bpl){ - $target =~ s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o; - $str = $1 . $str; - } - else{ - $line .= $encoded; - last; - } - } - - } - - $result . $line; + my ( $str, $line, $bpl ) = @_; + my $result = ''; + + while ( length($str) ) { + my $target = $str; + $str = ''; + + if ( + length($line) + 22 + + ( $target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o ) * 8 > $bpl ) + { + $line =~ s/[ \t\n\r]*$/\n/; + $result .= $line; + $line = ' '; + } + + while (1) { + my $iso_2022_jp = $target; + Encode::from_to( $iso_2022_jp, 'euc-jp', 'iso-2022-jp' ); + + my $encoded = + HEAD . MIME::Base64::encode_base64( $iso_2022_jp, '' ) . TAIL; + + if ( length($encoded) + length($line) > $bpl ) { + $target =~ + s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o; + $str = $1 . $str; + } + else { + $line .= $encoded; + last; + } + } + + } + + $result . $line; } - 1; __END__ diff --git a/ext/Encode/lib/Encode/PerlIO.pod b/ext/Encode/lib/Encode/PerlIO.pod index abd1f2d..1a9269a 100644 --- a/ext/Encode/lib/Encode/PerlIO.pod +++ b/ext/Encode/lib/Encode/PerlIO.pod @@ -105,7 +105,7 @@ encodings such as ISO-2022-JP. Now let's see what happens when you try to decode from ISO-2022-JP and the buffer ends in the middle of a character. - JIS208-ESC \x{5f3e} + JIS208-ESC \x{5f3e} A B C .... ~ \e $ B |DAN | .... 41 42 43 .... 7E 1b 24 41 43 46 .... <- buffer ---------------------------> diff --git a/ext/Encode/lib/Encode/Unicode/UTF7.pm b/ext/Encode/lib/Encode/Unicode/UTF7.pm index a2a789b..cbbd492 100644 --- a/ext/Encode/lib/Encode/Unicode/UTF7.pm +++ b/ext/Encode/lib/Encode/Unicode/UTF7.pm @@ -1,12 +1,12 @@ # -# $Id: UTF7.pm,v 2.1 2004/05/25 16:27:14 dankogai Exp $ +# $Id: UTF7.pm,v 2.3 2006/05/03 18:24:10 dankogai Exp $ # package Encode::Unicode::UTF7; use strict; no warnings 'redefine'; use base qw(Encode::Encoding); __PACKAGE__->Define('UTF-7'); -our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use MIME::Base64; use Encode; @@ -15,63 +15,71 @@ use Encode; # our $OPTIONAL_DIRECT_CHARS = 1; -my $specials = quotemeta "\'(),-./:?"; -$OPTIONAL_DIRECT_CHARS and - $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}"; +my $specials = quotemeta "\'(),-./:?"; +$OPTIONAL_DIRECT_CHARS + and $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}"; + # \s will not work because it matches U+3000 DEOGRAPHIC SPACE -# We use qr/[\n\r\t\ ] instead -my $re_asis = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/; +# We use qr/[\n\r\t\ ] instead +my $re_asis = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/; my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/; -my $e_utf16 = find_encoding("UTF-16BE"); +my $e_utf16 = find_encoding("UTF-16BE"); -sub needs_lines { 1 }; +sub needs_lines { 1 } -sub encode($$;$){ - my ($obj, $str, $chk) = @_; +sub encode($$;$) { + my ( $obj, $str, $chk ) = @_; my $len = length($str); pos($str) = 0; my $bytes = ''; - while (pos($str) < $len){ - if ($str =~ /\G($re_asis+)/ogc){ - $bytes .= $1; - }elsif($str =~ /\G($re_encoded+)/ogsc){ - if ($1 eq "+"){ - $bytes .= "+-"; - }else{ - my $s = $1; - my $base64 = encode_base64($e_utf16->encode($s), ''); - $base64 =~ s/=+$//; - $bytes .= "+$base64-"; - } - }else{ - die "This should not happen! (pos=" . pos($str) . ")"; - } + while ( pos($str) < $len ) { + if ( $str =~ /\G($re_asis+)/ogc ) { + $bytes .= $1; + } + elsif ( $str =~ /\G($re_encoded+)/ogsc ) { + if ( $1 eq "+" ) { + $bytes .= "+-"; + } + else { + my $s = $1; + my $base64 = encode_base64( $e_utf16->encode($s), '' ); + $base64 =~ s/=+$//; + $bytes .= "+$base64-"; + } + } + else { + die "This should not happen! (pos=" . pos($str) . ")"; + } } $_[1] = '' if $chk; return $bytes; } - -sub decode($$;$){ - my ($obj, $bytes, $chk) = @_; + +sub decode($$;$) { + my ( $obj, $bytes, $chk ) = @_; my $len = length($bytes); my $str = ""; no warnings 'uninitialized'; - while (pos($bytes) < $len) { - if ($bytes =~ /\G([^+]+)/ogc) { - $str .= $1; - }elsif($bytes =~ /\G\+-/ogc) { - $str .= "+"; - }elsif($bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc) { - my $base64 = $1; - my $pad = length($base64) % 4; - $base64 .= "=" x (4 - $pad) if $pad; - $str .= $e_utf16->decode(decode_base64($base64)); - }elsif($bytes =~ /\G\+/ogc) { - $^W and warn "Bad UTF7 data escape"; - $str .= "+"; - }else{ - die "This should not happen " . pos($bytes); - } + while ( pos($bytes) < $len ) { + if ( $bytes =~ /\G([^+]+)/ogc ) { + $str .= $1; + } + elsif ( $bytes =~ /\G\+-/ogc ) { + $str .= "+"; + } + elsif ( $bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc ) { + my $base64 = $1; + my $pad = length($base64) % 4; + $base64 .= "=" x ( 4 - $pad ) if $pad; + $str .= $e_utf16->decode( decode_base64($base64) ); + } + elsif ( $bytes =~ /\G\+/ogc ) { + $^W and warn "Bad UTF7 data escape"; + $str .= "+"; + } + else { + die "This should not happen " . pos($bytes); + } } $_[1] = '' if $chk; return $str; diff --git a/ext/Encode/t/Aliases.t b/ext/Encode/t/Aliases.t index 2fce73e..ff86ed1 100644 --- a/ext/Encode/t/Aliases.t +++ b/ext/Encode/t/Aliases.t @@ -2,13 +2,13 @@ BEGIN { if ($ENV{'PERL_CORE'}){ - chdir 't'; - unshift @INC, '../lib'; + chdir 't'; + unshift @INC, '../lib'; } require Config; import Config; if ($Config{'extensions'} !~ /\bEncode\b/) { - print "1..0 # Skip: Encode was not built\n"; - exit 0; + print "1..0 # Skip: Encode was not built\n"; + exit 0; } } @@ -20,83 +20,83 @@ my $ON_EBCDIC; sub init_a2c{ %a2c = ( - 'US-ascii' => 'ascii', - 'ISO-646-US' => 'ascii', - 'UTF-8' => 'utf-8-strict', - 'UCS-2' => 'UCS-2BE', - 'UCS2' => 'UCS-2BE', - 'iso-10646-1' => 'UCS-2BE', - 'ucs2-le' => 'UCS-2LE', - 'ucs2-be' => 'UCS-2BE', - 'utf16' => 'UTF-16', - 'utf32' => 'UTF-32', - 'utf16-be' => 'UTF-16BE', - 'utf32-be' => 'UTF-32BE', - 'utf16-le' => 'UTF-16LE', - 'utf32-le' => 'UTF-32LE', - 'UCS4-BE' => 'UTF-32BE', - 'UCS-4-LE' => 'UTF-32LE', - 'cyrillic' => 'iso-8859-5', - 'arabic' => 'iso-8859-6', - 'greek' => 'iso-8859-7', - 'hebrew' => 'iso-8859-8', - 'thai' => 'iso-8859-11', - 'tis620' => 'iso-8859-11', - 'WinLatin1' => 'cp1252', - 'WinLatin2' => 'cp1250', - 'WinCyrillic' => 'cp1251', - 'WinGreek' => 'cp1253', - 'WinTurkish' => 'cp1254', - 'WinHebrew' => 'cp1255', - 'WinArabic' => 'cp1256', - 'WinBaltic' => 'cp1257', - 'WinVietnamese' => 'cp1258', - 'koi8r' => 'koi8-r', - 'koi8u' => 'koi8-u', - 'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp', - 'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp', - 'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn', - 'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn', - 'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr', - 'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr', - 'ujis' => $ON_EBCDIC ? '' : 'euc-jp', - 'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis', - 'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis', - 'jis' => $ON_EBCDIC ? '' : '7bit-jis', - 'big-5' => $ON_EBCDIC ? '' : 'big5-eten', - 'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5-eten', - 'tca-big5' => $ON_EBCDIC ? '' : 'big5-eten', - 'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs', - 'hkscs-big5' => $ON_EBCDIC ? '' : 'big5-hkscs', - 'GB_2312-80' => $ON_EBCDIC ? '' : 'euc-cn', - 'KS_C_5601-1987' => $ON_EBCDIC ? '' : 'cp949', - # - 'gb12345-raw' => $ON_EBCDIC ? '' : 'gb12345-raw', - 'gb2312-raw' => $ON_EBCDIC ? '' : 'gb2312-raw', - 'jis0201-raw' => $ON_EBCDIC ? '' : 'jis0201-raw', - 'jis0208-raw' => $ON_EBCDIC ? '' : 'jis0208-raw', - 'jis0212-raw' => $ON_EBCDIC ? '' : 'jis0212-raw', - 'ksc5601-raw' => $ON_EBCDIC ? '' : 'ksc5601-raw', - ); + 'US-ascii' => 'ascii', + 'ISO-646-US' => 'ascii', + 'UTF-8' => 'utf-8-strict', + 'UCS-2' => 'UCS-2BE', + 'UCS2' => 'UCS-2BE', + 'iso-10646-1' => 'UCS-2BE', + 'ucs2-le' => 'UCS-2LE', + 'ucs2-be' => 'UCS-2BE', + 'utf16' => 'UTF-16', + 'utf32' => 'UTF-32', + 'utf16-be' => 'UTF-16BE', + 'utf32-be' => 'UTF-32BE', + 'utf16-le' => 'UTF-16LE', + 'utf32-le' => 'UTF-32LE', + 'UCS4-BE' => 'UTF-32BE', + 'UCS-4-LE' => 'UTF-32LE', + 'cyrillic' => 'iso-8859-5', + 'arabic' => 'iso-8859-6', + 'greek' => 'iso-8859-7', + 'hebrew' => 'iso-8859-8', + 'thai' => 'iso-8859-11', + 'tis620' => 'iso-8859-11', + 'WinLatin1' => 'cp1252', + 'WinLatin2' => 'cp1250', + 'WinCyrillic' => 'cp1251', + 'WinGreek' => 'cp1253', + 'WinTurkish' => 'cp1254', + 'WinHebrew' => 'cp1255', + 'WinArabic' => 'cp1256', + 'WinBaltic' => 'cp1257', + 'WinVietnamese' => 'cp1258', + 'koi8r' => 'koi8-r', + 'koi8u' => 'koi8-u', + 'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp', + 'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp', + 'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn', + 'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn', + 'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr', + 'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr', + 'ujis' => $ON_EBCDIC ? '' : 'euc-jp', + 'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis', + 'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis', + 'jis' => $ON_EBCDIC ? '' : '7bit-jis', + 'big-5' => $ON_EBCDIC ? '' : 'big5-eten', + 'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5-eten', + 'tca-big5' => $ON_EBCDIC ? '' : 'big5-eten', + 'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs', + 'hkscs-big5' => $ON_EBCDIC ? '' : 'big5-hkscs', + 'GB_2312-80' => $ON_EBCDIC ? '' : 'euc-cn', + 'KS_C_5601-1987' => $ON_EBCDIC ? '' : 'cp949', + # + 'gb12345-raw' => $ON_EBCDIC ? '' : 'gb12345-raw', + 'gb2312-raw' => $ON_EBCDIC ? '' : 'gb2312-raw', + 'jis0201-raw' => $ON_EBCDIC ? '' : 'jis0201-raw', + 'jis0208-raw' => $ON_EBCDIC ? '' : 'jis0208-raw', + 'jis0212-raw' => $ON_EBCDIC ? '' : 'jis0212-raw', + 'ksc5601-raw' => $ON_EBCDIC ? '' : 'ksc5601-raw', + ); for my $i (1..11,13..16){ - $a2c{"ISO 8859 $i"} = "iso-8859-$i"; + $a2c{"ISO 8859 $i"} = "iso-8859-$i"; } for my $i (1..10){ - $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]"; + $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]"; } for my $k (keys %Encode::Alias::Winlatin2cp){ - my $v = $Encode::Alias::Winlatin2cp{$k}; - $a2c{"Win" . ucfirst($k)} = "cp" . $v; - $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v; - $a2c{"cp-" . $v} = "cp" . $v; + my $v = $Encode::Alias::Winlatin2cp{$k}; + $a2c{"Win" . ucfirst($k)} = "cp" . $v; + $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v; + $a2c{"cp-" . $v} = "cp" . $v; } my @a2c = keys %a2c; for my $k (@a2c){ - $a2c{uc($k)} = $a2c{$k}; - $a2c{lc($k)} = $a2c{$k}; - $a2c{lcfirst($k)} = $a2c{$k}; - $a2c{ucfirst($k)} = $a2c{$k}; + $a2c{uc($k)} = $a2c{$k}; + $a2c{lc($k)} = $a2c{$k}; + $a2c{lcfirst($k)} = $a2c{$k}; + $a2c{ucfirst($k)} = $a2c{$k}; } } @@ -109,12 +109,12 @@ BEGIN{ if ($ON_EBCDIC){ delete @Encode::ExtModule{ - qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165 MacChineseSimp - euc-jp iso-2022-jp 7bit-jis shiftjis MacJapanese cp932 - euc-kr ksc5601 cp949 MacKorean - big5 big5-hkscs cp950 MacChineseTrad - gb18030 big5plus euc-tw) - }; + qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165 MacChineseSimp + euc-jp iso-2022-jp 7bit-jis shiftjis MacJapanese cp932 + euc-kr ksc5601 cp949 MacKorean + big5 big5-hkscs cp950 MacChineseTrad + gb18030 big5plus euc-tw) + }; } use Test::More tests => (scalar keys %a2c) * 4; @@ -124,25 +124,25 @@ print "# alias test; \$ON_EBCDIC == $ON_EBCDIC\n"; foreach my $a (keys %a2c){ my $e = Encode::find_encoding($a); is((defined($e) and $e->name), $a2c{$a},$a) - or warn "alias was $a";; + or warn "alias was $a";; } # now we override some of the aliases and see if it works fine define_alias( - qr/ascii/i => 'WinLatin1', - qr/cyrillic/i => 'WinCyrillic', - qr/arabic/i => 'WinArabic', - qr/greek/i => 'WinGreek', - qr/hebrew/i => 'WinHebrew' - ); + qr/ascii/i => 'WinLatin1', + qr/cyrillic/i => 'WinCyrillic', + qr/arabic/i => 'WinArabic', + qr/greek/i => 'WinGreek', + qr/hebrew/i => 'WinHebrew' + ); print "# alias test with alias overrides\n"; foreach my $a (keys %a2c){ my $e = Encode::find_encoding($a); is((defined($e) and $e->name), $a2c{$a}, "Override $a") - or warn "alias was $a"; + or warn "alias was $a"; } print "# alias undef test\n"; @@ -151,7 +151,7 @@ Encode::Alias->undef_aliases; foreach my $a (keys %a2c){ my $e = Encode::find_encoding($a); ok(!defined($e) || $e->name =~ /-raw$/o,"Undef $a") - or warn "alias was $a"; + or warn "alias was $a"; } print "# alias reinit test\n"; @@ -161,7 +161,7 @@ init_a2c(); foreach my $a (keys %a2c){ my $e = Encode::find_encoding($a); is((defined($e) and $e->name), $a2c{$a}, "Reinit $a") - or warn "alias was $a"; + or warn "alias was $a"; } __END__ for my $k (keys %a2c){ diff --git a/ext/Encode/t/CJKT.t b/ext/Encode/t/CJKT.t index 1480439..d58e3e3 100644 --- a/ext/Encode/t/CJKT.t +++ b/ext/Encode/t/CJKT.t @@ -9,8 +9,8 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } # should work w/o PerlIO now! # unless (PerlIO::Layer->find('perlio')){ @@ -55,7 +55,7 @@ for my $charset (sort keys %Charset){ open $src, "<$src_enc" or die "$src_enc : $!"; if (PerlIO::Layer->find('perlio')){ - binmode($src, ":bytes"); # needed when :utf8 in default open layer + binmode($src, ":bytes"); # needed when :utf8 in default open layer } $txt = join('',<$src>); @@ -68,27 +68,27 @@ for my $charset (sort keys %Charset){ open $dst, ">$dst_utf" or die "$dst_utf : $!"; if (PerlIO::Layer->find('perlio')){ - binmode($dst, ":utf8"); - print $dst $uni; + binmode($dst, ":utf8"); + print $dst $uni; }else{ # ugh! - binmode($dst); - my $raw = $uni; Encode::_utf8_off($raw); - print $dst $raw; + binmode($dst); + my $raw = $uni; Encode::_utf8_off($raw); + print $dst $raw; } close($dst); is(compare_text($dst_utf, $src_utf), 0, "$dst_utf eq $src_utf") - or ($DEBUG and rename $dst_utf, "$dst_utf.$seq"); + or ($DEBUG and rename $dst_utf, "$dst_utf.$seq"); $seq++; open $src, "<$src_utf" or die "$src_utf : $!"; if (PerlIO::Layer->find('perlio')){ - binmode($src, ":utf8"); - $uni = join('', <$src>); + binmode($src, ":utf8"); + $uni = join('', <$src>); }else{ # ugh! - binmode($src); - $uni = join('', <$src>); - Encode::_utf8_on($uni); + binmode($src); + $uni = join('', <$src>); + Encode::_utf8_on($uni); } close $src; @@ -104,13 +104,13 @@ for my $charset (sort keys %Charset){ print $dst $txt; close($dst); is(compare_text($src_enc, $dst_enc), 0 => "$dst_enc eq $src_enc") - or ($DEBUG and rename $dst_enc, "$dst_enc.$seq"); + or ($DEBUG and rename $dst_enc, "$dst_enc.$seq"); $seq++; unlink($dst_utf, $dst_enc); for my $encoding (@{$Charset{$charset}}){ - my $rt = decode($encoding, encode($encoding, $uni)); - is ($rt, $uni, "RT $encoding"); + my $rt = decode($encoding, encode($encoding, $uni)); + is ($rt, $uni, "RT $encoding"); } } diff --git a/ext/Encode/t/Encode.t b/ext/Encode/t/Encode.t index 528f75f..369557e 100644 --- a/ext/Encode/t/Encode.t +++ b/ext/Encode/t/Encode.t @@ -16,8 +16,8 @@ BEGIN { use strict; use Test; use Encode qw(from_to encode decode - encode_utf8 decode_utf8 - find_encoding is_utf8); + encode_utf8 decode_utf8 + find_encoding is_utf8); use charnames qw(greek); my @encodings = grep(/iso-?8859/,Encode::encodings()); my $n = 2; diff --git a/ext/Encode/t/Unicode.t b/ext/Encode/t/Unicode.t index 928a1d6..aa80899 100644 --- a/ext/Encode/t/Unicode.t +++ b/ext/Encode/t/Unicode.t @@ -1,5 +1,5 @@ # -# $Id: Unicode.t,v 2.0 2004/05/16 20:55:17 dankogai Exp $ +# $Id: Unicode.t,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # # This script is written entirely in ASCII, even though quoted literals # do include non-BMP unicode characters -- Are you happy, jhi? @@ -13,7 +13,7 @@ BEGIN { } if (ord("A") == 193) { print "1..0 # Skip: EBCDIC\n"; - exit 0; + exit 0; } $| = 1; } @@ -45,10 +45,10 @@ my $f_16le = pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f fd ff>); my $n_32be = pack("C*", map {hex($_)} - qw<00 00 5c 0f 00 00 98 fc 00 00 30 00 00 00 5f 3e 00 01 ab cd>); + qw<00 00 5c 0f 00 00 98 fc 00 00 30 00 00 00 5f 3e 00 01 ab cd>); my $n_32le = pack("C*", map {hex($_)} - qw<0f 5c 00 00 fc 98 00 00 00 30 00 00 3e 5f 00 00 cd ab 01 00>); + qw<0f 5c 00 00 fc 98 00 00 00 30 00 00 3e 5f 00 00 cd ab 01 00>); my $n_16bb = pack('n', 0xFeFF) . $n_16be; my $n_16lb = pack('v', 0xFeFF) . $n_16le; @@ -91,16 +91,16 @@ is(index($@, 'UCS-2LE'), 0, "encode UCS-2LE: exception"); SKIP: { my $utf8 = ''; for my $j (0,0x10){ - for my $i (0..0xffff){ - $j == 0 and (0xD800 <= $i && $i <= 0xDFFF) and next; - $utf8 .= ord($j+$i); - } - for my $major ('UTF-16', 'UTF-32'){ - for my $minor ('BE', 'LE'){ - my $enc = $major.$minor; - is(decode($enc, encode($enc, $utf8)), $utf8, "$enc RT"); - } - } + for my $i (0..0xffff){ + $j == 0 and (0xD800 <= $i && $i <= 0xDFFF) and next; + $utf8 .= ord($j+$i); + } + for my $major ('UTF-16', 'UTF-32'){ + for my $minor ('BE', 'LE'){ + my $enc = $major.$minor; + is(decode($enc, encode($enc, $utf8)), $utf8, "$enc RT"); + } + } } }; @@ -120,12 +120,12 @@ for my $file (@file){ open my $fh, '<', $path or die "$path:$!"; my $content; if (PerlIO::Layer->find('perlio')){ - binmode $fh => ':utf8'; - $content = join('' => <$fh>); + binmode $fh => ':utf8'; + $content = join('' => <$fh>); }else{ # ugh! - binmode $fh; - $content = join('' => <$fh>); - Encode::_utf8_on($content) + binmode $fh; + $content = join('' => <$fh>); + Encode::_utf8_on($content) } close $fh; is(decode("UTF-7", encode("UTF-7", $content)), $content, diff --git a/ext/Encode/t/at-cn.t b/ext/Encode/t/at-cn.t index 6249fee..03ba109 100644 --- a/ext/Encode/t/at-cn.t +++ b/ext/Encode/t/at-cn.t @@ -1,7 +1,7 @@ BEGIN { if ($ENV{'PERL_CORE'}){ chdir 't'; - unshift @INC, '../lib'; + unshift @INC, '../lib'; } require Config; import Config; if ($Config{'extensions'} !~ /\bEncode\b/) { @@ -9,8 +9,8 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } $| = 1; } @@ -128,18 +128,18 @@ sub run_tests { # $enc = encoding, $str = content foreach my $enc (sort keys %{$tests}) { - my $str = $tests->{$enc}; + my $str = $tests->{$enc}; - is(Encode::decode($enc, $str), $utf, "[$enc] decode - $title"); - is(Encode::encode($enc, $utf), $str, "[$enc] encode - $title"); + is(Encode::decode($enc, $str), $utf, "[$enc] decode - $title"); + is(Encode::encode($enc, $utf), $str, "[$enc] encode - $title"); - my $str2 = $str; - my $utf8 = Encode::encode('utf-8', $utf); + my $str2 = $str; + my $utf8 = Encode::encode('utf-8', $utf); - Encode::from_to($str2, $enc, 'utf-8'); - is($str2, $utf8, "[$enc] from_to => utf8 - $title"); + Encode::from_to($str2, $enc, 'utf-8'); + is($str2, $utf8, "[$enc] from_to => utf8 - $title"); - Encode::from_to($utf8, 'utf-8', $enc); # convert $utf8 as $enc - is($utf8, $str, "[$enc] utf8 => from_to - $title"); + Encode::from_to($utf8, 'utf-8', $enc); # convert $utf8 as $enc + is($utf8, $str, "[$enc] utf8 => from_to - $title"); } } diff --git a/ext/Encode/t/at-tw.t b/ext/Encode/t/at-tw.t index 11abbf3..e6a559b 100644 --- a/ext/Encode/t/at-tw.t +++ b/ext/Encode/t/at-tw.t @@ -7,12 +7,12 @@ BEGIN { exit 0; } unless (find PerlIO::Layer 'perlio') { - print "1..0 # Skip: PerlIO was not built\n"; - exit 0; + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } $| = 1; } @@ -79,18 +79,18 @@ sub run_tests { # $enc = encoding, $str = content foreach my $enc (sort keys %{$tests}) { - my $str = $tests->{$enc}; + my $str = $tests->{$enc}; - is(Encode::decode($enc, $str), $utf, "[$enc] decode - $title"); - is(Encode::encode($enc, $utf), $str, "[$enc] encode - $title"); + is(Encode::decode($enc, $str), $utf, "[$enc] decode - $title"); + is(Encode::encode($enc, $utf), $str, "[$enc] encode - $title"); - my $str2 = $str; - my $utf8 = Encode::encode('utf-8', $utf); + my $str2 = $str; + my $utf8 = Encode::encode('utf-8', $utf); - Encode::from_to($str2, $enc, 'utf-8'); - is($str2, $utf8, "[$enc] from_to => utf8 - $title"); + Encode::from_to($str2, $enc, 'utf-8'); + is($str2, $utf8, "[$enc] from_to => utf8 - $title"); - Encode::from_to($utf8, 'utf-8', $enc); # convert $utf8 as $enc - is($utf8, $str, "[$enc] utf8 => from_to - $title"); + Encode::from_to($utf8, 'utf-8', $enc); # convert $utf8 as $enc + is($utf8, $str, "[$enc] utf8 => from_to - $title"); } } diff --git a/ext/Encode/t/enc_data.t b/ext/Encode/t/enc_data.t index 4d9b544..52d7e11 100644 --- a/ext/Encode/t/enc_data.t +++ b/ext/Encode/t/enc_data.t @@ -1,4 +1,4 @@ -# $Id: enc_data.t,v 2.0 2004/05/16 20:55:18 dankogai Exp $ +# $Id: enc_data.t,v 2.1 2006/05/03 18:24:10 dankogai Exp $ BEGIN { require Config; import Config; @@ -7,16 +7,16 @@ BEGIN { exit 0; } unless (find PerlIO::Layer 'perlio') { - print "1..0 # Skip: PerlIO was not built\n"; - exit 0; + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; } if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; - exit(0); + print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + exit(0); } if ($] <= 5.008 and !$Config{perl_patchlevel}){ - print "1..0 # Skip: Perl 5.8.1 or later required\n"; - exit 0; + print "1..0 # Skip: Perl 5.8.1 or later required\n"; + exit 0; } } diff --git a/ext/Encode/t/enc_eucjp.t b/ext/Encode/t/enc_eucjp.t index ab660af..2fdd811 100644 --- a/ext/Encode/t/enc_eucjp.t +++ b/ext/Encode/t/enc_eucjp.t @@ -1,4 +1,4 @@ -# $Id: enc_eucjp.t,v 2.0 2004/05/16 20:55:18 dankogai Exp $ +# $Id: enc_eucjp.t,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # This is the twin of enc_utf8.t . BEGIN { @@ -8,16 +8,16 @@ BEGIN { exit 0; } unless (find PerlIO::Layer 'perlio') { - print "1..0 # Skip: PerlIO was not built\n"; - exit 0; + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; } if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; - exit(0); + print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + exit(0); } if ($] <= 5.008 and !$Config{perl_patchlevel}){ - print "1..0 # Skip: Perl 5.8.1 or later required\n"; - exit 0; + print "1..0 # Skip: Perl 5.8.1 or later required\n"; + exit 0; } } @@ -62,9 +62,9 @@ close F; open(F, $f) or die "$0: failed to open '$f' for reading: $!"; binmode(F, ":encoding(utf-8)"); { - local $^W = 1; - local $SIG{__WARN__} = sub { $a = shift }; - eval { }; # This should get caught. + local $^W = 1; + local $SIG{__WARN__} = sub { $a = shift }; + eval { }; # This should get caught. } close F; print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? diff --git a/ext/Encode/t/enc_module.t b/ext/Encode/t/enc_module.t index 7cc150c..f187bd7 100644 --- a/ext/Encode/t/enc_module.t +++ b/ext/Encode/t/enc_module.t @@ -1,4 +1,4 @@ -# $Id: enc_module.t,v 2.0 2004/05/16 20:55:18 dankogai Exp $ +# $Id: enc_module.t,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # This file is in euc-jp BEGIN { require Config; import Config; @@ -7,16 +7,16 @@ BEGIN { exit 0; } unless (find PerlIO::Layer 'perlio') { - print "1..0 # Skip: PerlIO was not built\n"; - exit 0; + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; } if (defined ${^UNICODE} and ${^UNICODE} != 0){ - print "1..0 # Skip: \${^UNICODE} == ${^UNICODE}\n"; - exit 0; + print "1..0 # Skip: \${^UNICODE} == ${^UNICODE}\n"; + exit 0; } if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; - exit(0); + print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + exit(0); } } use lib qw(t ext/Encode/t ../ext/Encode/t); # latter 2 for perl core diff --git a/ext/Encode/t/enc_utf8.t b/ext/Encode/t/enc_utf8.t index 1c1a1c7..5a30196 100644 --- a/ext/Encode/t/enc_utf8.t +++ b/ext/Encode/t/enc_utf8.t @@ -1,4 +1,4 @@ -# $Id: enc_utf8.t,v 2.0 2004/05/16 20:55:18 dankogai Exp $ +# $Id: enc_utf8.t,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # This is the twin of enc_eucjp.t . BEGIN { @@ -8,12 +8,12 @@ BEGIN { exit 0; } unless (find PerlIO::Layer 'perlio') { - print "1..0 # Skip: PerlIO was not built\n"; - exit 0; + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; } if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; - exit(0); + print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + exit(0); } } @@ -56,9 +56,9 @@ close F; open(F, $f) or die "$0: failed to open '$f' for reading: $!"; binmode(F, ":encoding(utf-8)"); { - local $^W = 1; - local $SIG{__WARN__} = sub { $a = shift }; - eval { }; # This should get caught. + local $^W = 1; + local $SIG{__WARN__} = sub { $a = shift }; + eval { }; # This should get caught. } close F; print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? diff --git a/ext/Encode/t/encoding.t b/ext/Encode/t/encoding.t index 1e69ad9..67ea068 100644 --- a/ext/Encode/t/encoding.t +++ b/ext/Encode/t/encoding.t @@ -5,12 +5,12 @@ BEGIN { exit 0; } unless (find PerlIO::Layer 'perlio') { - print "1..0 # Skip: PerlIO was not built\n"; - exit 0; + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; } if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; - exit(0); + print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + exit(0); } } @@ -183,7 +183,7 @@ print "ok 27\n"; print "not " unless ((pack("U*", 0x3B0) cmp $byte) == 1) && ((pack("U*", 0x3AE) cmp $byte) == -1) && ((pack("U*", 0x3AF, 0x20) cmp $byte) == 1) && - ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1); + ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1); print "ok 28\n"; @@ -194,10 +194,10 @@ print "ok 28\n"; } { - my %h1; - my %h2; - $h1{"\xdf"} = 41; - $h2{"\x{3af}"} = 42; - print $h1{"\x{3af}"} == 41 ? "ok 30\n" : "not ok 30\n"; - print $h2{"\xdf"} == 42 ? "ok 31\n" : "not ok 31\n"; + my %h1; + my %h2; + $h1{"\xdf"} = 41; + $h2{"\x{3af}"} = 42; + print $h1{"\x{3af}"} == 41 ? "ok 30\n" : "not ok 30\n"; + print $h2{"\xdf"} == 42 ? "ok 31\n" : "not ok 31\n"; } diff --git a/ext/Encode/t/fallback.t b/ext/Encode/t/fallback.t index 76e1e8c..4cbfe77 100644 --- a/ext/Encode/t/fallback.t +++ b/ext/Encode/t/fallback.t @@ -9,8 +9,8 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } $| = 1; } diff --git a/ext/Encode/t/guess.t b/ext/Encode/t/guess.t index 55a75b4..5bfbf4e 100644 --- a/ext/Encode/t/guess.t +++ b/ext/Encode/t/guess.t @@ -9,8 +9,8 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } $| = 1; } @@ -90,8 +90,8 @@ my $ambiguous = "\x{5c0f}\x{98fc}\x{5f3e}"; my $english = "The quick brown fox jumps over the black lazy dog."; for my $utf (qw/UTF-16 UTF-32/){ for my $bl (qw/BE LE/){ - my $test = encode("$utf$bl" => $english); - is(guess_encoding($test)->name, "$utf$bl", "$utf$bl"); + my $test = encode("$utf$bl" => $english); + is(guess_encoding($test)->name, "$utf$bl", "$utf$bl"); } } for my $bl (qw/BE LE/){ diff --git a/ext/Encode/t/jperl.t b/ext/Encode/t/jperl.t index 60b0317..da68468 100644 --- a/ext/Encode/t/jperl.t +++ b/ext/Encode/t/jperl.t @@ -1,5 +1,5 @@ # -# $Id: jperl.t,v 2.0 2004/05/16 20:55:18 dankogai Exp $ +# $Id: jperl.t,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # # This script is written in euc-jp @@ -10,12 +10,12 @@ BEGIN { exit 0; } unless (find PerlIO::Layer 'perlio') { - print "1..0 # Skip: PerlIO was not built\n"; - exit 0; + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } $| = 1; } diff --git a/ext/Encode/t/mime-header.t b/ext/Encode/t/mime-header.t index 4e3ac56..3c8a559 100644 --- a/ext/Encode/t/mime-header.t +++ b/ext/Encode/t/mime-header.t @@ -1,5 +1,5 @@ # -# $Id: mime-header.t,v 2.1 2006/01/15 15:06:36 dankogai Exp $ +# $Id: mime-header.t,v 2.2 2006/05/03 18:24:10 dankogai Exp $ # This script is written in utf8 # BEGIN { @@ -13,8 +13,8 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } $| = 1; } diff --git a/ext/Encode/t/mime_header_iso2022jp.t b/ext/Encode/t/mime_header_iso2022jp.t index ffc77e0..bc26b9e 100644 --- a/ext/Encode/t/mime_header_iso2022jp.t +++ b/ext/Encode/t/mime_header_iso2022jp.t @@ -5,7 +5,7 @@ use strict; use Encode; BEGIN{ - use_ok('Encode::MIME::Header::ISO_2022_JP'); + use_ok('Encode::MIME::Header::ISO_2022_JP'); } require_ok('Encode::MIME::Header::ISO_2022_JP'); @@ -13,12 +13,12 @@ require_ok('Encode::MIME::Header::ISO_2022_JP'); # below codes are from mime.t in Jcode my %mime = ( - "´Á»ú¡¢¥«¥¿¥«¥Ê¡¢¤Ò¤é¤¬¤Ê" - => "=?ISO-2022-JP?B?GyRCNEE7eiEiJSslPyUrJUohIiRSJGkkLCRKGyhC?=", - "foo bar" - => "foo bar", - "´Á»ú¡¢¥«¥¿¥«¥Ê¡¢¤Ò¤é¤¬¤Ê¤Îº®¤¸¤Ã¤¿Subject Header." - => "=?ISO-2022-JP?B?GyRCNEE7eiEiJSslPyUrJUohIiRSJGkkLCRKJE46LiQ4JEMkPxsoQlN1?=\n =?ISO-2022-JP?B?YmplY3Q=?= Header.", + "´Á»ú¡¢¥«¥¿¥«¥Ê¡¢¤Ò¤é¤¬¤Ê" + => "=?ISO-2022-JP?B?GyRCNEE7eiEiJSslPyUrJUohIiRSJGkkLCRKGyhC?=", + "foo bar" + => "foo bar", + "´Á»ú¡¢¥«¥¿¥«¥Ê¡¢¤Ò¤é¤¬¤Ê¤Îº®¤¸¤Ã¤¿Subject Header." + => "=?ISO-2022-JP?B?GyRCNEE7eiEiJSslPyUrJUohIiRSJGkkLCRKJE46LiQ4JEMkPxsoQlN1?=\n =?ISO-2022-JP?B?YmplY3Q=?= Header.", ); @@ -28,13 +28,13 @@ for my $k (keys %mime){ for my $decoded (sort keys %mime){ - my $encoded = $mime{$decoded}; + my $encoded = $mime{$decoded}; - my $header = Encode::encode('MIME-Header-ISO_2022_JP', decode('euc-jp', $decoded)); - my $utf8 = Encode::decode('MIME-Header', $header); + my $header = Encode::encode('MIME-Header-ISO_2022_JP', decode('euc-jp', $decoded)); + my $utf8 = Encode::decode('MIME-Header', $header); - is(encode('euc-jp', $utf8), $decoded); - is($header, $encoded); + is(encode('euc-jp', $utf8), $decoded); + is($header, $encoded); } __END__ diff --git a/ext/Encode/t/perlio.t b/ext/Encode/t/perlio.t index c3330ef..8138a89 100644 --- a/ext/Encode/t/perlio.t +++ b/ext/Encode/t/perlio.t @@ -9,8 +9,8 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } unless (PerlIO::Layer->find('perlio')){ print "1..0 # Skip: PerlIO required\n"; @@ -60,68 +60,68 @@ for my $src (sort keys %e) { close $fh; for my $e (@{$e{$src}}){ - my $sfile = File::Spec->catfile($dir,"$$.sio"); - my $pfile = File::Spec->catfile($dir,"$$.pio"); + my $sfile = File::Spec->catfile($dir,"$$.sio"); + my $pfile = File::Spec->catfile($dir,"$$.pio"); - # first create a file without perlio - dump2file($sfile, &encode($e, $utext, 0)); + # first create a file without perlio + dump2file($sfile, &encode($e, $utext, 0)); - # then create a file via perlio without autoflush + # then create a file via perlio without autoflush SKIP:{ - skip "$e: !perlio_ok", 4 unless (perlio_ok($e) or $DEBUG); - no warnings 'uninitialized'; - open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; - $fh->autoflush(0); - print $fh $utext; - close $fh; - $seq++; - is(compare_text($sfile, $pfile), 0 => ">:encoding($e)"); - if ($DEBUG){ - copy $sfile, "$sfile.$seq"; - copy $pfile, "$pfile.$seq"; - } - - # this time print line by line. - # works even for ISO-2022 but not ISO-2022-KR - open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; - $fh->autoflush(1); - for my $l (@uline) { - print $fh $l; - } - close $fh; - $seq++; - is(compare_text($sfile, $pfile), 0 => ">:encoding($e) by lines"); - if ($DEBUG){ - copy $sfile, "$sfile.$seq"; - copy $pfile, "$pfile.$seq"; - } - my $dtext; - open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; - $fh->autoflush(0); - $dtext = join('' => <$fh>); - close $fh; - $seq++; - ok($utext eq $dtext, "<:encoding($e)"); - if ($DEBUG){ - dump2file("$sfile.$seq", $utext); - dump2file("$pfile.$seq", $dtext); - } - if (perlio_ok($e) or $DEBUG){ - $dtext = ''; - open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; - while(defined(my $l = <$fh>)) { - $dtext .= $l; - } - close $fh; - } - $seq++; - ok($utext eq $dtext, "<:encoding($e) by lines"); - if ($DEBUG){ - dump2file("$sfile.$seq", $utext); - dump2file("$pfile.$seq", $dtext); - } - } + skip "$e: !perlio_ok", 4 unless (perlio_ok($e) or $DEBUG); + no warnings 'uninitialized'; + open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; + $fh->autoflush(0); + print $fh $utext; + close $fh; + $seq++; + is(compare_text($sfile, $pfile), 0 => ">:encoding($e)"); + if ($DEBUG){ + copy $sfile, "$sfile.$seq"; + copy $pfile, "$pfile.$seq"; + } + + # this time print line by line. + # works even for ISO-2022 but not ISO-2022-KR + open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; + $fh->autoflush(1); + for my $l (@uline) { + print $fh $l; + } + close $fh; + $seq++; + is(compare_text($sfile, $pfile), 0 => ">:encoding($e) by lines"); + if ($DEBUG){ + copy $sfile, "$sfile.$seq"; + copy $pfile, "$pfile.$seq"; + } + my $dtext; + open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; + $fh->autoflush(0); + $dtext = join('' => <$fh>); + close $fh; + $seq++; + ok($utext eq $dtext, "<:encoding($e)"); + if ($DEBUG){ + dump2file("$sfile.$seq", $utext); + dump2file("$pfile.$seq", $dtext); + } + if (perlio_ok($e) or $DEBUG){ + $dtext = ''; + open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; + while(defined(my $l = <$fh>)) { + $dtext .= $l; + } + close $fh; + } + $seq++; + ok($utext eq $dtext, "<:encoding($e) by lines"); + if ($DEBUG){ + dump2file("$sfile.$seq", $utext); + dump2file("$pfile.$seq", $dtext); + } + } if ( ! $DEBUG ) { 1 while unlink ($sfile); 1 while unlink ($pfile); @@ -134,45 +134,45 @@ for my $src (sort keys %e) { SKIP:{ my $pev = PerlIO::encoding->VERSION; skip "PerlIO::encoding->VERSION = $pev <= 0.07 ", 6 - unless ($pev >= 0.07 or $DEBUG); + unless ($pev >= 0.07 or $DEBUG); my $file = File::Spec->catfile($dir,"jisx0208.utf"); open my $fh, "<:utf8", $file or die "$file : $!"; my $str = join('' => <$fh>); close $fh; my %bom = ( - 'UTF-16BE' => pack('n', 0xFeFF), - 'UTF-16LE' => pack('v', 0xFeFF), - 'UTF-32BE' => pack('N', 0xFeFF), - 'UTF-32LE' => pack('V', 0xFeFF), - ); + 'UTF-16BE' => pack('n', 0xFeFF), + 'UTF-16LE' => pack('v', 0xFeFF), + 'UTF-32BE' => pack('N', 0xFeFF), + 'UTF-32LE' => pack('V', 0xFeFF), + ); # reading for my $utf (sort keys %bom){ - my $bomed = $bom{$utf} . encode($utf, $str); - my $sfile = File::Spec->catfile($dir,".${utf}_${seq}_$$"); - dump2file($sfile, $bomed); - my $utf_nobom = $utf; $utf_nobom =~ s/(LE|BE)$//o; - # reading - open $fh, "<:encoding($utf_nobom)", $sfile or die "$sfile : $!"; - my $cmp = join '' => <$fh>; - close $fh; - is($str, $cmp, "<:encoding($utf_nobom) eq $utf"); - unlink $sfile; $seq++; + my $bomed = $bom{$utf} . encode($utf, $str); + my $sfile = File::Spec->catfile($dir,".${utf}_${seq}_$$"); + dump2file($sfile, $bomed); + my $utf_nobom = $utf; $utf_nobom =~ s/(LE|BE)$//o; + # reading + open $fh, "<:encoding($utf_nobom)", $sfile or die "$sfile : $!"; + my $cmp = join '' => <$fh>; + close $fh; + is($str, $cmp, "<:encoding($utf_nobom) eq $utf"); + unlink $sfile; $seq++; } # writing for my $utf_nobom (qw/UTF-16 UTF-32/){ - my $utf = $utf_nobom . 'BE'; - my $sfile = File::Spec->catfile($dir,".${utf_nobom}_${seq}_$$"); - my $bomed = $bom{$utf} . encode($utf, $str); - open $fh, ">:encoding($utf_nobom)", $sfile or die "$sfile : $!"; - print $fh $str; - close $fh; - open my $fh, "<:bytes", $sfile or die "$sfile : $!"; - read $fh, my $cmp, -s $sfile; - close $fh; - use bytes (); - ok($bomed eq $cmp, ">:encoding($utf_nobom) eq $utf"); - unlink $sfile; $seq++; + my $utf = $utf_nobom . 'BE'; + my $sfile = File::Spec->catfile($dir,".${utf_nobom}_${seq}_$$"); + my $bomed = $bom{$utf} . encode($utf, $str); + open $fh, ">:encoding($utf_nobom)", $sfile or die "$sfile : $!"; + print $fh $str; + close $fh; + open my $fh, "<:bytes", $sfile or die "$sfile : $!"; + read $fh, my $cmp, -s $sfile; + close $fh; + use bytes (); + ok($bomed eq $cmp, ">:encoding($utf_nobom) eq $utf"); + unlink $sfile; $seq++; } } sub dump2file{ diff --git a/ext/Encode/t/rt.pl b/ext/Encode/t/rt.pl index 41db7b9..5959d56 100644 --- a/ext/Encode/t/rt.pl +++ b/ext/Encode/t/rt.pl @@ -1,6 +1,6 @@ #!/usr/local/bin/perl # -# $Id: rt.pl,v 2.0 2004/05/16 20:55:19 dankogai Exp $ +# $Id: rt.pl,v 2.1 2006/05/03 18:24:10 dankogai Exp $ # BEGIN { @@ -16,25 +16,25 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + print "1..0 # Skip: EBCDIC\n"; + exit 0; } use strict; require Test::More; our $DEBUG; our @ucm; unless(@ARGV){ - use File::Spec; - Test::More->import(tests => 103); - opendir my $dh, $ucmdir or die "$ucmdir:$!"; - @ucm = - map {File::Spec->catfile($ucmdir, $_) } - sort grep {/\.ucm$/o} readdir($dh); - closedir $dh; + use File::Spec; + Test::More->import(tests => 103); + opendir my $dh, $ucmdir or die "$ucmdir:$!"; + @ucm = + map {File::Spec->catfile($ucmdir, $_) } + sort grep {/\.ucm$/o} readdir($dh); + closedir $dh; }else{ - Test::More->import("no_plan"); - $DEBUG = 1; - @ucm = @ARGV; + Test::More->import("no_plan"); + $DEBUG = 1; + @ucm = @ARGV; } } @@ -55,20 +55,20 @@ sub rttest{ open my $rfh, "<$ucm" or die "$ucm:$!"; # \x00 |0 # while(<$rfh>){ - s/#.*//o; /^$/ and next; - unless ($name){ - /^\s+"([^\"]+)"/io or next; - $name = $1 and next; - }else{ - /^\s+(\S+)\s+\|(\d)/io or next; - $nchar++; - $3 == 0 or next; - $nrt++; - my $uni = chr(hex($1)); - my $enc = eval qq{ "$2" }; - decode($name, $enc) eq $uni or $nok++; - encode($name, $uni) eq $enc or $nok++; - } + s/#.*//o; /^$/ and next; + unless ($name){ + /^\s+"([^\"]+)"/io or next; + $name = $1 and next; + }else{ + /^\s+(\S+)\s+\|(\d)/io or next; + $nchar++; + $3 == 0 or next; + $nrt++; + my $uni = chr(hex($1)); + my $enc = eval qq{ "$2" }; + decode($name, $enc) eq $uni or $nok++; + encode($name, $uni) eq $enc or $nok++; + } } return($name, $nchar, $nrt, $nok); } diff --git a/ext/Encode/t/unibench.pl b/ext/Encode/t/unibench.pl index ed444cd..8461828 100644 --- a/ext/Encode/t/unibench.pl +++ b/ext/Encode/t/unibench.pl @@ -25,8 +25,8 @@ my %S; for my $i (@sizes){ my $sz = 256 * $i; for my $cp (qw(BMP HIGH)){ - $S{utf8}{$sz}{$cp} = $utf8_seed{$cp} x $i; - $S{utf16}{$sz}{$cp} = encode('UTF-16BE', $S{utf8}{$sz}{$cp}); + $S{utf8}{$sz}{$cp} = $utf8_seed{$cp} x $i; + $S{utf16}{$sz}{$cp} = encode('UTF-16BE', $S{utf8}{$sz}{$cp}); } } @@ -34,37 +34,37 @@ for my $i (@sizes){ my $sz = $i * 256; my $count = $Count * int(256/$i); for my $cp (qw(BMP HIGH)){ - for my $op (qw(encode decode)){ - my ($meth, $from, $to) = ($op eq 'encode') ? - (\&encode, 'utf8', 'utf16') : (\&decode, 'utf16', 'utf8'); - my $XS = sub { - Encode::Unicode::set_transcoder("xs"); - $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) - eq $S{$to}{$sz}{$cp} - or die "$op,$from,$to,$sz,$cp"; - }; - my $modern = sub { - Encode::Unicode::set_transcoder("modern"); - $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) - eq $S{$to}{$sz}{$cp} - or die "$op,$from,$to,$sz,$cp"; - }; - my $classic = sub { - Encode::Unicode::set_transcoder("classic"); - $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) - eq $S{$to}{$sz}{$cp} or - die "$op,$from,$to,$sz,$cp"; - }; - print "---- $op length=$sz/range=$cp ----\n"; - my $r = timethese($count, - { - "XS" => $XS, - "Modern" => $modern, - "Classic" => $classic, - }, - 'none', - ); - cmpthese($r); - } + for my $op (qw(encode decode)){ + my ($meth, $from, $to) = ($op eq 'encode') ? + (\&encode, 'utf8', 'utf16') : (\&decode, 'utf16', 'utf8'); + my $XS = sub { + Encode::Unicode::set_transcoder("xs"); + $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) + eq $S{$to}{$sz}{$cp} + or die "$op,$from,$to,$sz,$cp"; + }; + my $modern = sub { + Encode::Unicode::set_transcoder("modern"); + $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) + eq $S{$to}{$sz}{$cp} + or die "$op,$from,$to,$sz,$cp"; + }; + my $classic = sub { + Encode::Unicode::set_transcoder("classic"); + $meth->('UTF-16BE', $S{$from}{$sz}{$cp}) + eq $S{$to}{$sz}{$cp} or + die "$op,$from,$to,$sz,$cp"; + }; + print "---- $op length=$sz/range=$cp ----\n"; + my $r = timethese($count, + { + "XS" => $XS, + "Modern" => $modern, + "Classic" => $classic, + }, + 'none', + ); + cmpthese($r); + } } } diff --git a/ext/Encode/t/utf8strict.t b/ext/Encode/t/utf8strict.t index dac5d6f..b2bf6b3 100644 --- a/ext/Encode/t/utf8strict.t +++ b/ext/Encode/t/utf8strict.t @@ -9,45 +9,45 @@ BEGIN { require Config; import Config; if ($Config{'extensions'} !~ /\bEncode\b/) { print "1..0 # Skip: Encode was not built\n"; - exit 0; + exit 0; } if ($] <= 5.008 and !$Config{perl_patchlevel}){ - print "1..0 # Skip: Perl 5.8.1 or later required\n"; - exit 0; + print "1..0 # Skip: Perl 5.8.1 or later required\n"; + exit 0; } # http://smontagu.damowmow.com/utf8test.html %ORD = ( - 0x00000080 => 0, # 2.1.2 - 0x00000800 => 0, # 2.1.3 - 0x00010000 => 0, # 2.1.4 - 0x00200000 => 1, # 2.1.5 - 0x00400000 => 1, # 2.1.6 - 0x0000007F => 0, # 2.2.1 -- unmapped okay - 0x000007FF => 0, # 2.2.2 - 0x0000FFFF => 1, # 2.2.3 - 0x001FFFFF => 1, # 2.2.4 - 0x03FFFFFF => 1, # 2.2.5 - 0x7FFFFFFF => 1, # 2.2.6 - 0x0000D800 => 1, # 5.1.1 - 0x0000DB7F => 1, # 5.1.2 - 0x0000D880 => 1, # 5.1.3 - 0x0000DBFF => 1, # 5.1.4 - 0x0000DC00 => 1, # 5.1.5 - 0x0000DF80 => 1, # 5.1.6 - 0x0000DFFF => 1, # 5.1.7 - # 5.2 "Paird UTF-16 surrogates skipped - # because utf-8-strict raises exception at the first one - 0x0000FFFF => 1, # 5.3.1 - ); + 0x00000080 => 0, # 2.1.2 + 0x00000800 => 0, # 2.1.3 + 0x00010000 => 0, # 2.1.4 + 0x00200000 => 1, # 2.1.5 + 0x00400000 => 1, # 2.1.6 + 0x0000007F => 0, # 2.2.1 -- unmapped okay + 0x000007FF => 0, # 2.2.2 + 0x0000FFFF => 1, # 2.2.3 + 0x001FFFFF => 1, # 2.2.4 + 0x03FFFFFF => 1, # 2.2.5 + 0x7FFFFFFF => 1, # 2.2.6 + 0x0000D800 => 1, # 5.1.1 + 0x0000DB7F => 1, # 5.1.2 + 0x0000D880 => 1, # 5.1.3 + 0x0000DBFF => 1, # 5.1.4 + 0x0000DC00 => 1, # 5.1.5 + 0x0000DF80 => 1, # 5.1.6 + 0x0000DFFF => 1, # 5.1.7 + # 5.2 "Paird UTF-16 surrogates skipped + # because utf-8-strict raises exception at the first one + 0x0000FFFF => 1, # 5.3.1 + ); $NTESTS += scalar keys %ORD; %SEQ = ( - qq/ed 9f bf/ => 0, # 2.3.1 - qq/ee 80 80/ => 0, # 2.3.2 - qq/f4 8f bf bf/ => 0, # 2.3.3 - qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG - # "3 Malformed sequences" are checked by perl. - # "4 Overlong sequences" are checked by perl. - ); + qq/ed 9f bf/ => 0, # 2.3.1 + qq/ee 80 80/ => 0, # 2.3.2 + qq/f4 8f bf bf/ => 0, # 2.3.3 + qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG + # "3 Malformed sequences" are checked by perl. + # "4 Overlong sequences" are checked by perl. + ); $NTESTS += scalar keys %SEQ; } use strict;