From: SADAHIRO Tomoyuki Date: Tue, 10 Jul 2001 00:29:55 +0000 (+0900) Subject: Encode::Tcl. add "HZ" encoding and bugfix X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=466d6cd3e7b7bbd27604d839f967b18f16a935e5;p=p5sagit%2Fp5-mst-13.2.git Encode::Tcl. add "HZ" encoding and bugfix Message-Id: <20010710002756.B497.BQW10602@nifty.com> p4raw-id: //depot/perl@11244 --- diff --git a/MANIFEST b/MANIFEST index 2c9da00..5b08012 100644 --- a/MANIFEST +++ b/MANIFEST @@ -227,6 +227,7 @@ ext/Encode/Encode/gb12345.enc Encoding tables ext/Encode/Encode/gb1988.enc Encoding tables ext/Encode/Encode/gb2312.enc Encoding tables ext/Encode/Encode/gsm0338.enc Encoding tables +ext/Encode/Encode/HZ.enc Encoding tables ext/Encode/Encode/iso2022-jp.enc Encoding tables ext/Encode/Encode/iso2022-kr.enc Encoding tables ext/Encode/Encode/iso2022.enc Encoding tables diff --git a/ext/Encode/Encode/7bit-kana.enc b/ext/Encode/Encode/7bit-kana.enc index 871dbf6..2ca1d5f 100644 --- a/ext/Encode/Encode/7bit-kana.enc +++ b/ext/Encode/Encode/7bit-kana.enc @@ -2,8 +2,8 @@ S 0025 0 1 00 -0000000100020003000400050006000700080009000A000B000C000D00000000 -0010001100120013001400150016001700180019001A0000001C001D001E001F +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F diff --git a/ext/Encode/Encode/HZ.enc b/ext/Encode/Encode/HZ.enc new file mode 100644 index 0000000..748ee0b --- /dev/null +++ b/ext/Encode/Encode/HZ.enc @@ -0,0 +1,7 @@ +# Encoding file: HZ, HanZi +H +name HZ +init {} +final {} +ascii \x7e\x7d +gb2312 \x7e\x7b diff --git a/ext/Encode/Encode/Tcl.pm b/ext/Encode/Encode/Tcl.pm index 84d107f..d0ff0c8 100644 --- a/ext/Encode/Encode/Tcl.pm +++ b/ext/Encode/Encode/Tcl.pm @@ -78,7 +78,7 @@ sub loadEncoding $type = substr($line,0,1); last unless $type eq '#'; } - my $class = ref($obj).('::'.(($type eq 'E') ? 'Escape' : 'Table')); + my $class = ref($obj).('::'.(($type eq 'H') ? 'HanZi' : ($type eq 'E') ? 'Escape' : 'Table')); # carp "Loading $file"; bless $obj,$class; return $obj if $obj->read($fh,$obj->name,$type); @@ -323,46 +323,152 @@ sub encode while (length($uni)){ my $ch = chr(ord(substr($uni,0,1,''))); - my $x = ref($tbl->{$pre}) eq 'Encode::XS' - ? $tbl->{$pre}->encode($ch,1) - : $tbl->{$pre}->{FmUni}->{$ch}; - - unless(defined $x){ - foreach my $e_seq (@$seq){ - $x = ref($tbl->{$e_seq}) eq 'Encode::XS' - ? $tbl->{$e_seq}->encode($ch,1) - : $tbl->{$e_seq}->{FmUni}->{$ch}; - $cur = $e_seq and last if defined $x; - } + my $x; + foreach my $e_seq ($std, $pre, @$seq){ + $x = ref($tbl->{$e_seq}) eq 'Encode::XS' + ? $tbl->{$e_seq}->encode($ch,1) + : $tbl->{$e_seq}->{FmUni}->{$ch}; + $cur = $e_seq and last if defined $x; } - if($x == 0x0d && !($ini eq '' && $fin eq '') && substr($uni,0,1) eq "\x0a") + if(ref($tbl->{$cur}) ne 'Encode::XS') { - $str .= $cur unless $cur eq $pre; - $str .= $fin."\x0d\x0a".$ini; - substr($uni,0,1,''); - $pre = $std; - next; + my $def = $tbl->{$cur}->{'Def'}; + my $rep = $tbl->{$cur}->{'Rep'}; + unless (defined $x){ + last if ($chk); + $x = $def; + } + $x = pack(&$rep($x),$x); + } + $str .= $cur eq $pre ? $x : ($pre = $cur).$x; + } + $str .= $std unless $cur eq $std; + $str .= $fin; + $_[1] = $uni if $chk; + return $str; +} + +package Encode::Tcl::HanZi; +use base 'Encode::Encoding'; + +use Carp; + +sub read +{ + my ($obj,$fh,$name) = @_; + my(%tbl, @seq, $enc); + while (<$fh>) + { + my ($key,$val) = /^(\S+)\s+(.*)$/; + $val =~ s/^\{(.*?)\}/$1/g; + $val =~ s/\\x([0-9a-f]{2})/chr(hex($1))/ge; + if($enc = Encode->getEncoding($key)){ + $tbl{$val} = ref($enc) eq 'Encode::Tcl' ? $enc->loadEncoding : $enc; + push @seq, $val; + }else{ + $obj->{$key} = $val; + } + } + $obj->{'Seq'} = \@seq; # escape sequences + $obj->{'Tbl'} = \%tbl; # encoding tables + return $obj; +} + +sub decode +{ + my ($obj,$str,$chk) = @_; + my $tbl = $obj->{'Tbl'}; + my $seq = $obj->{'Seq'}; + my $std = $seq->[0]; + my $cur = $std; + my $uni; + while (length($str)){ + my $uch = substr($str,0,1,''); + if($uch eq "~"){ + if($str =~ s/^\cJ//) + { + next; + } + elsif($str =~ s/^\~//) + { + 1; + } + elsif($str =~ s/^([{}])//) + { + $cur = "~$1"; + next; + } + else + { + $str =~ s/^([^~])//; + carp "unknown HanZi escape sequence: ~$1"; + next; + } } - if(ref($tbl->{$cur}) eq 'Encode::XS'){ - $str .= $cur unless $cur eq $pre; - $str .= $x; # "DEF" is lost - $pre = $cur; - next; + if(ref($tbl->{$cur}) eq 'Encode::XS'){ + $uni .= $tbl->{$cur}->decode($uch); + next; + } + my $ch = ord($uch); + my $rep = $tbl->{$cur}->{'Rep'}; + my $touni = $tbl->{$cur}->{'ToUni'}; + my $x; + if (&$rep($ch) eq 'C') + { + $x = $touni->[0][$ch]; + } + else + { + $x = $touni->[$ch][ord(substr($str,0,1,''))]; + } + unless (defined $x) + { + last if $chk; + # What do we do here ? + $x = ''; + } + $uni .= $x; } - my $def = $tbl->{$cur}->{'Def'}; - my $rep = $tbl->{$cur}->{'Rep'}; - unless (defined $x){ - last if ($chk); - $x = $def; + $_[1] = $str if $chk; + return $uni; +} + +sub encode +{ + my ($obj,$uni,$chk) = @_; + my $tbl = $obj->{'Tbl'}; + my $seq = $obj->{'Seq'}; + my $std = $seq->[0]; + my $str; + my $pre = $std; + my $cur = $pre; + + while (length($uni)){ + my $ch = chr(ord(substr($uni,0,1,''))); + my $x; + foreach my $e_seq (@$seq){ + $x = ref($tbl->{$e_seq}) eq 'Encode::XS' + ? $tbl->{$e_seq}->encode($ch,1) + : $tbl->{$e_seq}->{FmUni}->{$ch}; + $cur = $e_seq and last if defined $x; } - $str .= $cur unless $cur eq $pre; - $str .= pack(&$rep($x),$x); - $pre = $cur; + if(ref($tbl->{$cur}) ne 'Encode::XS') + { + my $def = $tbl->{$cur}->{'Def'}; + my $rep = $tbl->{$cur}->{'Rep'}; + unless (defined $x){ + last if ($chk); + $x = $def; + } + $x = pack(&$rep($x),$x); + } + $str .= $cur eq $pre ? $x : ($pre = $cur).$x; + $str .= '~' if $x eq '~'; # to '~~' } $str .= $std unless $cur eq $std; - $str .= $fin; $_[1] = $uni if $chk; return $str; } + 1; __END__ diff --git a/ext/Encode/Encode/Tcl.t b/ext/Encode/Encode/Tcl.t index 5e5d8f9..1d1692e 100644 --- a/ext/Encode/Encode/Tcl.t +++ b/ext/Encode/Encode/Tcl.t @@ -42,7 +42,7 @@ my %ideodigit = ( # cjk ideograph 'one' to 'ten' my @ideodigit = qw(one two three four five six seven eight nine ten); my $jis = '7bit-jis'; -my $kr = '7bit-kr'; +my $kr = 'iso2022-kr'; my %esc_str; $esc_str{$jis} = {qw( @@ -70,8 +70,19 @@ $esc_str{$kr} = {qw( my $num_esc = $n * keys(%esc_str); foreach (values %esc_str){ $num_esc += $n * keys %$_ } +my $hz = 'HZ'; # HanZi + +my @hz_txt = ( + "~~in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.~~", + "~~in GB.~{<:Ky2;S{#,~}~\cJ~{NpJ)l6HK!#~}Bye.~~", + "~~in GB.~\cJ~{<:Ky2;S{#,NpJ)l6HK!#~}~\cJBye.~~", +); + +my $hz_exp = '007e0069006e002000470042002e5df162404e0d6b32' + . 'ff0c52ff65bd65bc4eba3002004200790065002e007e'; + plan test => $n*@encodings + $n*@encodings*@greek - + $n*@encodings*@ideodigit + $num_esc; + + $n*@encodings*@ideodigit + $num_esc + $n + @hz_txt; foreach my $enc (@encodings) { @@ -143,3 +154,31 @@ foreach my $enc (@encodings) "$enc mangled translating to Unicode"); } } + + +{ + my $hz_to_unicode = sub + { + return unpack('H*', pack 'n*', unpack 'U*', decode $hz, shift); + }; + + my $hz_from_unicode = sub + { + return encode($hz, pack 'U*', unpack 'n*', pack 'H*', shift); + }; + + foreach my $enc ($hz) + { + my $tab = Encode->getEncoding($enc); + ok(1,defined($tab),"Could not load $enc"); + + ok(&$hz_from_unicode($hz_exp), $hz_txt[0], + "$enc mangled translating from Unicode"); + + foreach my $str (@hz_txt) + { + ok(&$hz_to_unicode($str), $hz_exp, + "$enc mangled translating to Unicode"); + } + } +} diff --git a/ext/Encode/Encode/iso2022-jp.enc b/ext/Encode/Encode/iso2022-jp.enc index a4e455f..15d52d2 100644 --- a/ext/Encode/Encode/iso2022-jp.enc +++ b/ext/Encode/Encode/iso2022-jp.enc @@ -3,10 +3,7 @@ E name iso2022-jp init {} final {} -iso8859-1 \x1b(B -jis0201 \x1b(J -jis0208 \x1b$@ +ascii \x1b(B +ascii \x1b(J jis0208 \x1b$B -jis0212 \x1b$(D -gb2312 \x1b$A -ksc5601 \x1b$(C +jis0208 \x1b$@ diff --git a/ext/Encode/Encode/iso2022-kr.enc b/ext/Encode/Encode/iso2022-kr.enc index d20ce2b..b894d22 100644 --- a/ext/Encode/Encode/iso2022-kr.enc +++ b/ext/Encode/Encode/iso2022-kr.enc @@ -3,5 +3,5 @@ E name iso2022-kr init \x1b$)C final {} -iso8859-1 \x0f +ascii \x0f ksc5601 \x0e