From: Jarkko Hietaniemi Date: Tue, 5 Mar 2002 01:39:29 +0000 (+0000) Subject: "The last pieces of Chinese puzzle" from Autrijus. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c0d88b767d1f5af863e544cf079429d1c44da957;p=p5sagit%2Fp5-mst-13.2.git "The last pieces of Chinese puzzle" from Autrijus. p4raw-id: //depot/perl@15029 --- diff --git a/MANIFEST b/MANIFEST index 8a6ee65..b6e5bd3 100644 --- a/MANIFEST +++ b/MANIFEST @@ -296,7 +296,6 @@ ext/Encode/Encode/gb12345.enc Encode table ext/Encode/Encode/gb1988.enc Encode table ext/Encode/Encode/gb2312.enc Encode table ext/Encode/Encode/gsm0338.enc Encode table -ext/Encode/Encode/HZ.enc Encode table ext/Encode/Encode/ir-197.enc Encode table ext/Encode/Encode/iso-ir-165.enc Encode table ext/Encode/Encode/jis0201.enc Encode table @@ -338,6 +337,7 @@ ext/Encode/JP/JP.pm Encode extension ext/Encode/JP/Makefile.PL Encode extension ext/Encode/KR/KR.pm Encode extension ext/Encode/KR/Makefile.PL Encode extension +ext/Encode/lib/Encode/CN/HZ.pm Encode table ext/Encode/lib/Encode/Encoding.pm Encode extension ext/Encode/lib/Encode/Internal.pm Encode extension ext/Encode/lib/Encode/iso10646_1.pm Encode extension @@ -1145,8 +1145,8 @@ lib/Math/BigInt/t/calling.t Test calling conventions lib/Math/BigInt/t/config.t Test Math::BigInt->config() lib/Math/BigInt/t/constant.t Test Math::BigInt/BigFloat under :constant lib/Math/BigInt/t/downgrade.t Test if use Math::BigInt(); under downgrade works -lib/Math/BigInt/t/isa.t Test for Math::BigInt inheritance lib/Math/BigInt/t/inf_nan.t Special tests for inf and NaN handling +lib/Math/BigInt/t/isa.t Test for Math::BigInt inheritance lib/Math/BigInt/t/mbimbf.inc Actual BigInt/BigFloat accuracy, precicion and fallback, round_mode tests lib/Math/BigInt/t/mbimbf.t BigInt/BigFloat accuracy, precicion and fallback, round_mode lib/Math/BigInt/t/require.t Test if require Math::BigInt works diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm index 7f828d3..b2d1795 100644 
--- a/ext/Encode/CN/CN.pm +++ b/ext/Encode/CN/CN.pm @@ -1,9 +1,14 @@ package Encode::CN; -use Encode; our $VERSION = '0.02'; + +use Encode; +use Encode::CN::HZ; use XSLoader; XSLoader::load('Encode::CN',$VERSION); +local $@; +eval "use Encode::HanExtra"; # load extra encodings if they exist + 1; __END__ =head1 NAME @@ -25,7 +30,8 @@ Encodings supported are as follows. gb2312 The raw (low-bit) GB2312 character map gb12345 Traditional chinese counterpart to GB2312 (raw) iso-ir-165 GB2312 + GB6345 + GB8565 + additions - cp936 Code Page 936, also known as GBK (Extended GuoBiao) + cp936 Code Page 936, also known as GBK (Extended GuoBiao) + hz 7-bit escaped GB2312 encoding To find how to use this module in detail, see L<Encode>. @@ -35,9 +41,10 @@ Due to size concerns, C<GB 18030> (an extension to C<GBK>) is distributed separately on CPAN, under the name L<Encode::HanExtra>. That module also contains extra Taiwan-based encodings. -=head1 BUGS +This module will automatically load L<Encode::HanExtra> if you have it on +your machine. -The C<HZ> (Hanzi) escaped encoding is not supported. +=head1 BUGS ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index d0bb788..445dd24 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -173,7 +173,6 @@ define_alias( qr/^gbk$/i => '"cp936"'); # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 # TODO: HP-UX '15' encodings japanese15 korean15 roi15 # TODO: Cyrillic encoding ISO-IR-111 (useful?) 
-# TODO: Chinese encodings HZ # TODO: Armenian encoding ARMSCII-8 # TODO: Hebrew encoding ISO-8859-8-1 # TODO: Thai encoding TCVN diff --git a/ext/Encode/Encode/HZ.enc b/ext/Encode/Encode/HZ.enc deleted file mode 100644 index 748ee0b..0000000 --- a/ext/Encode/Encode/HZ.enc +++ /dev/null @@ -1,7 +0,0 @@ -# Encoding file: HZ, HanZi -H -name HZ -init {} -final {} -ascii \x7e\x7d -gb2312 \x7e\x7b diff --git a/ext/Encode/KR/KR.pm b/ext/Encode/KR/KR.pm index aa24281..9936c5d 100644 --- a/ext/Encode/KR/KR.pm +++ b/ext/Encode/KR/KR.pm @@ -1,6 +1,7 @@ package Encode::KR; -use Encode; our $VERSION = '0.02'; + +use Encode; use XSLoader; XSLoader::load('Encode::KR',$VERSION); @@ -23,7 +24,7 @@ are as follows. euc-kr EUC (Extended Unix Character) ksc5601 Korean standard code set - cp949 Code Page 949 (EUC-KR + Unified Hangul Code) + cp949 Code Page 949 (EUC-KR + Unified Hangul Code) To find how to use this module in detail, see L<Encode>. diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index bf34b59..6300a73 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -95,7 +95,6 @@ Encode/gb12345.enc Encode/gb1988.enc Encode/gb2312.enc Encode/gsm0338.enc -Encode/HZ.enc Encode/iso-ir-165.enc Encode/ir-197.enc Encode/jis0201.enc @@ -155,6 +154,7 @@ lib/Encode/ucs2_le.pm lib/Encode/Unicode.pm lib/Encode/utf8.pm lib/Encode/XS.pm +lib/Encode/CN/HZ.pm lib/Encode/Tcl/Escape.pm lib/Encode/Tcl/Extended.pm lib/Encode/Tcl/HanZi.pm diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm index 90b0460..c3f64fd 100644 --- a/ext/Encode/TW/TW.pm +++ b/ext/Encode/TW/TW.pm @@ -1,9 +1,13 @@ package Encode::TW; -use Encode; our $VERSION = '0.02'; + +use Encode; use XSLoader; XSLoader::load('Encode::TW',$VERSION); +local $@; +eval "use Encode::HanExtra"; # load extra encodings if they exist + 1; __END__ =head1 NAME @@ -23,7 +27,7 @@ Encodings supported are as follows. 
big5 The original Big5 encoding big5-hkscs Big5 plus Cantonese characters in Hong Kong - cp950 Code Page 950 (Big5 + Microsoft vendor mappings) + cp950 Code Page 950 (Big5 + Microsoft vendor mappings) To find how to use this module in detail, see L<Encode>. @@ -33,6 +37,9 @@ Due to size concerns, C<EUC-TW> (Extended Unix Character) and C<BIG5PLUS> (CMEX's Big5+) are distributed separately on CPAN, under the name L<Encode::HanExtra>. That module also contains extra China-based encodings. +This module will automatically load L<Encode::HanExtra> if you have it on +your machine. + =head1 BUGS The C<CNS11643> encoding files are not complete (only the first two planes, diff --git a/ext/Encode/lib/Encode/CN/HZ.pm b/ext/Encode/lib/Encode/CN/HZ.pm new file mode 100644 index 0000000..a57ae8a --- /dev/null +++ b/ext/Encode/lib/Encode/CN/HZ.pm @@ -0,0 +1,50 @@ +package Encode::CN::HZ; + +use Encode::CN; +use Encode qw|encode decode|; +use base 'Encode::Encoding'; + +use strict; + +# HZ is but escaped GB, so we implement it with the +# GB2312(raw) encoding here. Cf. RFC 1842 & 1843. + +my $canon = 'hz'; +my $obj = bless {name => $canon}, __PACKAGE__; +$obj->Define($canon); + +sub decode +{ + my ($obj,$str,$chk) = @_; + my $gb = Encode::find_encoding('gb2312'); + + $str =~ s{~(?:(~)|\n|{([^~]*)~}|)} + {$1 ? '~' : defined $2 ? $gb->decode($2, $chk) : ''}eg; + + return $str; +} + +sub encode +{ + my ($obj,$str,$chk) = @_; + my $gb = Encode::find_encoding('gb2312'); + + $str =~ s/~/~~/g; + $str =~ s/((?: + \p{InCJKCompatibility}| + \p{InCJKCompatibilityForms}| + \p{InCJKCompatibilityIdeographs}| + \p{InCJKCompatibilityIdeographsSupplement}| + \p{InCJKRadicalsSupplement}| + \p{InCJKSymbolsAndPunctuation}| + \p{InCJKUnifiedIdeographsExtensionA}| + \p{InCJKUnifiedIdeographs}| + \p{InCJKUnifiedIdeographsExtensionB}| + \p{InEnclosedCJKLettersAndMonths} + )+)/'~{'.$gb->encode($1, $chk).'~}'/egx; + + return $str; +} + +1; +__END__