From: Jarkko Hietaniemi Date: Sun, 24 Mar 2002 15:53:59 +0000 (+0000) Subject: Upgrade to Encode 0.98, from Dan Kogai. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a63c962f6b57d9d07801c81cc6e7f8a1b904a8c5;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 0.98, from Dan Kogai. p4raw-id: //depot/perl@15467 --- diff --git a/MANIFEST b/MANIFEST index 26bbcda..2b4c0bf 100644 --- a/MANIFEST +++ b/MANIFEST @@ -355,12 +355,14 @@ ext/Encode/KR/Makefile.PL Encode extension ext/Encode/lib/Encode/Alias.pm Encode extension ext/Encode/lib/Encode/CN/HZ.pm Encode extension ext/Encode/lib/Encode/Details.pod Encode extension +ext/Encode/lib/Encode/EncFormat.pod Encode extension ext/Encode/lib/Encode/Encoding.pm Encode extension ext/Encode/lib/Encode/Internal.pm Encode extension ext/Encode/lib/Encode/iso10646_1.pm Encode extension ext/Encode/lib/Encode/JP/Constants.pm Encode extension ext/Encode/lib/Encode/JP/H2Z.pm Encode extension ext/Encode/lib/Encode/JP/ISO_2022_JP.pm Encode extension +ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm Encode extension ext/Encode/lib/Encode/JP/JIS.pm Encode extension ext/Encode/lib/Encode/Supported.pod Encode extension ext/Encode/lib/Encode/Tcl.pm Encode extension @@ -370,7 +372,6 @@ ext/Encode/lib/Encode/ucs2_le.pm Encode extension ext/Encode/lib/Encode/Unicode.pm Encode extension ext/Encode/lib/Encode/utf8.pm Encode extension ext/Encode/lib/Encode/XS.pm Encode extension -ext/Encode/lib/EncodeFormat.pod Encode extension ext/Encode/Makefile.PL Encode extension makefile writer ext/Encode/MANIFEST Encode extension ext/Encode/README Encode extension diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm index 51d90bb..0a468f9 100644 --- a/ext/Encode/CN/CN.pm +++ b/ext/Encode/CN/CN.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::CN not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x 
$#r, @r }; use Encode; use Encode::CN::HZ; @@ -58,7 +58,7 @@ also contains extra Taiwan-based encodings. ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See -F +L to find why it is implemented that way. diff --git a/ext/Encode/Changes b/ext/Encode/Changes index a981280..0054d25 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,8 +1,62 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 0.97 2002/03/23 20:24:42 dankogai Exp dankogai $ +# $Id: Changes,v 0.98 2002/03/24 15:43:37 dankogai Exp dankogai $ # +0.98 Mon Mar 25 2002 +! lib/Encode/Supported.pod + Further pod fixes ++ lib/Encode/JP/ISO_2022_JP_1.pm +! lib/Encode/JP/ISO_2022_JP.pm +! lib/Encode/JP/JIS.pm +! JP/JP.pm + Now Encode::JP is more strict on the difference between ISO-2022-JP + and ISO-2022-JP-1. See JP/JP.pm for details. I hope this move + makes Anton happier :) FYI the previous version implements + ISO-2022-JP as ISO-2022-JP-1 since it had X0212 support. +! lib/Encode/Supported.pod + Further pod fixes +! Encode.xs + Avoid core-dump in Encode with PERLIO=mmap by NI-S + Message-Id: <20020324104139.1326.7@bactrian.ni-s.u-net.com> +! CN/CN.pm +! JP/JP.pm +! KR/KR.pm +! TW/TW.pm +! lib/Encode/Suppoted.pod + pod fixes to replace F to L, + as suggested by Autrijius in: + Message-Id: <20020324083943.GA14901@not.autrijus.org> +! lib/Encode/Suppoted.pod + fixes and enhancements by Anton + Message-Id: <10632060120.20020324103753@motor.ru> +! lib/Encode/Alias.pm + > define_alias( qr/^GB[- ]?(\d+)$/i => '"gb$1"' ); + added. Suggested by Anton then deobfuscated by Autrijius + Message-Id: <20020324064455.GA3667@not.autrijus.org> +! compile + Further fix by Nicholas Clark + Message-Id: <20020323145840.GD304@Bagpuss.unfortu.net> +- lib/EncodeFormat.pod ++ lib/Encode/EncFormat.pod +! MANIFEST + File renamed as suggested by Autrijius +! Encode.pm +! lib/Encode/Details.pod +! 
lib/Encode/Supported.pod Sun Mar 24 13:29:35 2002 +! Encode.pm Sun Mar 24 13:43:47 2002 + pod fixes by Autrijius. + Message-Id: <20020324062804.GA3595@not.autrijus.org> + Message-Id: <20020324075627.GB11986@not.autrijus.org> +! t/Alias.t +! lib/Encode/Alias.pm +! Encode.pm + now more EBCDIC conscious; + %ExtModules on EBCDIC system excludes CJK so that you don't + have to worry about the matched alias resulting cloaking. + t/Alias.t also revised to reflect changes. Verified by jhi + Message-Id: <20020324022929.D22596@alpha.hut.fi> + 0.97 Sun Mar 24 2002 ! CN/CN.pm ! KR/KR.pm diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 7886c63..39953d0 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,6 +1,6 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require DynaLoader; @@ -37,6 +37,7 @@ bootstrap Encode (); use Carp; +our $ON_EBCDIC = (ord("A") == 193); use Encode::Alias; # Make a %Encoding package variable to allow a certain amount of cheating @@ -51,27 +52,6 @@ our %ExtModule = 'posix-bc' => 'Encode/EBCDIC.pm', symbol => 'Encode/Symbol.pm', dingbats => 'Encode/Symbol.pm', - 'euc-cn' => 'Encode/CN.pm', - gb2312 => 'Encode/CN.pm', - gb12345 => 'Encode/CN.pm', - gbk => 'Encode/CN.pm', - cp936 => 'Encode/CN.pm', - 'iso-ir-165' => 'Encode/CN.pm', - 'euc-jp' => 'Encode/JP.pm', - 'iso-2022-jp' => 'Encode/JP.pm', - '7bit-jis' => 'Encode/JP.pm', - shiftjis => 'Encode/JP.pm', - macjapan => 'Encode/JP.pm', - cp932 => 'Encode/JP.pm', - 'euc-kr' => 'Encode/KR.pm', - ksc5601 => 'Encode/KR.pm', - cp949 => 'Encode/KR.pm', - big5 => 'Encode/TW.pm', - 'big5-hkscs' => 'Encode/TW.pm', - cp950 => 'Encode/TW.pm', - gb18030 => 'Encode/HanExtra.pm', - big5plus => 'Encode/HanExtra.pm', - 'euc-tw' => 'Encode/HanExtra.pm', ); for my $k (2..11,13..16){ @@ -82,6 +62,34 @@ for my $k 
(1250..1258){ $ExtModule{"cp$k"} = 'Encode/Byte.pm'; } +unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env +%ExtModule =( + %ExtModule, + 'euc-cn' => 'Encode/CN.pm', + gb2312 => 'Encode/CN.pm', + gb12345 => 'Encode/CN.pm', + gbk => 'Encode/CN.pm', + cp936 => 'Encode/CN.pm', + 'iso-ir-165' => 'Encode/CN.pm', + 'euc-jp' => 'Encode/JP.pm', + 'iso-2022-jp' => 'Encode/JP.pm', + 'iso-2022-jp-1' => 'Encode/JP.pm', + '7bit-jis' => 'Encode/JP.pm', + shiftjis => 'Encode/JP.pm', + macjapan => 'Encode/JP.pm', + cp932 => 'Encode/JP.pm', + 'euc-kr' => 'Encode/KR.pm', + ksc5601 => 'Encode/KR.pm', + cp949 => 'Encode/KR.pm', + big5 => 'Encode/TW.pm', + 'big5-hkscs' => 'Encode/TW.pm', + cp950 => 'Encode/TW.pm', + gb18030 => 'Encode/HanExtra.pm', + big5plus => 'Encode/HanExtra.pm', + 'euc-tw' => 'Encode/HanExtra.pm', + ); +} + for my $k (qw(centeuro croatian cyrillic dingbats greek iceland roman rumanian sami thai turkish ukraine)) @@ -234,7 +242,7 @@ The C module provides the interfaces between Perl's strings and the rest of the system. Perl strings are sequences of B. To find more about character encodings, please consult -L . This document focuses on programming references. +L. This document focuses on programming references. =head1 PERL ENCODING API @@ -242,9 +250,7 @@ L . This document focuses on programming references. =over 4 -=item * - - $bytes = encode(ENCODING, $string[, CHECK]) +=item $bytes = encode(ENCODING, $string[, CHECK]) Encodes string from Perl's internal form into I and returns a sequence of octets. For CHECK see L. @@ -254,9 +260,7 @@ to octets: $octets = encode("utf8", $unicode); -=item * - - $string = decode(ENCODING, $bytes[, CHECK]) +=item $string = decode(ENCODING, $bytes[, CHECK]) Decode sequence of octets assumed to be in I into Perl's internal form and returns the resulting string. 
For CHECK see @@ -266,9 +270,7 @@ For example to convert ISO-8859-1 data to UTF-8: $utf8 = decode("latin1", $latin1); -=item * - - from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK]) +=item from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK]) Convert B the data between two encodings. How did the data in $string originally get to be in FROM_ENCODING? Either using @@ -342,32 +344,28 @@ Hybrids of above. Multiple return values rather than in-place modifications. -Index into the string could be pos($str) allowing s/\G...//. +Index into the string could be C allowing C. =back =head2 UTF-8 / utf8 The Unicode consortium defines the UTF-8 standard as a way of encoding -the entire Unicode repertiore as sequences of octets. This encoding is -expected to become very widespread. Perl can use this form internaly +the entire Unicode repertoire as sequences of octets. This encoding is +expected to become very widespread. Perl can use this form internally to represent strings, so conversions to and from this form are particularly efficient (as octets in memory do not have to change, just the meta-data that tells Perl how to treat them). =over 4 -=item * - - $bytes = encode_utf8($string); +=item $bytes = encode_utf8($string); The characters that comprise string are encoded in Perl's superset of UTF-8 and the resulting octets returned as a sequence of bytes. All possible characters have a UTF-8 representation so this function cannot fail. -=item * - - $string = decode_utf8($bytes [,CHECK]); +=item $string = decode_utf8($bytes [, CHECK]); The sequence of octets represented by $bytes is decoded from UTF-8 into a sequence of logical characters. Not all sequences of octets @@ -391,16 +389,17 @@ Or you can give the name of specific module. @with_jp = Encode->encodings("Encode/JP.pm"); -Note in this case you have to say "Encode/JP.pm instead of Encode::JP. +Note in this case you have to say C<"Encode/JP.pm"> instead of +C<"Encode::JP">. 
-To find which encodings are suppoted by this package in details, +To find which encodings are supported by this package in details, see L. =head2 Defining Aliases use Encode; use Encode::Alias; - define_alias( newName => ENCODING); + define_alias(newName => ENCODING); Allows newName to be used as am alias for ENCODING. ENCODING may be either the name of an encoding or and encoding object (as above). @@ -410,7 +409,7 @@ See L on details. =head1 Defining Encodings use Encode qw(define_alias); - define_encoding( $object, 'canonicalName' [,alias...]); + define_encoding($object, 'canonicalName' [, alias...]); Causes I to be associated with I<$object>. The object should provide the interface described in L @@ -490,15 +489,13 @@ implementation. As such they are efficient, but may change. =over 4 -=item * is_utf8(STRING [, CHECK]) +=item is_utf8(STRING [, CHECK]) [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. If CHECK is true, also checks the data in STRING for being well-formed UTF-8. Returns true if successful, false otherwise. -=item * - - _utf8_on(STRING) +=item _utf8_on(STRING) [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is B checked for being well-formed UTF-8. Do not use unless you @@ -506,9 +503,7 @@ B that the STRING is well-formed UTF-8. Returns the previous state of the UTF-8 flag (so please don't test the return value as I success or failure), or C if STRING is not a string. -=item * - - _utf8_off(STRING) +=item _utf8_off(STRING) [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. 
Returns the previous state of the UTF-8 flag (so please don't test the diff --git a/ext/Encode/JP/JP.pm b/ext/Encode/JP/JP.pm index cff0d98..c4cbac1 100644 --- a/ext/Encode/JP/JP.pm +++ b/ext/Encode/JP/JP.pm @@ -5,13 +5,14 @@ BEGIN { } } use Encode; -our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use XSLoader; XSLoader::load('Encode::JP',$VERSION); use Encode::JP::JIS; use Encode::JP::ISO_2022_JP; +use Encode::JP::ISO_2022_JP_1; 1; __END__ @@ -41,6 +42,9 @@ supported are as follows. iso-2022-jp ISO-2022-JP (7bit JIS with all Halfwidth Kana converted to Fullwidth) + iso-2022-jp-1 ISO-2022-JP-1 + (ISO-2022-JP with JIS X 0212-1990 + support. See below) macjapan Mac Japan (Shift JIS + Apple vendor mappings) cp932 Code Page 932 (Shift JIS + MS/IBM vendor mappings) -------------------------------------------------------------------- @@ -49,12 +53,34 @@ supported are as follows. To find how to use this module in detail, see L. +=head1 Note on ISO-2022-JP(-1)? + +ISO-2022-JP-1 (RFC2237) is a superset of ISO-2022-JP (RFC1468) which +adds support for JIS X 0212-1990. That means you can use the same +code to decode to utf8 but not vice versa. + + $utf8 = decode('iso-2022-jp-1', $stream); + $utf8 = decode('iso-2022-jp', $stream); + +Yields the same result but + + $with_0212 = encode('iso-2022-jp-1', $utf8); + +is now different from + + $without_0212 = encode('iso-2022-jp', $utf8 ); + +In the latter case, characters that map to 0212 are at first converted +to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu') then +fed to decoding engine. U+FFFD is not used to preserve text layout as +much as possible. + =head1 BUGS ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See -F +L to find why it is implemented that way. 
diff --git a/ext/Encode/KR/KR.pm b/ext/Encode/KR/KR.pm index 7dcafd0..9e2e1d3 100644 --- a/ext/Encode/KR/KR.pm +++ b/ext/Encode/KR/KR.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::KR not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; @@ -46,7 +46,7 @@ The C (two-byte combination code) encoding is not supported. ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See -F +L to find why it is implemented that way. diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index 24adaca..79ea273 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -169,6 +169,7 @@ lib/Encode/Internal.pm Encode extension lib/Encode/JP/Constants.pm Encode extension lib/Encode/JP/H2Z.pm Encode extension lib/Encode/JP/ISO_2022_JP.pm Encode extension +lib/Encode/JP/ISO_2022_JP_1.pm Encode extension lib/Encode/JP/JIS.pm Encode extension lib/Encode/Supported.pod Documents supported encodings lib/Encode/Tcl.pm Encode extension @@ -179,7 +180,7 @@ lib/Encode/XS.pm Encode extension lib/Encode/iso10646_1.pm Encode extension lib/Encode/ucs2_le.pm Encode extension lib/Encode/utf8.pm Encode extension -lib/EncodeFormat.pod Encode extension +lib/Encode/EncFormat.pod Encode extension t/Aliases.t Encode extension test t/CN.t Encode extension test t/Encode.t Encode extension test diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm index b44c8d2..d1f85c5 100644 --- a/ext/Encode/TW/TW.pm +++ b/ext/Encode/TW/TW.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::TW not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; @@ -54,7 +54,7 @@ plane 1-7. 
ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See -F +L to find why it is implemented that way. diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index 5f7d345..2a97261 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -1,7 +1,7 @@ package Encode::Alias; use strict; use Encode; -our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require Exporter; @@ -31,11 +31,13 @@ sub find_alias my $new; if (ref($alias) eq 'Regexp' && $_ =~ $alias) { + $DEBUG and warn "eval $val"; $new = eval $val; # $@ and warn "$val, $@"; } elsif (ref($alias) eq 'CODE') { + $DEBUG and warn "$alias", "->", "($val)"; $new = $alias->($val); } elsif (lc($_) eq lc($alias)) @@ -45,6 +47,7 @@ sub find_alias if (defined($new)) { next if $new eq $_; # avoid (direct) recursion on bugs + $DEBUG and warn "$alias, $new"; my $enc = (ref($new)) ? $new : Encode::find_encoding($new); if ($enc) { @@ -54,6 +57,15 @@ sub find_alias } } } + if ($DEBUG){ + my $name; + if (my $e = $Alias{$_}){ + $name = $e->name; + }else{ + $name = ""; + } + warn "find_alias($class, $_)->name = $name"; + } return $Alias{$_}; } @@ -69,15 +81,17 @@ sub define_alias for my $k (@a){ if (ref($alias) eq 'Regexp' && $k =~ $alias) { - $DEBUG and warn $k; + $DEBUG and warn "delete \$Alias\{$k\}"; delete $Alias{$k}; } elsif (ref($alias) eq 'CODE') { + $DEBUG and warn "delete \$Alias\{$k\}"; delete $Alias{$alias->($name)}; } } }else{ + $DEBUG and warn "delete \$Alias\{$alias\}"; delete $Alias{$alias}; } } @@ -154,29 +168,29 @@ sub init_aliases define_alias( qr/^macRomanian$/i => '"macRumanian"'); # Standardize on the dashed versions. 
- define_alias( qr/^utf8$/i => 'utf-8' ); + # define_alias( qr/^utf8$/i => 'utf-8' ); define_alias( qr/^koi8r$/i => 'koi8-r' ); define_alias( qr/^koi8u$/i => 'koi8-u' ); -# for Encode::CN - define_alias( qr/euc.*cn$/i => '"euc-cn"' ); - define_alias( qr/cn.*euc/i => '"euc-cn"' ); - -# for Encode::JP - define_alias( qr/euc.*jp$/i => '"euc-jp"' ); - define_alias( qr/jp.*euc/i => '"euc-jp"' ); - define_alias( qr/ujis$/i => '"euc-jp"' ); - define_alias( qr/shift.*jis$/i => '"shiftjis"' ); - define_alias( qr/sjis$/i => '"shiftjis"' ); - define_alias( qr/^jis$/i => '"7bit-jis"' ); - -# for Encode::KR - define_alias( qr/euc.*kr$/i => '"euc-kr"' ); - define_alias( qr/kr.*euc/i => '"euc-kr"' ); - -# for Encode::TW - define_alias( qr/big-?5$/i => '"big5"' ); - define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' ); + unless ($Encode::ON_EBCDIC){ + # for Encode::CN + define_alias( qr/euc.*cn$/i => '"euc-cn"' ); + define_alias( qr/cn.*euc/i => '"euc-cn"' ); + define_alias( qr/^GB[- ]?(\d+)$/i => '"gb$1"' ); + # for Encode::JP + define_alias( qr/euc.*jp$/i => '"euc-jp"' ); + define_alias( qr/jp.*euc/i => '"euc-jp"' ); + define_alias( qr/ujis$/i => '"euc-jp"' ); + define_alias( qr/shift.*jis$/i => '"shiftjis"' ); + define_alias( qr/sjis$/i => '"shiftjis"' ); + define_alias( qr/^jis$/i => '"7bit-jis"' ); + # for Encode::KR + define_alias( qr/euc.*kr$/i => '"euc-kr"' ); + define_alias( qr/kr.*euc/i => '"euc-kr"' ); + # for Encode::TW + define_alias( qr/big-?5$/i => '"big5"' ); + define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' ); + } # At last, Map white space and _ to '-' define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); diff --git a/ext/Encode/lib/Encode/Details.pod b/ext/Encode/lib/Encode/Details.pod index aa3a0af..6721484 100644 --- a/ext/Encode/lib/Encode/Details.pod +++ b/ext/Encode/lib/Encode/Details.pod @@ -1,11 +1,6 @@ - =head1 NAME -Encode - character encodings - -=head1 SYNOPSIS - - use Encode; +Encode::Details - implementation details of Encode.pm =head1 
DESCRIPTION @@ -19,7 +14,7 @@ codepoint" for the character (the exceptions are those platforms where the legacy encoding is some variant of EBCDIC rather than a super-set of ASCII - see L). -Traditionaly computer data has been moved around in 8-bit chunks +Traditionally computer data has been moved around in 8-bit chunks often called "bytes". These chunks are also known as "octets" in networking standards. Perl is widely used to manipulate data of many types - not only strings of characters representing human or @@ -92,7 +87,7 @@ encodings for East Asian languages. Not really very "encoded" encodings. The Unicode code points are just represented as 4-octet integers. None the less because different architectures use different representations of integers -(so called "endian") there at least two disctinct encodings. +(so called "endian") there at least two distinct encodings. =item * Multi-byte encodings @@ -265,7 +260,7 @@ Microsft proprietary. UTF-16 KOI8-U ISO-2022-JP-2 -are IANA-registered preferred MIME names but probably shoule +are IANA-registered preferred MIME names but probably should be avoided as encoding for web pages due to lack of browser support. @@ -412,25 +407,21 @@ Index into the string could be pos($str) allowing s/\G...//. =head2 UTF-8 / utf8 The Unicode consortium defines the UTF-8 standard as a way of encoding -the entire Unicode repertiore as sequences of octets. This encoding is -expected to become very widespread. Perl can use this form internaly +the entire Unicode repertoire as sequences of octets. This encoding is +expected to become very widespread. Perl can use this form internally to represent strings, so conversions to and from this form are particularly efficient (as octets in memory do not have to change, just the meta-data that tells Perl how to treat them). 
=over 4 -=item * - - $bytes = encode_utf8($string); +=item $bytes = encode_utf8($string); The characters that comprise string are encoded in Perl's superset of UTF-8 and the resulting octets returned as a sequence of bytes. All possible characters have a UTF-8 representation so this function cannot fail. -=item * - - $string = decode_utf8($bytes [,CHECK]); +=item $string = decode_utf8($bytes [,CHECK]); The sequence of octets represented by $bytes is decoded from UTF-8 into a sequence of logical characters. Not all sequences of octets @@ -505,10 +496,10 @@ Currently I can be specified in the following ways: define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); In this case if I is not a reference it is C-ed to -allow C<$1> etc. to be subsituted. The example is one way to names as +allow C<$1> etc. to be substituted. The example is one way to names as used in X11 font names to alias the MIME names for the iso-8859-* family. Note the double quote inside the single quote. If you are -using regex here, y ou have to do so or it won't work in this case. +using regex here, you have to do so or it won't work in this case. =item As a code reference, e.g.: @@ -622,15 +613,13 @@ implementation. As such they are efficient, but may change. =over 4 -=item * is_utf8(STRING [, CHECK]) +=item is_utf8(STRING [, CHECK]) [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. If CHECK is true, also checks the data in STRING for being well-formed UTF-8. Returns true if successful, false otherwise. -=item * - - _utf8_on(STRING) +=item _utf8_on(STRING) [INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is B checked for being well-formed UTF-8. Do not use unless you @@ -638,9 +627,7 @@ B that the STRING is well-formed UTF-8. Returns the previous state of the UTF-8 flag (so please don't test the return value as I success or failure), or C if STRING is not a string. 
-=item * - - _utf8_off(STRING) +=item _utf8_off(STRING) [INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. Returns the previous state of the UTF-8 flag (so please don't test the @@ -816,6 +803,4 @@ to be rationalized. L, L, L, L, L, L, the Perl Unicode Mailing List Eperl-unicode@perl.orgE - =cut - diff --git a/ext/Encode/lib/EncodeFormat.pod b/ext/Encode/lib/Encode/EncFormat.pod similarity index 98% rename from ext/Encode/lib/EncodeFormat.pod rename to ext/Encode/lib/Encode/EncFormat.pod index 3a1269d..abb8057 100644 --- a/ext/Encode/lib/EncodeFormat.pod +++ b/ext/Encode/lib/Encode/EncFormat.pod @@ -1,6 +1,6 @@ =head1 NAME -EncodeFormat - the format of encoding tables of the Encode extension +Encode::EncFormat - the format of encoding tables of the Encode/*.enc files =head1 DESCRIPTION diff --git a/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm b/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm index 388be5f..29df750 100644 --- a/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm +++ b/ext/Encode/lib/Encode/JP/ISO_2022_JP.pm @@ -5,7 +5,7 @@ use Encode::JP::H2Z; use base 'Encode::Encoding'; use vars qw($VERSION); -$VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; my $canon = 'iso-2022-jp'; my $obj = bless {name => $canon}, __PACKAGE__; @@ -31,7 +31,7 @@ sub encode my ($obj,$str,$chk) = @_; my $euc = Encode::encode('euc-jp', $str, $chk); &Encode::JP::H2Z::h2z(\$euc); - return &Encode::JP::JIS::euc_jis(\$euc); + return &Encode::JP::JIS::euc_jis_nox0212(\$euc); } 1; diff --git a/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm b/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm new file mode 100644 index 0000000..9b1c319 --- /dev/null +++ b/ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm @@ -0,0 +1,38 @@ +package Encode::JP::ISO_2022_JP_1; +use Encode::JP; +use Encode::JP::JIS; +use Encode::JP::H2Z; +use base 'Encode::Encoding'; + +use vars qw($VERSION); +$VERSION = do 
{ my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +my $canon = 'iso-2022-jp-1'; +my $obj = bless {name => $canon}, __PACKAGE__; +$obj->Define($canon); + +sub name { return $_[0]->{name}; } + +# +# decode is identical to 7bit-jis +# + +sub decode +{ + my ($obj,$str,$chk) = @_; + return Encode::decode('7bit-jis', $str, $chk); +} + +# iso-2022-jp = 7bit-jis with all x201 (Hankaku) converted to +# x208 equivalent (Zenkaku) + +sub encode +{ + my ($obj,$str,$chk) = @_; + my $euc = Encode::encode('euc-jp', $str, $chk); + &Encode::JP::H2Z::h2z(\$euc); + return &Encode::JP::JIS::euc_jis(\$euc); +} + +1; +__END__ diff --git a/ext/Encode/lib/Encode/JP/JIS.pm b/ext/Encode/lib/Encode/JP/JIS.pm index 6e6dd0f..8687821 100644 --- a/ext/Encode/lib/Encode/JP/JIS.pm +++ b/ext/Encode/lib/Encode/JP/JIS.pm @@ -5,7 +5,7 @@ use base 'Encode::Encoding'; use strict; use vars qw($VERSION); -$VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # Just for the time being, we implement jis-7bit # encoding via EUC @@ -77,5 +77,11 @@ sub euc_jis{ $$r_str; } +sub euc_jis_nox0212{ + my $r_str = shift; + $$r_str =~ s/$RE{EUC_0212}/$CHARCODE{UNDEF_EUC}/go; + euc_jis($r_str); +} + 1; __END__ diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index d48d14d..1a9f88e 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -24,7 +24,7 @@ once an operation is in progress. As of Perl 5.8.0, at least the following encodings are recognized. Note that unless otherwise specified, they are all case insensitive -(via alias) and all occurance of spaces are replaced with '-'. In +(via alias) and all occurrance of spaces are replaced with '-'. In other words, "ISO 8859 1" and "iso-8859-1" are identical. 
Encodings are categorized and implemented in several different modules @@ -51,12 +51,12 @@ extended ASCII. For most cases it uses \x80-\xff (upper half) to map non-ASCII characters. ----------------------- - iso-8859-1 latin + (iso-8859-1 is built in) iso-8859-2 latin2 iso-8859-3 latin3 iso-8859-4 latin4 - iso-8859-5 latin - iso-8859-6 latin + iso-8859-5 + iso-8859-6 iso-8859-7 iso-8859-8 iso-8859-9 latin5 @@ -102,8 +102,9 @@ non-ASCII characters. =head2 The CJK: Chinese, Japanese, Korean (Multibyte) Note Vietnamese is listed above. Also read "Encoding vs Charset" -below. Also note these are impelemented in distinct module by -languages, due the the size concerns. See these perldocs also. +below. Also note these are implemented in distinct modules by +language, due to size concerns. Please also refer to their +respective documentation pages. =over 4 @@ -125,6 +126,7 @@ languages, due the the size concerns. See these perldocs also. cp932 euc-jp ujis iso-2022-jp + iso-2022-jp-1 macjapan shiftjis Shift_JIS, sjis ----------------------- @@ -172,7 +174,7 @@ See perlebcdic for details. posix-bc ----------------------- -=item Enocode::Symbols +=item Encode::Symbols For symbols and dingbats. @@ -193,70 +195,105 @@ Charset determines which characters to be included in a given text. Encoding actually maps charset(s) to stream of bits. -Note a given encoding contains multiple charsets. For instance, -euc-jp contains ASCII, JIS X 0201 (Hankaku Kana), JIS X 0208 (Zenkaku -Kana and Kanji) and JIS X 0212 (Extended Kanji) in a single encoding. +Note a given encoding may contain multiple charsets and complex CJK +encodings are usually implemented that way. + +For instance, euc-jp contains ASCII, JIS X 0201-1978 (Hankaku Kana), +JIS X 0208-1997 (Zenkaku Kana and Kanji) and JIS X 0212-1990 (Extended +Kanji) in a single encoding. As the name suggests, the Encode module supports encodings, not individual charsets.
-=head1 Encoding Classification (by Anton Tagunov) +=head1 Encoding Classification (by Anton Tagunov and Dan Kogai) + +This section tries to classify the supported encodings by their +applicability for information exchange over the Internet and to +choose the most suitable aliases to name them in the context of +such communication. + +Encoding names -Encodings + US-ASCII UTF-8 + ISO-8859-* KOI8-R + Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1 + EUC-KR + Big5 - US-ASCII UTF-8 KOI8-R ISO-8859-* - ISO-2022-CN ISO-2022-JP Big5 - EUC-CN EUC-JP EUC-KR +are L-registered as +preferred MIME names and may probably be used over the Internet. -are -registered as -preferred MIME names and may probably be used over the Internet. So is +C is no longer Microsoft proprietary since it has been +made official by JIS X 0208-1997. It is probably the most +widespread encoding for Japanese on the Internet. - Shift_JIS + EUC-CN -but despite its wide spread it bears the label of being -Microsft proprietary -- was. Now Shift JIS is official as of -JIS X 0208-1997. +has not been registered with IANA (as of March 2002) but +seems to be supported by major web browsers. (IANA has registered +this encoding as C, but C currently has a different +meaning to the C module. It will probably become an alias to +C in the future; until then it is safer to avoid using +C as an encoding name within Perl). - UTF-16 KOI8-U + UTF-16 + KOI8-U (http://www.faqs.org/rfcs/rfc2319.html) -are IANA-registered preferred MIME names but probably -shoule be avoided as encoding for web pages due to lack of -browser support. +are IANA-registered (C even as a preferred MIME name) +but probably should be avoided as encoding for web pages due to +lack of browser support.
- ISO-2022 (http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM) - ISO-2022-JP-1 (http://www.faqs.org/rfcs/rfc2237.html) ISO-IR-165 (http://www.faqs.org/rfcs/rfc1345.html) GBK VISCII - GB 12345 (only plains 1 and 2 available) - GB 18030 - CNS 11643 + GB 12345 + GB 18030 (*) (see links below) + EUC-TW (*) are totally valid encodings but not registered at IANA. +The names under which they are listed here are probably the +most widely-known names for these encodings and are recommended +names. + + +=for comment this used to be listed as supported but - BIG5PLUS - EUC-JP-0212 (Encode::lib::Encode::Tcl::Extended) +do not work @15457 when it's clear they will be uncommented +or deleted - Anton +ISO-2022 (http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM) +CNS 11643 (only planes 1 and 2 available) -are a bit proprietary + BIG5PLUS (*) + +is a bit proprietary name. C<(*)>-marked encodings belong to +C available from CPAN. You may probably get some info on CJK encodings at brief description for most of the mentioned CJK encodings - -F +L several years old, but still useful - -F +L and some in-depth reading for the heroes :-) -F (eq ISO-2022) +L (eq C) + +gives brief info on C, C and mostly on C +F + +The nature of information in this section is most fragile and +error-prone; I is the most popular adverb :) +Please feel free to send your comments, disagreements and +additions to L<...>. (Note however, +that the mission of this document is to cover the +C-supported encodings only.)
=head1 See Also L, L, -L, L, L, L +L, L, L, L, L, L =cut diff --git a/ext/Encode/t/Aliases.t b/ext/Encode/t/Aliases.t index 3640f4b..8fe298b 100644 --- a/ext/Encode/t/Aliases.t +++ b/ext/Encode/t/Aliases.t @@ -3,66 +3,72 @@ use strict; use Encode; use Encode::Alias; +my %a2c; +my $ON_EBCDIC; BEGIN { - if (ord("A") == 193) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; + $ON_EBCDIC = ord("A") == 193; + @ARGV and $ON_EBCDIC = $ARGV[0] eq 'EBCDIC'; + $Encode::ON_EBCDIC = $ON_EBCDIC; + + %a2c = ( + 'ascii' => 'US-ascii', + 'cyrillic' => 'iso-8859-5', + 'arabic' => 'iso-8859-6', + 'greek' => 'iso-8859-7', + 'hebrew' => 'iso-8859-8', + 'thai' => 'iso-8859-11', + 'tis620' => 'iso-8859-11', + 'WinLatin1' => 'cp1252', + 'WinLatin2' => 'cp1250', + 'WinCyrillic' => 'cp1251', + 'WinGreek' => 'cp1253', + 'WinTurkish' => 'cp1254', + 'WinHebrew' => 'cp1255', + 'WinArabic' => 'cp1256', + 'WinBaltic' => 'cp1257', + 'WinVietnamese' => 'cp1258', + 'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp', + 'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp', + 'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn', + 'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn', + 'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr', + 'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr', + 'ujis' => $ON_EBCDIC ? '' : 'euc-jp', + 'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis', + 'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis', + 'jis' => $ON_EBCDIC ? '' : '7bit-jis', + 'big-5' => $ON_EBCDIC ? '' : 'big5', + 'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5', + 'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs', + ); + + for my $i (1..11,13..16){ + $a2c{"ISO 8859 $i"} = "iso-8859-$i"; + } + for my $i (1..10){ + $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]"; + } + for my $k (keys %Encode::Alias::Winlatin2cp){ + my $v = $Encode::Alias::Winlatin2cp{$k}; + $a2c{"Win" . ucfirst($k)} = "cp" . $v; + $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . 
$v; } } -my %a2c; - -BEGIN { - %a2c = ( - 'ascii' => 'US-ascii', - 'cyrillic' => 'iso-8859-5', - 'arabic' => 'iso-8859-6', - 'greek' => 'iso-8859-7', - 'hebrew' => 'iso-8859-8', - 'thai' => 'iso-8859-11', - 'tis620' => 'iso-8859-11', - 'ja_JP.euc' => 'euc-jp', - 'x-euc-jp' => 'euc-jp', - 'zh_CN.euc' => 'euc-cn', - 'x-euc-cn' => 'euc-cn', - 'ko_KR.euc' => 'euc-kr', - 'x-euc-kr' => 'euc-kr', - 'ujis' => 'euc-jp', - 'Shift_JIS' => 'shiftjis', - 'x-sjis' => 'shiftjis', - 'jis' => '7bit-jis', - 'big-5' => 'big5', - 'zh_TW.Big5' => 'big5', - 'big5-hk' => 'big5-hkscs', - 'WinLatin1' => 'cp1252', - 'WinLatin2' => 'cp1250', - 'WinCyrillic' => 'cp1251', - 'WinGreek' => 'cp1253', - 'WinTurkish' => 'cp1254', - 'WinHebrew' => 'cp1255', - 'WinArabic' => 'cp1256', - 'WinBaltic' => 'cp1257', - 'WinVietnamese' => 'cp1258', - ); - - for my $i (1..11,13..16){ - $a2c{"ISO 8859 $i"} = "iso-8859-$i"; - } - for my $i (1..10){ - $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]"; - } - for my $k (keys %Encode::Alias::Winlatin2cp){ - my $v = $Encode::Alias::Winlatin2cp{$k}; - $a2c{"Win" . ucfirst($k)} = "cp" . $v; - $a2c{"IBM-$v"} = "cp" . $v; - $a2c{"MS-$v"} = "cp" . 
$v; - } +if ($ON_EBCDIC){ + delete @Encode::ExtModule{ + qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165 + euc-jp iso-2022-jp 7bit-jis shiftjis macjapan cp932 + euc-kr ksc5601 cp949 + big5 big5-hkscs cp950 + gb18030 big5plus euc-tw) + }; } use Test::More tests => (scalar keys %a2c) * 3; -print "# alias test\n"; +print "# alias test; \$ON_EBCDIC == $ON_EBCDIC\n"; foreach my $a (keys %a2c){ my $e = Encode::find_encoding($a); @@ -71,10 +77,20 @@ foreach my $a (keys %a2c){ # now we override some of the aliases and see if it works fine -define_alias( qr/shift.*jis$/i => '"macjapan"' ); -define_alias( qr/sjis$/i => '"cp932"' ); +define_alias(ascii => 'WinLatin1', + cyrillic => 'WinCyrillic', + arabic => 'WinArabic', + greek => 'WinGreek', + hebrew => 'WinHebrew'); -@a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932); +@a2c{qw(ascii cyrillic arabic greek hebrew)} = + qw(cp1252 cp1251 cp1256 cp1253 cp1255); + +unless ($ON_EBCDIC){ + define_alias( qr/shift.*jis$/i => '"macjapan"', + qr/sjis$/i => '"cp932"' ); + @a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932); +} print "# alias test with alias overrides\n";