From: Jarkko Hietaniemi Date: Tue, 9 Apr 2002 20:05:59 +0000 (+0000) Subject: Upgrade to Encode 1.32, from Dan Kogai. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=fcb875d4741190d167d6a2773683d7421a2a5279;hp=939256b9e8d4f2f1dde5c4e1158e5152ff270bb2;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 1.32, from Dan Kogai. p4raw-id: //depot/perl@15834 --- diff --git a/MANIFEST b/MANIFEST index 548b4f2..2a742f9 100644 --- a/MANIFEST +++ b/MANIFEST @@ -226,13 +226,14 @@ ext/Encode/TW/TW.pm Encode extension ext/Encode/bin/enc2xs Encode module generator ext/Encode/bin/piconv iconv by perl ext/Encode/bin/ucm2table Table Generator for testing +ext/Encode/bin/ucmlint A UCM Lint utility ext/Encode/bin/unidump Unicode Dump like hexdump(1) ext/Encode/encengine.c Encode extension ext/Encode/encoding.pm Perl Pragmactic Module ext/Encode/lib/Encode/Alias.pm Encode extension ext/Encode/lib/Encode/CJKConstants.pm Encode extension ext/Encode/lib/Encode/CN/HZ.pm Encode extension -ext/Encode/lib/Encode/Config.pm Encode configuration module +ext/Encode/lib/Encode/Config.pm Encode configuration module ext/Encode/lib/Encode/Encoder.pm OO Encoder ext/Encode/lib/Encode/Encoding.pm Encode extension ext/Encode/lib/Encode/JP/2022_JP.pm Encode extension @@ -246,11 +247,12 @@ ext/Encode/lib/Encode/XS.pm Encode extension ext/Encode/t/Aliases.t Encode extension test ext/Encode/t/CN.t Encode extension test ext/Encode/t/Encode.t Encode extension test -ext/Encode/t/Encoder.t Encode::Encoder test +ext/Encode/t/Encoder.t Encode::Encoder test ext/Encode/t/JP.t Encode extension test ext/Encode/t/KR.t Encode extension test ext/Encode/t/TW.t Encode extension test ext/Encode/t/Unicode.t Encode extension test +ext/Encode/t/bogus.ucm Sample data for ucmlint ext/Encode/t/encoding.t encoding extension test ext/Encode/t/gb2312.euc test data ext/Encode/t/gb2312.ref test data @@ -344,12 +346,9 @@ ext/Encode/ucm/macChinsimp.ucm Unicode Character Map ext/Encode/ucm/macChintrad.ucm Unicode Character Map ext/Encode/ucm/macCroatian.ucm Unicode Character Map ext/Encode/ucm/macCyrillic.ucm Unicode Character Map -ext/Encode/ucm/macDevanaga.ucm Unicode Character Map ext/Encode/ucm/macDingbats.ucm Unicode Character Map ext/Encode/ucm/macFarsi.ucm Unicode Character Map ext/Encode/ucm/macGreek.ucm Unicode Character Map -ext/Encode/ucm/macGujarati.ucm Unicode Character Map -ext/Encode/ucm/macGurmukhi.ucm Unicode Character Map ext/Encode/ucm/macHebrew.ucm Unicode Character Map ext/Encode/ucm/macIceland.ucm Unicode Character Map ext/Encode/ucm/macJapanese.ucm Unicode Character Map diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS index 4fde1a4..fce9f0d 100644 --- a/ext/Encode/AUTHORS +++ b/ext/Encode/AUTHORS @@ -23,5 +23,6 @@ Michael G Schwern Nicholas Clark Nick Ing-Simmons Paul Marquess +Philip Newton SADAHIRO Tomoyuki Spider Boardman diff --git a/ext/Encode/Changes b/ext/Encode/Changes index aaacc8b..9e84446 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,53 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.31 2002/04/08 18:08:07 dankogai Exp dankogai $ +# $Id: Changes,v 1.32 2002/04/09 20:06:15 dankogai Exp dankogai $ # -1.31 $Date: 2002/04/08 18:08:07 $ +1.32 $Date: 2002/04/09 20:06:15 $ ++ bin/ucmlint ++ t/bogus.ucm +- ucm/macDevanaga.ucm Unicode Character Map +- ucm/macGujarati.ucm Unicode Character Map +- ucm/macGurmukhi.ucm Unicode Character Map + A utility to check integrity of .ucm files. t/bogus.ucm is a + ucm that is deliberately bogus. unused Indic mappings are removed + for the time being. +! Encode.pm + resolve_alias() added as suggested by jhi. Same as + find_encoding("alias")->name. For convenience. This one is + defined in Encode.pm instead of Alias.pm. + Message-Id: <20020409215846.H17022@alpha.hut.fi> +! Encode.xs + Memory Allocate but detected during the devel of ucmlint -- fixed. + Message-Id: +! lib/Encode/Unicode.pm + valid_ucs2(0) is false but must be true. + 3 patches from NI-S as follows. This also has fixed the incident + Andy has reported. +! lib/Encode/Alias.pm + find_alias() recursion prevention +! t/Aliases.t + Checks for the patch above +! t/Encode/Unicode.pm + An extra "F" that causes valid_ucs2() return a bogus value fixed + Message-Id: <20020409133927.17803.1@bactrian.elixent.com> + Message-Id: + 2 Small Patches from jhi as follows: +! Encode.pm + Encode->encodings() lists in case-insensitve order (as it was) +! bin/piconv + -l option prints avaiable encodings to STDOUT instead of STDERR +! lib/Encode/Aliases.pm + s/defintion/definition/ + Message-Id: <200204082306.CAA21033@alpha.hut.fi> +! AUTHORS +! lib/Encode/Supported.pod +! lib/Encode/Unicode.pm + POD revise by Philip Newton. This adds Philip to AUTHORS list. + Thank you for the exact quote of Douglas Adams :) + Message-Id: <22s3bu4gpvhhsses64nj3afuu0lo927rv3@4ax.com> + +1.31 2002/04/08 18:08:07 ! lib/Encode/Encoder.pm + t/Encoder.t Encode::Encoder, once just a placeholder of an idea, is now much more @@ -189,7 +233,7 @@ Typo fixes and improvements by jhi Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al. -1.11 $Date: 2002/04/08 18:08:07 $ +1.11 $Date: 2002/04/09 20:06:15 $ + t/encoding.t + t/jperl.t ! MANIFEST diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 0e89009..efc77d8 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,6 +1,6 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require DynaLoader; @@ -10,25 +10,26 @@ our @ISA = qw(Exporter DynaLoader); # Public, encouraged API is exported by default our @EXPORT = qw ( - encode decode - encode_utf8 decode_utf8 - find_encoding + encode + encode_utf8 encodings + find_encoding ); our @EXPORT_OK = qw( + _utf8_off + _utf8_on define_encoding from_to - is_utf8 - is_8bit is_16bit - utf8_upgrade + is_8bit + is_utf8 + resolve_alias utf8_downgrade - _utf8_on - _utf8_off + utf8_upgrade ); bootstrap Encode (); @@ -117,6 +118,12 @@ sub find_encoding return __PACKAGE__->getEncoding($name,$skip_external); } +sub resolve_alias { + my $obj = find_encoding(shift); + defined $obj and return $obj->name; + return; +} + sub encode { my ($name,$string,$check) = @_; @@ -444,6 +451,17 @@ After that, newName can be used as an alias for ENCODING. ENCODING may be either the name of an encoding or an I +But before you do so, make sure the alias is nonexistent with +C, which returns the canonical name thereof. +i.e. + + Encode::resolve_alias("latin1") eq "iso-8859-1" # true + Encode::resolve_alias("iso-8859-12") # false; nonexistent + Encode::resolve_alias($name) eq $name # true if $name is canonical + +This resolve_alias() does not need C and is +exported via C. + See L on details. =head1 Encoding and IO diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 741b679..a911866 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -6,7 +6,7 @@ #include "encode.h" #include "def_t.h" -#define ENCODE_XS_PROFILE 0 /* set 1 to profile. +#define ENCODE_XS_PROFILE 0 /* set 1 or more to profile. t/encoding.t dumps core because of Perl_warner and PerlIO don't work well */ @@ -520,11 +520,17 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, switch (code) { case ENCODE_NOSPACE: { - STRLEN more, sleft; + STRLEN more = 0; /* make sure you initialize! */ + STRLEN sleft; sdone += slen; ddone += dlen; sleft = tlen - sdone; - if (sdone) { /* has src ever been processed ? */ +#if ENCODE_XS_PROFILE >= 2 + Perl_warn(aTHX_ + "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", + more, sdone, sleft, SvLEN(dst)); +#endif + if (sdone != 0) { /* has src ever been processed ? */ #if ENCODE_XS_USEFP == 2 more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone - SvLEN(dst); diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index c3deba8..3c4a187 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -28,6 +28,7 @@ TW/TW.pm Encode extension bin/enc2xs Encode module generator bin/piconv iconv by perl bin/ucm2table Table Generator for testing +bin/ucmlint A UCM Lint utility bin/unidump Unicode Dump like hexdump(1) encengine.c Encode extension encoding.pm Perl Pragmactic Module @@ -53,6 +54,7 @@ t/JP.t Encode extension test t/KR.t Encode extension test t/TW.t Encode extension test t/Unicode.t Encode extension test +t/bogus.ucm Sample data for ucmlint t/encoding.t encoding extension test t/gb2312.euc test data t/gb2312.ref test data @@ -146,12 +148,9 @@ ucm/macChinsimp.ucm Unicode Character Map ucm/macChintrad.ucm Unicode Character Map ucm/macCroatian.ucm Unicode Character Map ucm/macCyrillic.ucm Unicode Character Map -ucm/macDevanaga.ucm Unicode Character Map ucm/macDingbats.ucm Unicode Character Map ucm/macFarsi.ucm Unicode Character Map ucm/macGreek.ucm Unicode Character Map -ucm/macGujarati.ucm Unicode Character Map -ucm/macGurmukhi.ucm Unicode Character Map ucm/macHebrew.ucm Unicode Character Map ucm/macIceland.ucm Unicode Character Map ucm/macJapanese.ucm Unicode Character Map diff --git a/ext/Encode/bin/piconv b/ext/Encode/bin/piconv index c375e19..f96512d 100644 --- a/ext/Encode/bin/piconv +++ b/ext/Encode/bin/piconv @@ -1,5 +1,5 @@ #!./perl -# $Id: piconv,v 1.20 2002/04/04 19:50:52 dankogai Exp $ +# $Id: piconv,v 1.21 2002/04/09 20:06:15 dankogai Exp dankogai $ # use 5.7.3; use strict; diff --git a/ext/Encode/bin/ucmlint b/ext/Encode/bin/ucmlint new file mode 100644 index 0000000..a3fe6c8 --- /dev/null +++ b/ext/Encode/bin/ucmlint @@ -0,0 +1,201 @@ +#!/usr/local/bin/perl +# +# $Id: ucmlint,v 0.1 2002/04/09 20:04:30 dankogai Exp $ +# + +use strict; +our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +use Getopt::Std; +our %Opt; +getopts("Dehfv", \%Opt); + +if ($Opt{e}){ + eval{ require Encode; }; + $@ and die "can't load Encode : $@"; +} + +$Opt{h} and help(); +@ARGV or help(); + +sub help{ + print <<""; +$0 -[Dehfv] [ucm files ...] + -D debug mode on + -e test with Encode module also (requires perl 5.7.3 or higher) + -h shows this message + -f forces roundtrip check even for |[123] + -v verbose mode + +} + +$| = 1; +my (%Hdr, %U2E, %E2U); +my $in_charmap = 0; +my $nerror = 0; +my $nwarning = 0; + +sub nit($;$){ + my ($msg, $level) = @_; + my $lstr; + if ($level == 2){ + $lstr = 'notice'; + }elsif ($level == 1){ + $lstr = 'warning'; $nwarning++; + }else{ + $lstr = 'error'; $nerror++; + } + print "$ARGV:$lstr in line $.: $msg\n"; +} + +for $ARGV (@ARGV){ + open UCM, $ARGV or die "$ARGV:$!"; + %Hdr = %U2E = %E2U = (); + $in_charmap = $nerror = $nwarning = 0; + $. = 0; + while(){ + chomp; + s/\s*#.*$//o; /^$/ and next; + if ($_ eq "CHARMAP"){ + $in_charmap = 1; + for my $must (qw/code_set_name mb_cur_min mb_cur_max/){ + exists $Hdr{$must} or nit "<$must> nonexistent"; + } + $Hdr{mb_cur_min} > $Hdr{mb_cur_max} + and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)", + $Hdr{mb_cur_min},$Hdr{mb_cur_max}); + $in_charmap = 1; + next; + } + unless ($in_charmap){ + my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next; + $Opt{D} and warn "$hkey => $hvalue"; + if ($hkey eq "code_set_name"){ # name check + exists $Hdr{code_set_name} + and nit "Duplicate : $hkey"; + } + if ($hkey eq "code_set_alias"){ # alias check + $hvalue eq $Hdr{code_set_name} + and nit qq(alias "$hvalue" is already in ); + } + $Hdr{$hkey} = $hvalue; + }else{ + my $name = $Hdr{code_set_name}; + my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next; + $Opt{v} and nit $_, 2; + my $uni = uniparse($unistr); + my $enc = encparse($encstr); + $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb"; + $fb = $1; + $Opt{f} and $fb = 0; + unless ($fb == 1){ # check uni -> enc + if (exists $U2E{$uni}){ + nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1; + }else{ + $U2E{$uni} = $enc; + if ($Opt{e} and $fb != 3) { + my $e = hex2enc($enc); + my $u = hex2uni($uni); + my $eu = Encode::encode($name, $u); + $e eq $eu + or nit qq(encode('$name', $uni) != $enc); + } + } + } + unless ($fb == 3){ # check enc -> uni + if (exists $E2U{$enc}){ + nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1; + }else{ + $E2U{$enc} = $uni; + if ($Opt{e} and $fb != 1) { + my $e = hex2enc($enc); + my $u = hex2uni($uni); + $Opt{D} and warn "$uni, $enc"; + my $de = Encode::decode($name, $e); + $de eq $u + or nit qq(decode('$name', $enc) != $uni); + } + } + } + # warn "$uni, $enc, $fb"; + } + } + $in_charmap or nit "Where is CHARMAP?"; + checkRT(); + printf ("$ARGV: %s error%s found\n", + ($nerror == 0 ? 'no' : $nerror), + ($nerror > 1 ? 's' : '')); +} + +exit; + +sub hex2enc{ + pack("C*", map {hex($_)} split(",", shift)); +} +sub hex2uni{ + join("", map { chr(hex($_)) } split(",", shift)); +} + +sub checkRT{ + for my $uni (keys %E2U){ + my $enc = $U2E{$uni} or next; # okay + $E2U{$U2E{$uni}} eq $uni or + nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}"; + } + for my $enc (keys %E2U){ + my $uni = $E2U{$enc} or next; # okay + $U2E{$E2U{$enc}} eq $enc or + nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}"; + } +} + + +sub uniparse{ + my $str = shift; + my @u; + push @u, $1 while($str =~ /\G/ig); + for my $u (@u){ + $u =~ /^([0-9A-Za-z]+)$/o + or nit "malformed Unicode character: $u"; + } + return join(',', @u); +} + +sub encparse{ + my $str = shift; + my @e; + for my $e (split /\\x/io, $str){ + $e or next; # first \x + $e =~ /^([0-9A-Za-z]{1,2})$/io + or nit "Hex $e in $str is bogus"; + push @e, $1; + } + return join(',', @e); +} + + + +__END__ + +UCM file looks like this. + + # + # Comments + # + "US-ascii" # Required + "ascii" # Optional + 1 # Required; usually 1 + 1 # Max. # of bytes/char + \x3F # Substitution char + # + CHARMAP + \x00 |0 # + \x01 |0 # + \x02 |0 # + .... + \x7C |0 # VERTICAL LINE + \x7D |0 # RIGHT CURLY BRACKET + \x7E |0 # TILDE + \x7F |0 # + END CHARMAP + diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index 56e75ea..e60c889 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -1,7 +1,7 @@ package Encode::Alias; use strict; use Encode; -our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.27 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require Exporter; @@ -9,7 +9,7 @@ our @ISA = qw(Exporter); # Public, encouraged API is exported by default -our @EXPORT = +our @EXPORT = qw ( define_alias find_alias @@ -156,11 +156,11 @@ sub init_aliases # has been redefined as the euro symbol.) define_alias( qr/^(.+)\@euro$/i => '"$1"' ); - define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i + define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => '"iso-8859-$Encode::Alias::Latin2iso[$1]"' ); define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| - hebrew|arabic|baltic|vietnamese)$/ix => + hebrew|arabic|baltic|vietnamese)$/ix => '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); # Common names for non-latin prefered MIME names @@ -175,7 +175,7 @@ sub init_aliases # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. # And Microsoft has their own naming (again, surprisingly). - # And windows-* is registered in IANA! + # And windows-* is registered in IANA! define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"'); # Sometimes seen with a leading zero. @@ -187,7 +187,7 @@ sub init_aliases define_alias( qr/^mac_(.*)$/i => '"mac$1"'); # Ououououou. gone. They are differente! # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); - + # Standardize on the dashed versions. # define_alias( qr/\butf8$/i => 'utf-8' ); define_alias( qr/\bkoi8r$/i => 'koi8-r' ); @@ -255,7 +255,7 @@ Encode::Alias - alias definitions to encodings =head1 DESCRIPTION Allows newName to be used as an alias for ENCODING. ENCODING may be -either the name of an encoding or an encoding object (as described +either the name of an encoding or an encoding object (as described in L). Currently I can be specified in the following ways: diff --git a/ext/Encode/lib/Encode/Encoder.pm b/ext/Encode/lib/Encode/Encoder.pm index 68cc65c..33a3bd5 100644 --- a/ext/Encode/lib/Encode/Encoder.pm +++ b/ext/Encode/lib/Encode/Encoder.pm @@ -1,5 +1,5 @@ # -# $Id: Encoder.pm,v 0.2 2002/04/08 18:08:07 dankogai Exp dankogai $ +# $Id: Encoder.pm,v 0.2 2002/04/08 18:08:07 dankogai Exp $ # package Encode::Encoder; use strict; diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index 0517f1a..132e5a9 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -592,7 +592,7 @@ Microsoft's understanding of C. JIS has not endorsed the full Microsoft standard however. The official C includes only JIS X 0201 and JIS X 0208 subsets, while Microsoft has always been meaning C to -encode a wider character repertoire, see C registration for +encode a wider character repertoire. See C registration for C. As a historical predecessor Microsoft's variant @@ -600,7 +600,7 @@ probably has more rights for the name, albeit it may be objected that Microsoft shouldn't have used JIS as part of the name in the first place. -Unabiguous name: C. C name (not used?): C. +Unambiguous name: C. C name (not used?): C. Encode separately supports C and C. diff --git a/ext/Encode/lib/Encode/Unicode.pm b/ext/Encode/lib/Encode/Unicode.pm index 7f1ad53..2686df7 100644 --- a/ext/Encode/lib/Encode/Unicode.pm +++ b/ext/Encode/lib/Encode/Unicode.pm @@ -3,7 +3,7 @@ package Encode::Unicode; use strict; use warnings; -our $VERSION = do { my @r = (q$Revision: 1.28 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.29 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # # Aux. subs & constants @@ -15,11 +15,9 @@ sub BOM16LE(){ 0xFFFe } sub BOM32LE(){ 0xFFFe0000 } sub valid_ucs2($){ - if ($_[0] < 0xD800){ - return $_[0] > 0; - }else{ - return ($_[0] > 0xDFFF && $_[0] <= 0xFFFF); - } + return + (0 <= $_[0] && $_[0] < 0xD800) + || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF); } sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } @@ -74,9 +72,9 @@ sub name { shift->{'Name'} } sub new_sequence { $_[0] }; # -# the two implementation of (en|de)code exist. *_modern use -# array and *_classic stick with substr. *_classic is much -# slower but more memory conservative. *_moder is default. +# two implementation of (en|de)code exist. *_modern use +# an array and *_classic stick with substr. *_classic is much +# slower but more memory conservative. *_modern is default. sub set_transcoder{ no warnings qw(redefine); @@ -88,7 +86,7 @@ sub set_transcoder{ *decode = \&decode_classic; *encode = \&encode_classic; }else{ - require Carp; + require Carp; Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)"; } } @@ -115,7 +113,7 @@ sub decode_modern my $ord = shift @ord; unless ($size == 4 or valid_ucs2($ord &= $mask)){ if ($ucs2){ - $chk and + $chk and poisoned2death($obj, "no surrogates allowed", $ord); shift @ord; # skip the next one as well $ord = FBCHAR; @@ -151,12 +149,12 @@ sub encode_modern unless ($size == 4 or valid_ucs2($ord)) { unless(issurrogate($ord)){ if ($ucs2){ - $chk and + $chk and poisoned2death($obj, "code point too high", $ord); push @str, FBCHAR; }else{ - + push @str, ensurrogate($ord); } }else{ # not supposed to happen @@ -188,7 +186,7 @@ sub decode_classic my $ord = unpack($endian, substr($str, 0, $size, '')); unless ($size == 4 or valid_ucs2($ord &= $mask)){ if ($ucs2){ - $chk and + $chk and poisoned2death($obj, "no surrogates allowed", $ord); substr($str,0,$size,''); # skip the next one as well $ord = FBCHAR; @@ -224,7 +222,7 @@ sub encode_classic unless ($size == 4 or valid_ucs2($ord)) { unless(issurrogate($ord)){ if ($ucs2){ - $chk and + $chk and poisoned2death($obj, "code point too high", $ord); $str .= pack($endian, FBCHAR); }else{ @@ -244,7 +242,7 @@ sub BOMB { my ($size, $bom) = @_; my $N = $size == 2 ? 'n' : 'N'; my $ord = unpack($N, $bom); - return ($ord eq BOM_BE) ? $N : + return ($ord eq BOM_BE) ? $N : ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef; } @@ -267,7 +265,7 @@ Encode::Unicode -- Various Unicode Transform Format =head1 SYNOPSIS - use Encode qw/encode decode/; + use Encode qw/encode decode/; $ucs2 = encode("UCS-2BE", $utf8); $utf8 = decode("UCS-2BE", $ucs2); @@ -311,26 +309,26 @@ Endianness, and Byte Order Mark. =head2 by Size UCS-2 is a fixed-length encoding with each character taking 16 bits. -It B support I. When surrogate pair is -encountered during decode(), it fills its place with \xFFFD without -I or croaks if I. When a character which ord value is -larger than 0xFFFF, it uses 0xFFFD without I or croaks if -. +It B support I. When a surrogate pair is +encountered during decode(), its place is filled with \xFFFD without +I or croaks if I. When a character whose ord value is +larger than 0xFFFF is encountered, it uses 0xFFFD without I or +croaks if . -UTF-16 is almost the same as UCS-2 but it supports I. +UTF-16 is almost the same as UCS-2 but it supports I. When it encounters a high surrogate (0xD800-0xDBFF), it fetches the -following low surrogate (0xDC00-0xDFFF), C them to form a +following low surrogate (0xDC00-0xDFFF), Cs them to form a character. Bogus surrogates result in death. When \x{10000} or above -is encountered during encode(), it Cs them and push the +is encountered during encode(), it Cs them and pushes the surrogate pair to the output stream. UTF-32 is a fixed-length encoding with each character taking 32 bits. -Since it is 32-bit there is no need for I. +Since it is 32-bit there is no need for I. =head2 by Endianness First (and now failed) goal of Unicode was to map all character -repartories into a fixed-length integer so programmers are happy. +repertories into a fixed-length integer so programmers are happy. Since each character is either I or I in C, you have to put endianness of each platform when you pass data to one another. @@ -340,16 +338,16 @@ called Byte Order Mark (BOM) is prepended to the head of string. =over 4 -=item BOM as integer +=item BOM as integer when fetched in network byte order - 16 32 bits/char -------------------------- -BE 0xFeFF 0x0000FeFF -LE 0xFFeF 0xFFFe0000 -------------------------- + 16 32 bits/char + ------------------------- + BE 0xFeFF 0x0000FeFF + LE 0xFFeF 0xFFFe0000 + ------------------------- =back - + This modules handles BOM as follows. =over 4 @@ -363,7 +361,7 @@ simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE). When BE or LE is omitted during decode(), it checks if BOM is in the beginning of the string and if found endianness is set to what BOM -says. if not found, dies. +says. If not found, dies. =item * @@ -379,21 +377,22 @@ UCS-2 is already registered by IANA and others that way. =back -=head1 The Surrogate Pair +=head1 Surrogate Pairs -To say the least, surrogate pair was the biggest mistake by Unicode -Consortium. I don't give a darn if they admit it or not. But -according to late Douglas Adams in I Triology, C. Their mistake was not this magnitude so let's forgive them. +To say the least, surrogate pairs were the biggest mistake of the +Unicode Consortium. But according to the late Douglas Adams in I Trilogy, C. Their mistake was not of this +magnitude so let's forgive them. (I don't dare make any comparison with Unicode Consortium and the Vogons here ;) Or, comparing Encode to Babel Fish is completely appropriate -- if you can only stick this into your ear :) -A surrogate pair was born when Unicode Consortium had finally -admitted that 16 bit was not big enough to hold all the world's -character repartorie. But they have already made UCS-2 16-bit. What +Surrogate pairs were born when Unicode Consortium finally +admitted that 16 bits were not big enough to hold all the world's +character repertoire. But they have already made UCS-2 16-bit. What do we do? Back then 0xD800-0xDFFF was not allocated. Let's split them half and @@ -402,7 +401,7 @@ latter C. That way you can represent 1024 * 1024 = 1048576 more characters. Now we can store character ranges up to \x{10ffff} even with 16-bit encodings. This pair of half-character is now called a I and UTF-16 is the -name of encoding that embraces them. +name of the encoding that embraces them. Here is a fomula to ensurrogate a Unicode character \x{10000} and above; @@ -414,8 +413,14 @@ And to desurrogate; $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); -Note this move has made \x{D800}-\x{DFFF} forbidden zone but perl -does not prohibit them for uses. +Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but +perl does not prohibit the use of characters within this range. To perl, +every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I. + + (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit + integer support! (**) + + (**) Is anything beyond \x{11_0000} still Unicode :? =head1 SEE ALSO @@ -425,4 +430,8 @@ RFC 2781 L, L +Ch. 15, pp. 403 of C +by Larry Wall, Tom Christiansen, Jon Orwant; +O'Reilly & Associates; ISBN 0-596-00027-8 + =cut diff --git a/ext/Encode/t/bogus.ucm b/ext/Encode/t/bogus.ucm new file mode 100644 index 0000000..6eb2f72 --- /dev/null +++ b/ext/Encode/t/bogus.ucm @@ -0,0 +1,384 @@ +# +# $Id: bogus.ucm,v 1.1 2002/04/09 20:06:15 dankogai Exp dankogai $ +# +# based upon euc-jp +# + "euc-bogus" + "euc-bogus" # error 1 + 3 # error 2 + 1 + \x3F +# +CHARMAP +# +# ASCII +# + \x00 |0 # + \x01 |0 # + \x02 |0 # + \x03 |0 # + \x04 |0 # + \x05 |0 # + \x06 |0 # + \x07 |0 # + \x08 |0 # + \x09 |0 # + \x0A |0 # + \x0B |0 # + \x0C |0 # + \x0D |0 # + \x0E |0 # + \x0F |0 # + \x10 |0 # + \x11 |0 # + \x12 |0 # + \x13 |0 # + \x14 |0 # + \x15 |0 # + \x16 |0 # + \x17 |0 # + \x18 |0 # + \x19 |0 # + \x1A |0 # + \x1B |0 # + \x1C |0 # + \x1D |0 # + \x1E |0 # + \x1F |0 # + \x20 |0 # SPACE + \x21 |0 # EXCLAMATION MARK + \x22 |0 # QUOTATION MARK + \x23 |0 # NUMBER SIGN + \x24 |0 # DOLLAR SIGN + \x25 |0 # PERCENT SIGN + \x26 |0 # AMPERSAND + \x27 |0 # APOSTROPHE + \x28 |0 # LEFT PARENTHESIS + \x29 |0 # RIGHT PARENTHESIS + \x2A |0 # ASTERISK + \x2B |0 # PLUS SIGN + \x2C |0 # COMMA + \x2D |0 # HYPHEN-MINUS + \x2E |0 # FULL STOP + \x2F |0 # SOLIDUS + \x30 |0 # DIGIT ZERO + \x31 |0 # DIGIT ONE + \x32 |0 # DIGIT TWO + \x33 |0 # DIGIT THREE + \x34 |0 # DIGIT FOUR + \x35 |0 # DIGIT FIVE + \x36 |0 # DIGIT SIX + \x37 |0 # DIGIT SEVEN + \x38 |0 # DIGIT EIGHT + \x39 |0 # DIGIT NINE + \x3A |0 # COLON + \x3B |0 # SEMICOLON + \x3C |0 # LESS-THAN SIGN + \x3D |0 # EQUALS SIGN + \x3E |0 # GREATER-THAN SIGN + \x3F |0 # QUESTION MARK + \x40 |0 # COMMERCIAL AT + \x41 |0 # LATIN CAPITAL LETTER A + \x42 |0 # LATIN CAPITAL LETTER B + \x43 |0 # LATIN CAPITAL LETTER C + \x44 |0 # LATIN CAPITAL LETTER D + \x45 |0 # LATIN CAPITAL LETTER E + \x46 |0 # LATIN CAPITAL LETTER F + \x47 |0 # LATIN CAPITAL LETTER G + \x48 |0 # LATIN CAPITAL LETTER H + \x49 |0 # LATIN CAPITAL LETTER I + \x4A |0 # LATIN CAPITAL LETTER J + \x4B |0 # LATIN CAPITAL LETTER K + \x4C |0 # LATIN CAPITAL LETTER L + \x4D |0 # LATIN CAPITAL LETTER M + \x4E |0 # LATIN CAPITAL LETTER N + \x4F |0 # LATIN CAPITAL LETTER O + \x50 |0 # LATIN CAPITAL LETTER P + \x51 |0 # LATIN CAPITAL LETTER Q + \x52 |0 # LATIN CAPITAL LETTER R + \x53 |0 # LATIN CAPITAL LETTER S + \x54 |0 # LATIN CAPITAL LETTER T + \x55 |0 # LATIN CAPITAL LETTER U + \x56 |0 # LATIN CAPITAL LETTER V + \x57 |0 # LATIN CAPITAL LETTER W + \x58 |0 # LATIN CAPITAL LETTER X + \x59 |0 # LATIN CAPITAL LETTER Y + \x5A |0 # LATIN CAPITAL LETTER Z + \x5B |0 # LEFT SQUARE BRACKET + \x5C |0 # REVERSE SOLIDUS + \x5D |0 # RIGHT SQUARE BRACKET + \x5E |0 # CIRCUMFLEX ACCENT + \x5F |0 # LOW LINE + \x60 |0 # GRAVE ACCENT + \x61 |0 # LATIN SMALL LETTER A + \x62 |0 # LATIN SMALL LETTER B + \x63 |0 # LATIN SMALL LETTER C + \x64 |0 # LATIN SMALL LETTER D + \x65 |0 # LATIN SMALL LETTER E + \x66 |0 # LATIN SMALL LETTER F + \x67 |0 # LATIN SMALL LETTER G + \x68 |0 # LATIN SMALL LETTER H + \x69 |0 # LATIN SMALL LETTER I + \x6A |0 # LATIN SMALL LETTER J + \x6B |0 # LATIN SMALL LETTER K + \x6C |0 # LATIN SMALL LETTER L + \x6D |0 # LATIN SMALL LETTER M + \x6E |0 # LATIN SMALL LETTER N + \x6F |0 # LATIN SMALL LETTER O + \x70 |0 # LATIN SMALL LETTER P + \x71 |0 # LATIN SMALL LETTER Q + \x72 |0 # LATIN SMALL LETTER R + \x73 |0 # LATIN SMALL LETTER S + \x74 |0 # LATIN SMALL LETTER T + \x75 |0 # LATIN SMALL LETTER U + \x76 |0 # LATIN SMALL LETTER V + \x77 |0 # LATIN SMALL LETTER W + \x78 |0 # LATIN SMALL LETTER X + \x79 |0 # LATIN SMALL LETTER Y + \x7A |0 # LATIN SMALL LETTER Z + \x7B |0 # LEFT CURLY BRACKET + \x7C |0 # VERTICAL LINE + \x7D |0 # RIGHT CURLY BRACKET + \x7E |0 # TILDE + \x7F |0 # +# +# jisx0201-1978 +# + \x8E\xA1 |0 # HALFWIDTH IDEOGRAPHIC FULL STOP + \x8E\xA2 |0 # HALFWIDTH LEFT CORNER BRACKET + \x8E\xA3 |0 # HALFWIDTH RIGHT CORNER BRACKET + \x8E\xA4 |0 # HALFWIDTH IDEOGRAPHIC COMMA + \x8E\xA5 |0 # HALFWIDTH KATAKANA MIDDLE DOT + \x8E\xA6 |0 # HALFWIDTH KATAKANA LETTER WO + \x8E\xA7 |0 # HALFWIDTH KATAKANA LETTER SMALL A + \x8E\xA8 |0 # HALFWIDTH KATAKANA LETTER SMALL I + \x8E\xA9 |0 # HALFWIDTH KATAKANA LETTER SMALL U + \x8E\xAA |0 # HALFWIDTH KATAKANA LETTER SMALL E + \x8E\xAB |0 # HALFWIDTH KATAKANA LETTER SMALL O + \x8E\xAC |0 # HALFWIDTH KATAKANA LETTER SMALL YA + \x8E\xAD |0 # HALFWIDTH KATAKANA LETTER SMALL YU + \x8E\xAE |0 # HALFWIDTH KATAKANA LETTER SMALL YO + \x8E\xAF |0 # HALFWIDTH KATAKANA LETTER SMALL TU + \x8E\xB0 |0 # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK + \x8E\xB1 |0 # HALFWIDTH KATAKANA LETTER A + \x8E\xB2 |0 # HALFWIDTH KATAKANA LETTER I + \x8E\xB3 |0 # HALFWIDTH KATAKANA LETTER U + \x8E\xB4 |0 # HALFWIDTH KATAKANA LETTER E + \x8E\xB5 |0 # HALFWIDTH KATAKANA LETTER O + \x8E\xB6 |0 # HALFWIDTH KATAKANA LETTER KA + \x8E\xB7 |0 # HALFWIDTH KATAKANA LETTER KI + \x8E\xB8 |0 # HALFWIDTH KATAKANA LETTER KU + \x8E\xB9 |0 # HALFWIDTH KATAKANA LETTER KE + \x8E\xBA |0 # HALFWIDTH KATAKANA LETTER KO + \x8E\xBB |0 # HALFWIDTH KATAKANA LETTER SA + \x8E\xBC |0 # HALFWIDTH KATAKANA LETTER SI + \x8E\xBD |0 # HALFWIDTH KATAKANA LETTER SU + \x8E\xBE |0 # HALFWIDTH KATAKANA LETTER SE + \x8E\xBF |0 # HALFWIDTH KATAKANA LETTER SO + \x8E\xC0 |0 # HALFWIDTH KATAKANA LETTER TA + \x8E\xC1 |0 # HALFWIDTH KATAKANA LETTER TI + \x8E\xC2 |0 # HALFWIDTH KATAKANA LETTER TU + \x8E\xC3 |0 # HALFWIDTH KATAKANA LETTER TE + \x8E\xC4 |0 # HALFWIDTH KATAKANA LETTER TO + \x8E\xC5 |0 # HALFWIDTH KATAKANA LETTER NA + \x8E\xC6 |0 # HALFWIDTH KATAKANA LETTER NI + \x8E\xC7 |0 # HALFWIDTH KATAKANA LETTER NU + \x8E\xC8 |0 # HALFWIDTH KATAKANA LETTER NE + \x8E\xC9 |0 # HALFWIDTH KATAKANA LETTER NO + \x8E\xCA |0 # HALFWIDTH KATAKANA LETTER HA + \x8E\xCB |0 # HALFWIDTH KATAKANA LETTER HI + \x8E\xCC |0 # HALFWIDTH KATAKANA LETTER HU + \x8E\xCD |0 # HALFWIDTH KATAKANA LETTER HE + \x8E\xCE |0 # HALFWIDTH KATAKANA LETTER HO + \x8E\xCF |0 # HALFWIDTH KATAKANA LETTER MA + \x8E\xD0 |0 # HALFWIDTH KATAKANA LETTER MI + \x8E\xD1 |0 # HALFWIDTH KATAKANA LETTER MU + \x8E\xD2 |0 # HALFWIDTH KATAKANA LETTER ME + \x8E\xD3 |0 # HALFWIDTH KATAKANA LETTER MO + \x8E\xD4 |0 # HALFWIDTH KATAKANA LETTER YA + \x8E\xD5 |0 # HALFWIDTH KATAKANA LETTER YU + \x8E\xD6 |0 # HALFWIDTH KATAKANA LETTER YO + \x8E\xD7 |0 # HALFWIDTH KATAKANA LETTER RA + \x8E\xD8 |0 # HALFWIDTH KATAKANA LETTER RI + \x8E\xD9 |0 # HALFWIDTH KATAKANA LETTER RU + \x8E\xDA |0 # HALFWIDTH KATAKANA LETTER RE + \x8E\xDB |0 # HALFWIDTH KATAKANA LETTER RO + \x8E\xDC |0 # HALFWIDTH KATAKANA LETTER WA + \x8E\xDD |0 # HALFWIDTH KATAKANA LETTER N + \x8E\xDE |0 # HALFWIDTH KATAKANA VOICED SOUND MARK + \x8E\xDF |0 # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +# +# jisx0208-1990, just a part of it +# + \xA1\xA1 |0 # IDEOGRAPHIC SPACE + \xA1\xA2 |0 # IDEOGRAPHIC COMMA + \xA1\xA3 |0 # IDEOGRAPHIC FULL STOP + \xA1\xA4 |0 # FULLWIDTH COMMA + \xA1\xA5 |0 # FULLWIDTH FULL STOP + \xA1\xA6 |0 # KATAKANA MIDDLE DOT + \xA1\xA7 |0 # FULLWIDTH COLON + \xA1\xA8 |0 # FULLWIDTH SEMICOLON + \xA1\xA9 |0 # FULLWIDTH QUESTION MARK + \xA1\xAA |0 # FULLWIDTH EXCLAMATION MARK + \xA1\xAB |0 # KATAKANA-HIRAGANA VOICED SOUND MARK + \xA1\xAC |0 # KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + \xA1\xAD |0 # ACUTE ACCENT + \xA1\xAE |0 # FULLWIDTH GRAVE ACCENT + \xA1\xAF |0 # DIAERESIS + \xA1\xB0 |0 # FULLWIDTH CIRCUMFLEX ACCENT + \xA1\xB1 |0 # FULLWIDTH MACRON + \xA1\xB2 |0 # FULLWIDTH LOW LINE + \xA1\xB3 |0 # KATAKANA ITERATION MARK + \xA1\xB4 |0 # KATAKANA VOICED ITERATION MARK + \xA1\xB5 |0 # HIRAGANA ITERATION MARK + \xA1\xB6 |0 # HIRAGANA VOICED ITERATION MARK + \xA1\xB7 |0 # DITTO MARK + \xA1\xB8 |0 # CJK Ideograph + \xA1\xB9 |0 # IDEOGRAPHIC ITERATION MARK + \xA1\xBA |0 # IDEOGRAPHIC CLOSING MARK + \xA1\xBB |0 # IDEOGRAPHIC NUMBER ZERO + \xA1\xBC |0 # KATAKANA-HIRAGANA PROLONGED SOUND MARK + \xA1\xBD |0 # HORIZONTAL BAR + \xA1\xBE |0 # HYPHEN + \xA1\xBF |0 # FULLWIDTH SOLIDUS + \xA1\xC0 |0 # FULLWIDTH REVERSE SOLIDUS + \xA1\xC1 |0 # WAVE DASH + \xA1\xC2 |0 # DOUBLE VERTICAL LINE + \xA1\xC3 |0 # FULLWIDTH VERTICAL LINE + \xA1\xC4 |0 # HORIZONTAL ELLIPSIS + \xA1\xC5 |0 # TWO DOT LEADER + \xA1\xC6 |0 # LEFT SINGLE QUOTATION MARK + \xA1\xC7 |0 # RIGHT SINGLE QUOTATION MARK + \xA1\xC8 |0 # LEFT DOUBLE QUOTATION MARK + \xA1\xC9 |0 # RIGHT DOUBLE QUOTATION MARK + \xA1\xCA |0 # FULLWIDTH LEFT PARENTHESIS + \xA1\xCB |0 # FULLWIDTH RIGHT PARENTHESIS + \xA1\xCC |0 # LEFT TORTOISE SHELL BRACKET + \xA1\xCD |0 # RIGHT TORTOISE SHELL BRACKET + \xA1\xCE |0 # FULLWIDTH LEFT SQUARE BRACKET + \xA1\xCF |0 # FULLWIDTH RIGHT SQUARE BRACKET + \xA1\xD0 |0 # FULLWIDTH LEFT CURLY BRACKET + \xA1\xD1 |0 # FULLWIDTH RIGHT CURLY BRACKET + \xA1\xD2 |0 # LEFT ANGLE BRACKET + \xA1\xD3 |0 # RIGHT ANGLE BRACKET + \xA1\xD4 |0 # LEFT DOUBLE ANGLE BRACKET + \xA1\xD5 |0 # RIGHT DOUBLE ANGLE BRACKET + \xA1\xD6 |0 # LEFT CORNER BRACKET + \xA1\xD7 |0 # RIGHT CORNER BRACKET + \xA1\xD8 |0 # LEFT WHITE CORNER BRACKET + \xA1\xD9 |0 # RIGHT WHITE CORNER BRACKET + \xA1\xDA |0 # LEFT BLACK LENTICULAR BRACKET + \xA1\xDB |0 # RIGHT BLACK LENTICULAR BRACKET + \xA1\xDC |0 # FULLWIDTH PLUS SIGN + \xA1\xDD |0 # MINUS SIGN + \xA1\xDE |0 # PLUS-MINUS SIGN + \xA1\xDF |0 # MULTIPLICATION SIGN + \xA1\xE0 |0 # DIVISION SIGN + \xA1\xE1 |0 # FULLWIDTH EQUALS SIGN + \xA1\xE2 |0 # NOT EQUAL TO + \xA1\xE3 |0 # FULLWIDTH LESS-THAN SIGN + \xA1\xE4 |0 # FULLWIDTH GREATER-THAN SIGN + \xA1\xE5 |0 # LESS-THAN OVER EQUAL TO + \xA1\xE6 |0 # GREATER-THAN OVER EQUAL TO + \xA1\xE7 |0 # INFINITY + \xA1\xE8 |0 # THEREFORE + \xA1\xE9 |0 # MALE SIGN + \xA1\xEA |0 # FEMALE SIGN + \xA1\xEB |0 # DEGREE SIGN + \xA1\xEC |0 # PRIME + \xA1\xED |0 # DOUBLE PRIME + \xA1\xEE |0 # DEGREE CELSIUS + \xA1\xEF |0 # FULLWIDTH YEN SIGN + \xA1\xF0 |0 # FULLWIDTH DOLLAR SIGN + \xA1\xF1 |0 # CENT SIGN + \xA1\xF2 |0 # POUND SIGN + \xA1\xF3 |0 # FULLWIDTH PERCENT SIGN + \xA1\xF4 |0 # FULLWIDTH NUMBER SIGN + \xA1\xF5 |0 # FULLWIDTH AMPERSAND + \xA1\xF6 |0 # FULLWIDTH ASTERISK + \xA1\xF7 |0 # FULLWIDTH COMMERCIAL AT + \xA1\xF8 |0 # SECTION SIGN + \xA1\xF9 |0 # WHITE STAR + \xA1\xFA |0 # BLACK STAR + \xA1\xFB |0 # WHITE CIRCLE + \xA1\xFC |0 # BLACK CIRCLE + \xA1\xFD |0 # BULLSEYE + \xA1\xFE |0 # WHITE DIAMOND + \xA2\xA1 |0 # BLACK DIAMOND + \xA2\xA2 |0 # WHITE SQUARE + \xA2\xA3 |0 # BLACK SQUARE + \xA2\xA4 |0 # WHITE UP-POINTING TRIANGLE + \xA2\xA5 |0 # BLACK UP-POINTING TRIANGLE + \xA2\xA6 |0 # WHITE DOWN-POINTING TRIANGLE + \xA2\xA7 |0 # BLACK DOWN-POINTING TRIANGLE + \xA2\xA8 |0 # REFERENCE MARK + \xA2\xA9 |0 # POSTAL MARK + \xA2\xAA |0 # RIGHTWARDS ARROW + \xA2\xAB |0 # LEFTWARDS ARROW + \xA2\xAC |0 # UPWARDS ARROW + \xA2\xAD |0 # DOWNWARDS ARROW + \xA2\xAE |0 # GETA MARK + \xA2\xBA |0 # ELEMENT OF + \xA2\xBB |0 # CONTAINS AS MEMBER + \xA2\xBC |0 # SUBSET OF OR EQUAL TO + \xA2\xBD |0 # SUPERSET OF OR EQUAL TO + \xA2\xBE |0 # SUBSET OF + \xA2\xBF |0 # SUPERSET OF + \xA2\xC0 |0 # UNION + \xA2\xC1 |0 # INTERSECTION + \xA2\xCA |0 # LOGICAL AND + \xA2\xCB |0 # LOGICAL OR + \xA2\xCC |0 # NOT SIGN + \xA2\xCD |0 # RIGHTWARDS DOUBLE ARROW + \xA2\xCE |0 # LEFT RIGHT DOUBLE ARROW + \xA2\xCF |0 # FOR ALL + \xA2\xD0 |0 # THERE EXISTS + \xA2\xDC |0 # ANGLE + \xA2\xDD |0 # UP TACK + \xA2\xDE |0 # ARC + \xA2\xDF |0 # PARTIAL DIFFERENTIAL + \xA2\xE0 |0 # NABLA + \xA2\xE1 |0 # IDENTICAL TO + \xA2\xE2 |0 # APPROXIMATELY EQUAL TO OR THE IMAGE OF + \xA2\xE3 |0 # MUCH LESS-THAN + \xA2\xE4 |0 # MUCH GREATER-THAN + \xA2\xE5 |0 # SQUARE ROOT + \xA2\xE6 |0 # REVERSED TILDE + \xA2\xE7 |0 # PROPORTIONAL TO + \xA2\xE8 |0 # BECAUSE + \xA2\xE9 |0 # INTEGRAL + \xA2\xEA |0 # DOUBLE INTEGRAL + \xA2\xF2 |0 # ANGSTROM SIGN + \xA2\xF3 |0 # PER MILLE SIGN + \xA2\xF4 |0 # MUSIC SHARP SIGN + \xA2\xF5 |0 # MUSIC FLAT SIGN + \xA2\xF6 |0 # EIGHTH NOTE + \xA2\xF7 |0 # DAGGER + \xA2\xF8 |0 # DOUBLE DAGGER + \xA2\xF9 |0 # PILCROW SIGN + \xA2\xFE |0 # LARGE CIRCLE +# +# jisx0212-1990, just part of it +# + \x8F\xA2\xAF |0 # BREVE + \x8F\xA2\xB0 |0 # CARON (Mandarin Chinese third tone) + \x8F\xA2\xB1 |0 # CEDILLA + \x8F\xA2\xB2 |0 # DOT ABOVE (Mandarin Chinese light tone) + \x8F\xA2\xB3 |0 # DOUBLE ACUTE ACCENT + \x8F\xA2\xB4 |0 # MACRON + \x8F\xA2\xB5 |0 # OGONEK + \x8F\xA2\xB6 |0 # RING ABOVE + \x8F\xA2\xB7 |0 # TILDE -- deliberately cause error + \x8F\xA2\xB8 |0 # GREEK TONOS + \x8F\xA2\xB9 |0 # GREEK DIALYTIKA TONOS + \x8F\xA2\xC2 |0 # INVERTED EXCLAMATION MARK + \x8F\xA2\xC3 |0 # BROKEN BAR + \x8F\xA2\xC4 |0 # INVERTED QUESTION MARK + \x8F\xA2\xEB |0 # MASCULINE ORDINAL INDICATOR + \x8F\xA2\xEC |0 # FEMININE ORDINAL INDICATOR + \x8F\xA2\xED |0 # COPYRIGHT SIGN + \x8F\xA2\xEE |0 # REGISTERED SIGN + \x8F\xA2\xEF |0 # TRADE MARK SIGN + \x8F\xA2\xF0 |0 # CURRENCY SIGN + \x8F\xA2\xF1 |0 # NUMERO SIGN +END CHARMAP diff --git a/ext/Encode/ucm/adobeStdenc.ucm b/ext/Encode/ucm/adobeStdenc.ucm index 1326369..52a757f 100644 --- a/ext/Encode/ucm/adobeStdenc.ucm +++ b/ext/Encode/ucm/adobeStdenc.ucm @@ -1,5 +1,5 @@ ## -# $Id: adobeStdenc.ucm,v 1.20 2002/04/04 19:50:53 dankogai Exp $ +# $Id: adobeStdenc.ucm,v 1.21 2002/04/09 20:06:15 dankogai Exp dankogai $ # # Original table can be obtained at # http://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/stdenc.txt @@ -137,7 +137,7 @@ CHARMAP \x7C |0 # VERTICAL LINE # bar \x7D |0 # RIGHT CURLY BRACKET # braceright \x7E |0 # TILDE # asciitilde - \x20 |0 # NO-BREAK SPACE # space + \x20 |1 # NO-BREAK SPACE # space \xA1 |0 # INVERTED EXCLAMATION MARK # exclamdown \xA2 |0 # CENT SIGN # cent \xA3 |0 # POUND SIGN # sterling @@ -147,7 +147,7 @@ CHARMAP \xC8 |0 # DIAERESIS # dieresis \xE3 |0 # FEMININE ORDINAL INDICATOR # ordfeminine \xAB |0 # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK # guillemotleft - \x2D |0 # SOFT HYPHEN # hyphen + \x2D |1 # SOFT HYPHEN # hyphen \xC5 |0 # MACRON # macron \xC2 |0 # ACUTE ACCENT # acute \xB6 |0 # PILCROW SIGN # paragraph @@ -169,7 +169,7 @@ CHARMAP \xA6 |0 # LATIN SMALL LETTER F WITH HOOK # florin \xC3 |0 # MODIFIER LETTER CIRCUMFLEX ACCENT # circumflex \xCF |0 # CARON # caron - \xC5 |0 # MODIFIER LETTER MACRON # macron + \xC5 |1 # MODIFIER LETTER MACRON # macron \xC6 |0 # BREVE # breve \xC7 |0 # DOT ABOVE # dotaccent \xCA |0 # RING ABOVE # ring @@ -192,8 +192,8 @@ CHARMAP \xAC |0 # SINGLE LEFT-POINTING ANGLE QUOTATION MARK # guilsinglleft \xAD |0 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK # guilsinglright \xA4 |0 # FRACTION SLASH # fraction - \xA4 |0 # DIVISION SLASH # fraction - \xB4 |0 # BULLET OPERATOR # periodcentered + \xA4 |1 # DIVISION SLASH # fraction + \xB4 |1 # BULLET OPERATOR # periodcentered \xAE |0 # LATIN SMALL LIGATURE FI # fi \xAF |0 # LATIN SMALL LIGATURE FL # fl END CHARMAP diff --git a/ext/Encode/ucm/adobeSymbol.ucm b/ext/Encode/ucm/adobeSymbol.ucm index 2d32f5d..e7b59c2 100644 --- a/ext/Encode/ucm/adobeSymbol.ucm +++ b/ext/Encode/ucm/adobeSymbol.ucm @@ -1,5 +1,5 @@ # -# $Id: adobeSymbol.ucm,v 1.20 2002/04/04 19:50:53 dankogai Exp $ +# $Id: adobeSymbol.ucm,v 1.21 2002/04/09 20:06:15 dankogai Exp dankogai $ # # Original table can be obtained at # http://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt @@ -74,7 +74,7 @@ CHARMAP \x7B |0 # LEFT CURLY BRACKET # braceleft \x7C |0 # VERTICAL LINE # bar \x7D |0 # RIGHT CURLY BRACKET # braceright - \x20 |0 # NO-BREAK SPACE # space + \x20 |1 # NO-BREAK SPACE # space \xD8 |0 # NOT SIGN # logicalnot \xB0 |0 # DEGREE SIGN # degree \xB1 |0 # PLUS-MINUS SIGN # plusminus @@ -117,7 +117,7 @@ CHARMAP \x69 |0 # GREEK SMALL LETTER IOTA # iota \x6B |0 # GREEK SMALL LETTER KAPPA # kappa \x6C |0 # GREEK SMALL LETTER LAMDA # lambda - \x6D |0 # GREEK SMALL LETTER MU # mu + \x6D |1 # GREEK SMALL LETTER MU # mu \x6E |0 # GREEK SMALL LETTER NU # nu \x78 |0 # GREEK SMALL LETTER XI # xi \x6F |0 # GREEK SMALL LETTER OMICRON # omicron @@ -144,7 +144,7 @@ CHARMAP \xC1 |0 # BLACK-LETTER CAPITAL I # Ifraktur \xC3 |0 # SCRIPT CAPITAL P # weierstrass \xC2 |0 # BLACK-LETTER CAPITAL R # Rfraktur - \x57 |0 # OHM SIGN # Omega + \x57 |1 # OHM SIGN # Omega \xC0 |0 # ALEF SYMBOL # aleph \xAC |0 # LEFTWARDS ARROW # arrowleft \xAD |0 # UPWARDS ARROW # arrowup @@ -161,7 +161,7 @@ CHARMAP \xB6 |0 # PARTIAL DIFFERENTIAL # partialdiff \x24 |0 # THERE EXISTS # existential \xC6 |0 # EMPTY SET # emptyset - \x44 |0 # INCREMENT # Delta + \x44 |1 # INCREMENT # Delta \xD1 |0 # NABLA # gradient \xCE |0 # ELEMENT OF # element \xCF |0 # NOT AN ELEMENT OF # notelement @@ -169,7 +169,7 @@ CHARMAP \xD5 |0 # N-ARY PRODUCT # product \xE5 |0 # N-ARY SUMMATION # summation \x2D |0 # MINUS SIGN # minus - \xA4 |0 # DIVISION SLASH # fraction + \xA4 |1 # DIVISION SLASH # fraction \x2A |0 # ASTERISK OPERATOR # asteriskmath \xD6 |0 # SQUARE ROOT # radical \xB5 |0 # PROPORTIONAL TO # proportional diff --git a/ext/Encode/ucm/adobeZdingbat.ucm b/ext/Encode/ucm/adobeZdingbat.ucm index c233cc1..18e68e7 100644 --- a/ext/Encode/ucm/adobeZdingbat.ucm +++ b/ext/Encode/ucm/adobeZdingbat.ucm @@ -1,5 +1,5 @@ # -# $Id: adobeZdingbat.ucm,v 1.20 2002/04/04 19:50:53 dankogai Exp $ +# $Id: adobeZdingbat.ucm,v 1.21 2002/04/09 20:06:15 dankogai Exp dankogai $ # # Original table can be obtained at # http://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt @@ -42,7 +42,7 @@ CHARMAP \x1E |0 # \x1F |0 # \x20 |0 # SPACE # space - \x20 |0 # NO-BREAK SPACE # space + \x20 |1 # NO-BREAK SPACE # space \xD5 |0 # RIGHTWARDS ARROW # a161 \xD6 |0 # LEFT RIGHT ARROW # a163 \xD7 |0 # UP DOWN ARROW # a164 diff --git a/ext/Encode/ucm/macDevanaga.ucm b/ext/Encode/ucm/macDevanaga.ucm deleted file mode 100644 index eb95722..0000000 --- a/ext/Encode/ucm/macDevanaga.ucm +++ /dev/null @@ -1,247 +0,0 @@ -# -# $Id: macDevanaga.ucm,v 1.20 2002/04/04 19:50:54 dankogai Exp $ -# -# Original table can be obtained at -# http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/DEVANAGA.TXT -# - "MacDevanagari" - "MacDevanaga" - 1 - 2 - \x3F -CHARMAP - \x00 |0 # - \x01 |0 # - \x02 |0 # - \x03 |0 # - \x04 |0 # - \x05 |0 # - \x06 |0 # - \x07 |0 # - \x08 |0 # - \x09 |0 # - \x0A |0 # - \x0B |0 # - \x0C |0 # - \x0D |0 # - \x0E |0 # - \x0F |0 # - \x10 |0 # - \x11 |0 # - \x12 |0 # - \x13 |0 # - \x14 |0 # - \x15 |0 # - \x16 |0 # - \x17 |0 # - \x18 |0 # - \x19 |0 # - \x1A |0 # - \x1B |0 # - \x1C |0 # - \x1D |0 # - \x1E |0 # - \x1F |0 # - \x20 |0 # SPACE - \x21 |0 # EXCLAMATION MARK - \x22 |0 # QUOTATION MARK - \x23 |0 # NUMBER SIGN - \x24 |0 # DOLLAR SIGN - \x25 |0 # PERCENT SIGN - \x26 |0 # AMPERSAND - \x27 |0 # APOSTROPHE - \x28 |0 # LEFT PARENTHESIS - \x29 |0 # RIGHT PARENTHESIS - \x2A |0 # ASTERISK - \x2B |0 # PLUS SIGN - \x2C |0 # COMMA - \x2D |0 # HYPHEN-MINUS - \x2E |0 # FULL STOP - \x2F |0 # SOLIDUS - \x30 |0 # DIGIT ZERO - \x31 |0 # DIGIT ONE - \x32 |0 # DIGIT TWO - \x33 |0 # DIGIT THREE - \x34 |0 # DIGIT FOUR - \x35 |0 # DIGIT FIVE - \x36 |0 # DIGIT SIX - \x37 |0 # DIGIT SEVEN - \x38 |0 # DIGIT EIGHT - \x39 |0 # DIGIT NINE - \x3A |0 # COLON - \x3B |0 # SEMICOLON - \x3C |0 # LESS-THAN SIGN - \x3D |0 # EQUALS SIGN - \x3E |0 # GREATER-THAN SIGN - \x3F |0 # QUESTION MARK - \x40 |0 # COMMERCIAL AT - \x41 |0 # LATIN CAPITAL LETTER A - \x42 |0 # LATIN CAPITAL LETTER B - \x43 |0 # LATIN CAPITAL LETTER C - \x44 |0 # LATIN CAPITAL LETTER D - \x45 |0 # LATIN CAPITAL LETTER E - \x46 |0 # LATIN CAPITAL LETTER F - \x47 |0 # LATIN CAPITAL LETTER G - \x48 |0 # LATIN CAPITAL LETTER H - \x49 |0 # LATIN CAPITAL LETTER I - \x4A |0 # LATIN CAPITAL LETTER J - \x4B |0 # LATIN CAPITAL LETTER K - \x4C |0 # LATIN CAPITAL LETTER L - \x4D |0 # LATIN CAPITAL LETTER M - \x4E |0 # LATIN CAPITAL LETTER N - \x4F |0 # LATIN CAPITAL LETTER O - \x50 |0 # LATIN CAPITAL LETTER P - \x51 |0 # LATIN CAPITAL LETTER Q - \x52 |0 # LATIN CAPITAL LETTER R - \x53 |0 # LATIN CAPITAL LETTER S - \x54 |0 # LATIN CAPITAL LETTER T - \x55 |0 # LATIN CAPITAL LETTER U - \x56 |0 # LATIN CAPITAL LETTER V - \x57 |0 # LATIN CAPITAL LETTER W - \x58 |0 # LATIN CAPITAL LETTER X - \x59 |0 # LATIN CAPITAL LETTER Y - \x5A |0 # LATIN CAPITAL LETTER Z - \x5B |0 # LEFT SQUARE BRACKET - \x5C |0 # REVERSE SOLIDUS - \x5D |0 # RIGHT SQUARE BRACKET - \x5E |0 # CIRCUMFLEX ACCENT - \x5F |0 # LOW LINE - \x60 |0 # GRAVE ACCENT - \x61 |0 # LATIN SMALL LETTER A - \x62 |0 # LATIN SMALL LETTER B - \x63 |0 # LATIN SMALL LETTER C - \x64 |0 # LATIN SMALL LETTER D - \x65 |0 # LATIN SMALL LETTER E - \x66 |0 # LATIN SMALL LETTER F - \x67 |0 # LATIN SMALL LETTER G - \x68 |0 # LATIN SMALL LETTER H - \x69 |0 # LATIN SMALL LETTER I - \x6A |0 # LATIN SMALL LETTER J - \x6B |0 # LATIN SMALL LETTER K - \x6C |0 # LATIN SMALL LETTER L - \x6D |0 # LATIN SMALL LETTER M - \x6E |0 # LATIN SMALL LETTER N - \x6F |0 # LATIN SMALL LETTER O - \x70 |0 # LATIN SMALL LETTER P - \x71 |0 # LATIN SMALL LETTER Q - \x72 |0 # LATIN SMALL LETTER R - \x73 |0 # LATIN SMALL LETTER S - \x74 |0 # LATIN SMALL LETTER T - \x75 |0 # LATIN SMALL LETTER U - \x76 |0 # LATIN SMALL LETTER V - \x77 |0 # LATIN SMALL LETTER W - \x78 |0 # LATIN SMALL LETTER X - \x79 |0 # LATIN SMALL LETTER Y - \x7A |0 # LATIN SMALL LETTER Z - \x7B |0 # LEFT CURLY BRACKET - \x7C |0 # VERTICAL LINE - \x7D |0 # RIGHT CURLY BRACKET - \x7E |0 # TILDE - \x88 |0 # COPYRIGHT SIGN - \x89 |0 # REGISTERED SIGN - \x80 |0 # MULTIPLICATION SIGN - \xA1 |0 # DEVANAGARI SIGN CANDRABINDU - \xA2 |0 # DEVANAGARI SIGN ANUSVARA - \xA3 |0 # DEVANAGARI SIGN VISARGA - \xA4 |0 # DEVANAGARI LETTER A - \xA5 |0 # DEVANAGARI LETTER AA - \xA6 |0 # DEVANAGARI LETTER I - \xA7 |0 # DEVANAGARI LETTER II - \xA8 |0 # DEVANAGARI LETTER U - \xA9 |0 # DEVANAGARI LETTER UU - \xAA |0 # DEVANAGARI LETTER VOCALIC R - \xA6\xE9 |1 # DEVANAGARI LETTER VOCALIC L - \xAE |0 # DEVANAGARI LETTER CANDRA E - \xAB |0 # DEVANAGARI LETTER SHORT E - \xAC |0 # DEVANAGARI LETTER E - \xAD |0 # DEVANAGARI LETTER AI - \xB2 |0 # DEVANAGARI LETTER CANDRA O - \xAF |0 # DEVANAGARI LETTER SHORT O - \xB0 |0 # DEVANAGARI LETTER O - \xB1 |0 # DEVANAGARI LETTER AU - \xB3 |0 # DEVANAGARI LETTER KA - \xB4 |0 # DEVANAGARI LETTER KHA - \xB5 |0 # DEVANAGARI LETTER GA - \xB6 |0 # DEVANAGARI LETTER GHA - \xB7 |0 # DEVANAGARI LETTER NGA - \xB8 |0 # DEVANAGARI LETTER CA - \xB9 |0 # DEVANAGARI LETTER CHA - \xBA |0 # DEVANAGARI LETTER JA - \xBB |0 # DEVANAGARI LETTER JHA - \xBC |0 # DEVANAGARI LETTER NYA - \xBD |0 # DEVANAGARI LETTER TTA - \xBE |0 # DEVANAGARI LETTER TTHA - \xBF |0 # DEVANAGARI LETTER DDA - \xC0 |0 # DEVANAGARI LETTER DDHA - \xC1 |0 # DEVANAGARI LETTER NNA - \xC2 |0 # DEVANAGARI LETTER TA - \xC3 |0 # DEVANAGARI LETTER THA - \xC4 |0 # DEVANAGARI LETTER DA - \xC5 |0 # DEVANAGARI LETTER DHA - \xC6 |0 # DEVANAGARI LETTER NA - \xC7 |0 # DEVANAGARI LETTER NNNA - \xC8 |0 # DEVANAGARI LETTER PA - \xC9 |0 # DEVANAGARI LETTER PHA - \xCA |0 # DEVANAGARI LETTER BA - \xCB |0 # DEVANAGARI LETTER BHA - \xCC |0 # DEVANAGARI LETTER MA - \xCD |0 # DEVANAGARI LETTER YA - \xCF |0 # DEVANAGARI LETTER RA - \xD0 |0 # DEVANAGARI LETTER RRA - \xD1 |0 # DEVANAGARI LETTER LA - \xD2 |0 # DEVANAGARI LETTER LLA - \xD3 |0 # DEVANAGARI LETTER LLLA - \xD4 |0 # DEVANAGARI LETTER VA - \xD5 |0 # DEVANAGARI LETTER SHA - \xD6 |0 # DEVANAGARI LETTER SSA - \xD7 |0 # DEVANAGARI LETTER SA - \xD8 |0 # DEVANAGARI LETTER HA - \xE9 |0 # DEVANAGARI SIGN NUKTA - \xEA\xE9 |1 # DEVANAGARI SIGN AVAGRAHA - \xDA |0 # DEVANAGARI VOWEL SIGN AA - \xDB |0 # DEVANAGARI VOWEL SIGN I - \xDC |0 # DEVANAGARI VOWEL SIGN II - \xDD |0 # DEVANAGARI VOWEL SIGN U - \xDE |0 # DEVANAGARI VOWEL SIGN UU - \xDF |0 # DEVANAGARI VOWEL SIGN VOCALIC R - \xDF\xE9 |1 # DEVANAGARI VOWEL SIGN VOCALIC RR - \xE3 |0 # DEVANAGARI VOWEL SIGN CANDRA E - \xE0 |0 # DEVANAGARI VOWEL SIGN SHORT E - \xE1 |0 # DEVANAGARI VOWEL SIGN E - \xE2 |0 # DEVANAGARI VOWEL SIGN AI - \xE7 |0 # DEVANAGARI VOWEL SIGN CANDRA O - \xE4 |0 # DEVANAGARI VOWEL SIGN SHORT O - \xE5 |0 # DEVANAGARI VOWEL SIGN O - \xE6 |0 # DEVANAGARI VOWEL SIGN AU - \xE8 |0 # DEVANAGARI SIGN VIRAMA # halant - \xE8\xE8 |1 # DEVANAGARI SIGN VIRAMA + ZWNJ # explicit halant - \xE8\xE9 |1 # DEVANAGARI SIGN VIRAMA + ZWJ # soft halant - \xA1\xE9 |1 # DEVANAGARI OM - \xCE |0 # DEVANAGARI LETTER YYA - \xAA\xE9 |1 # DEVANAGARI LETTER VOCALIC RR - \xA7\xE9 |1 # DEVANAGARI LETTER VOCALIC LL - \xDB\xE9 |1 # DEVANAGARI VOWEL SIGN VOCALIC L - \xDC\xE9 |1 # DEVANAGARI VOWEL SIGN VOCALIC LL - \xEA |0 # DEVANAGARI DANDA - \x90 |0 # DEVANAGARI DOUBLE DANDA - \xF1 |0 # DEVANAGARI DIGIT ZERO - \xF2 |0 # DEVANAGARI DIGIT ONE - \xF3 |0 # DEVANAGARI DIGIT TWO - \xF4 |0 # DEVANAGARI DIGIT THREE - \xF5 |0 # DEVANAGARI DIGIT FOUR - \xF6 |0 # DEVANAGARI DIGIT FIVE - \xF7 |0 # DEVANAGARI DIGIT SIX - \xF8 |0 # DEVANAGARI DIGIT SEVEN - \xF9 |0 # DEVANAGARI DIGIT EIGHT - \xFA |0 # DEVANAGARI DIGIT NINE - \x91 |0 # DEVANAGARI ABBREVIATION SIGN - \xD9 |0 # LEFT-TO-RIGHT MARK # invisible consonant - \x82 |0 # EN DASH - \x83 |0 # EM DASH - \x84 |0 # LEFT SINGLE QUOTATION MARK - \x85 |0 # RIGHT SINGLE QUOTATION MARK - \x87 |0 # BULLET - \x86 |0 # HORIZONTAL ELLIPSIS - \x8A |0 # TRADE MARK SIGN - \x81 |0 # MINUS SIGN -END CHARMAP diff --git a/ext/Encode/ucm/macGujarati.ucm b/ext/Encode/ucm/macGujarati.ucm deleted file mode 100644 index b1cd081..0000000 --- a/ext/Encode/ucm/macGujarati.ucm +++ /dev/null @@ -1,232 +0,0 @@ -# -# $Id: macGujarati.ucm,v 1.20 2002/04/04 19:50:54 dankogai Exp $ -# -# Original table can be obtained at -# http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/GUJARATI.TXT -# - "MacGujarati" - 1 - 2 - \x3F -CHARMAP - \x00 |0 # - \x01 |0 # - \x02 |0 # - \x03 |0 # - \x04 |0 # - \x05 |0 # - \x06 |0 # - \x07 |0 # - \x08 |0 # - \x09 |0 # - \x0A |0 # - \x0B |0 # - \x0C |0 # - \x0D |0 # - \x0E |0 # - \x0F |0 # - \x10 |0 # - \x11 |0 # - \x12 |0 # - \x13 |0 # - \x14 |0 # - \x15 |0 # - \x16 |0 # - \x17 |0 # - \x18 |0 # - \x19 |0 # - \x1A |0 # - \x1B |0 # - \x1C |0 # - \x1D |0 # - \x1E |0 # - \x1F |0 # - \x20 |0 # SPACE - \x21 |0 # EXCLAMATION MARK - \x22 |0 # QUOTATION MARK - \x23 |0 # NUMBER SIGN - \x24 |0 # DOLLAR SIGN - \x25 |0 # PERCENT SIGN - \x26 |0 # AMPERSAND - \x27 |0 # APOSTROPHE - \x28 |0 # LEFT PARENTHESIS - \x29 |0 # RIGHT PARENTHESIS - \x2A |0 # ASTERISK - \x2B |0 # PLUS SIGN - \x2C |0 # COMMA - \x2D |0 # HYPHEN-MINUS - \x2E |0 # FULL STOP - \x2F |0 # SOLIDUS - \x30 |0 # DIGIT ZERO - \x31 |0 # DIGIT ONE - \x32 |0 # DIGIT TWO - \x33 |0 # DIGIT THREE - \x34 |0 # DIGIT FOUR - \x35 |0 # DIGIT FIVE - \x36 |0 # DIGIT SIX - \x37 |0 # DIGIT SEVEN - \x38 |0 # DIGIT EIGHT - \x39 |0 # DIGIT NINE - \x3A |0 # COLON - \x3B |0 # SEMICOLON - \x3C |0 # LESS-THAN SIGN - \x3D |0 # EQUALS SIGN - \x3E |0 # GREATER-THAN SIGN - \x3F |0 # QUESTION MARK - \x40 |0 # COMMERCIAL AT - \x41 |0 # LATIN CAPITAL LETTER A - \x42 |0 # LATIN CAPITAL LETTER B - \x43 |0 # LATIN CAPITAL LETTER C - \x44 |0 # LATIN CAPITAL LETTER D - \x45 |0 # LATIN CAPITAL LETTER E - \x46 |0 # LATIN CAPITAL LETTER F - \x47 |0 # LATIN CAPITAL LETTER G - \x48 |0 # LATIN CAPITAL LETTER H - \x49 |0 # LATIN CAPITAL LETTER I - \x4A |0 # LATIN CAPITAL LETTER J - \x4B |0 # LATIN CAPITAL LETTER K - \x4C |0 # LATIN CAPITAL LETTER L - \x4D |0 # LATIN CAPITAL LETTER M - \x4E |0 # LATIN CAPITAL LETTER N - \x4F |0 # LATIN CAPITAL LETTER O - \x50 |0 # LATIN CAPITAL LETTER P - \x51 |0 # LATIN CAPITAL LETTER Q - \x52 |0 # LATIN CAPITAL LETTER R - \x53 |0 # LATIN CAPITAL LETTER S - \x54 |0 # LATIN CAPITAL LETTER T - \x55 |0 # LATIN CAPITAL LETTER U - \x56 |0 # LATIN CAPITAL LETTER V - \x57 |0 # LATIN CAPITAL LETTER W - \x58 |0 # LATIN CAPITAL LETTER X - \x59 |0 # LATIN CAPITAL LETTER Y - \x5A |0 # LATIN CAPITAL LETTER Z - \x5B |0 # LEFT SQUARE BRACKET - \x5C |0 # REVERSE SOLIDUS - \x5D |0 # RIGHT SQUARE BRACKET - \x5E |0 # CIRCUMFLEX ACCENT - \x5F |0 # LOW LINE - \x60 |0 # GRAVE ACCENT - \x61 |0 # LATIN SMALL LETTER A - \x62 |0 # LATIN SMALL LETTER B - \x63 |0 # LATIN SMALL LETTER C - \x64 |0 # LATIN SMALL LETTER D - \x65 |0 # LATIN SMALL LETTER E - \x66 |0 # LATIN SMALL LETTER F - \x67 |0 # LATIN SMALL LETTER G - \x68 |0 # LATIN SMALL LETTER H - \x69 |0 # LATIN SMALL LETTER I - \x6A |0 # LATIN SMALL LETTER J - \x6B |0 # LATIN SMALL LETTER K - \x6C |0 # LATIN SMALL LETTER L - \x6D |0 # LATIN SMALL LETTER M - \x6E |0 # LATIN SMALL LETTER N - \x6F |0 # LATIN SMALL LETTER O - \x70 |0 # LATIN SMALL LETTER P - \x71 |0 # LATIN SMALL LETTER Q - \x72 |0 # LATIN SMALL LETTER R - \x73 |0 # LATIN SMALL LETTER S - \x74 |0 # LATIN SMALL LETTER T - \x75 |0 # LATIN SMALL LETTER U - \x76 |0 # LATIN SMALL LETTER V - \x77 |0 # LATIN SMALL LETTER W - \x78 |0 # LATIN SMALL LETTER X - \x79 |0 # LATIN SMALL LETTER Y - \x7A |0 # LATIN SMALL LETTER Z - \x7B |0 # LEFT CURLY BRACKET - \x7C |0 # VERTICAL LINE - \x7D |0 # RIGHT CURLY BRACKET - \x7E |0 # TILDE - \x88 |0 # COPYRIGHT SIGN - \x89 |0 # REGISTERED SIGN - \x80 |0 # MULTIPLICATION SIGN - \xEA |0 # DEVANAGARI DANDA - \x90 |0 # DEVANAGARI DOUBLE DANDA - \xA1 |0 # GUJARATI SIGN CANDRABINDU - \xA2 |0 # GUJARATI SIGN ANUSVARA - \xA3 |0 # GUJARATI SIGN VISARGA - \xA4 |0 # GUJARATI LETTER A - \xA5 |0 # GUJARATI LETTER AA - \xA6 |0 # GUJARATI LETTER I - \xA7 |0 # GUJARATI LETTER II - \xA8 |0 # GUJARATI LETTER U - \xA9 |0 # GUJARATI LETTER UU - \xAA |0 # GUJARATI LETTER VOCALIC R - \xAE |0 # GUJARATI VOWEL CANDRA E - \xAC |0 # GUJARATI LETTER E - \xAD |0 # GUJARATI LETTER AI - \xB2 |0 # GUJARATI VOWEL CANDRA O - \xB0 |0 # GUJARATI LETTER O - \xB1 |0 # GUJARATI LETTER AU - \xB3 |0 # GUJARATI LETTER KA - \xB4 |0 # GUJARATI LETTER KHA - \xB5 |0 # GUJARATI LETTER GA - \xB6 |0 # GUJARATI LETTER GHA - \xB7 |0 # GUJARATI LETTER NGA - \xB8 |0 # GUJARATI LETTER CA - \xB9 |0 # GUJARATI LETTER CHA - \xBA |0 # GUJARATI LETTER JA - \xBB |0 # GUJARATI LETTER JHA - \xBC |0 # GUJARATI LETTER NYA - \xBD |0 # GUJARATI LETTER TTA - \xBE |0 # GUJARATI LETTER TTHA - \xBF |0 # GUJARATI LETTER DDA - \xC0 |0 # GUJARATI LETTER DDHA - \xC1 |0 # GUJARATI LETTER NNA - \xC2 |0 # GUJARATI LETTER TA - \xC3 |0 # GUJARATI LETTER THA - \xC4 |0 # GUJARATI LETTER DA - \xC5 |0 # GUJARATI LETTER DHA - \xC6 |0 # GUJARATI LETTER NA - \xC8 |0 # GUJARATI LETTER PA - \xC9 |0 # GUJARATI LETTER PHA - \xCA |0 # GUJARATI LETTER BA - \xCB |0 # GUJARATI LETTER BHA - \xCC |0 # GUJARATI LETTER MA - \xCD |0 # GUJARATI LETTER YA - \xCF |0 # GUJARATI LETTER RA - \xD1 |0 # GUJARATI LETTER LA - \xD2 |0 # GUJARATI LETTER LLA - \xD4 |0 # GUJARATI LETTER VA - \xD5 |0 # GUJARATI LETTER SHA - \xD6 |0 # GUJARATI LETTER SSA - \xD7 |0 # GUJARATI LETTER SA - \xD8 |0 # GUJARATI LETTER HA - \xE9 |0 # GUJARATI SIGN NUKTA - \xDA |0 # GUJARATI VOWEL SIGN AA - \xDB |0 # GUJARATI VOWEL SIGN I - \xDC |0 # GUJARATI VOWEL SIGN II - \xDD |0 # GUJARATI VOWEL SIGN U - \xDE |0 # GUJARATI VOWEL SIGN UU - \xDF |0 # GUJARATI VOWEL SIGN VOCALIC R - \xDF\xE9 |1 # GUJARATI VOWEL SIGN VOCALIC RR - \xE3 |0 # GUJARATI VOWEL SIGN CANDRA E - \xE1 |0 # GUJARATI VOWEL SIGN E - \xE2 |0 # GUJARATI VOWEL SIGN AI - \xE7 |0 # GUJARATI VOWEL SIGN CANDRA O - \xE5 |0 # GUJARATI VOWEL SIGN O - \xE6 |0 # GUJARATI VOWEL SIGN AU - \xE8 |0 # GUJARATI SIGN VIRAMA # halant - \xE8\xE8 |1 # GUJARATI SIGN VIRAMA + ZWNJ # explicit halant - \xE8\xE9 |1 # GUJARATI SIGN VIRAMA + ZWJ # soft halant - \xA1\xE9 |1 # GUJARATI OM - \xAA\xE9 |1 # GUJARATI LETTER VOCALIC RR - \xF1 |0 # GUJARATI DIGIT ZERO - \xF2 |0 # GUJARATI DIGIT ONE - \xF3 |0 # GUJARATI DIGIT TWO - \xF4 |0 # GUJARATI DIGIT THREE - \xF5 |0 # GUJARATI DIGIT FOUR - \xF6 |0 # GUJARATI DIGIT FIVE - \xF7 |0 # GUJARATI DIGIT SIX - \xF8 |0 # GUJARATI DIGIT SEVEN - \xF9 |0 # GUJARATI DIGIT EIGHT - \xFA |0 # GUJARATI DIGIT NINE - \xD9 |0 # LEFT-TO-RIGHT MARK # invisible consonant - \x82 |0 # EN DASH - \x83 |0 # EM DASH - \x84 |0 # LEFT SINGLE QUOTATION MARK - \x85 |0 # RIGHT SINGLE QUOTATION MARK - \x87 |0 # BULLET - \x86 |0 # HORIZONTAL ELLIPSIS - \x8A |0 # TRADE MARK SIGN - \x81 |0 # MINUS SIGN -END CHARMAP diff --git a/ext/Encode/ucm/macGurmukhi.ucm b/ext/Encode/ucm/macGurmukhi.ucm deleted file mode 100644 index 1190fc0..0000000 --- a/ext/Encode/ucm/macGurmukhi.ucm +++ /dev/null @@ -1,223 +0,0 @@ -# -# $Id: macGurmukhi.ucm,v 1.20 2002/04/04 19:50:54 dankogai Exp $ -# -# Original table can be obtained at -# http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/GURMUKHI.TXT -# - "MacGurmukhi" - 1 - 2 - \x3F -CHARMAP - \x00 |0 # - \x01 |0 # - \x02 |0 # - \x03 |0 # - \x04 |0 # - \x05 |0 # - \x06 |0 # - \x07 |0 # - \x08 |0 # - \x09 |0 # - \x0A |0 # - \x0B |0 # - \x0C |0 # - \x0D |0 # - \x0E |0 # - \x0F |0 # - \x10 |0 # - \x11 |0 # - \x12 |0 # - \x13 |0 # - \x14 |0 # - \x15 |0 # - \x16 |0 # - \x17 |0 # - \x18 |0 # - \x19 |0 # - \x1A |0 # - \x1B |0 # - \x1C |0 # - \x1D |0 # - \x1E |0 # - \x1F |0 # - \x20 |0 # SPACE - \x21 |0 # EXCLAMATION MARK - \x22 |0 # QUOTATION MARK - \x23 |0 # NUMBER SIGN - \x24 |0 # DOLLAR SIGN - \x25 |0 # PERCENT SIGN - \x26 |0 # AMPERSAND - \x27 |0 # APOSTROPHE - \x28 |0 # LEFT PARENTHESIS - \x29 |0 # RIGHT PARENTHESIS - \x2A |0 # ASTERISK - \x2B |0 # PLUS SIGN - \x2C |0 # COMMA - \x2D |0 # HYPHEN-MINUS - \x2E |0 # FULL STOP - \x2F |0 # SOLIDUS - \x30 |0 # DIGIT ZERO - \x31 |0 # DIGIT ONE - \x32 |0 # DIGIT TWO - \x33 |0 # DIGIT THREE - \x34 |0 # DIGIT FOUR - \x35 |0 # DIGIT FIVE - \x36 |0 # DIGIT SIX - \x37 |0 # DIGIT SEVEN - \x38 |0 # DIGIT EIGHT - \x39 |0 # DIGIT NINE - \x3A |0 # COLON - \x3B |0 # SEMICOLON - \x3C |0 # LESS-THAN SIGN - \x3D |0 # EQUALS SIGN - \x3E |0 # GREATER-THAN SIGN - \x3F |0 # QUESTION MARK - \x40 |0 # COMMERCIAL AT - \x41 |0 # LATIN CAPITAL LETTER A - \x42 |0 # LATIN CAPITAL LETTER B - \x43 |0 # LATIN CAPITAL LETTER C - \x44 |0 # LATIN CAPITAL LETTER D - \x45 |0 # LATIN CAPITAL LETTER E - \x46 |0 # LATIN CAPITAL LETTER F - \x47 |0 # LATIN CAPITAL LETTER G - \x48 |0 # LATIN CAPITAL LETTER H - \x49 |0 # LATIN CAPITAL LETTER I - \x4A |0 # LATIN CAPITAL LETTER J - \x4B |0 # LATIN CAPITAL LETTER K - \x4C |0 # LATIN CAPITAL LETTER L - \x4D |0 # LATIN CAPITAL LETTER M - \x4E |0 # LATIN CAPITAL LETTER N - \x4F |0 # LATIN CAPITAL LETTER O - \x50 |0 # LATIN CAPITAL LETTER P - \x51 |0 # LATIN CAPITAL LETTER Q - \x52 |0 # LATIN CAPITAL LETTER R - \x53 |0 # LATIN CAPITAL LETTER S - \x54 |0 # LATIN CAPITAL LETTER T - \x55 |0 # LATIN CAPITAL LETTER U - \x56 |0 # LATIN CAPITAL LETTER V - \x57 |0 # LATIN CAPITAL LETTER W - \x58 |0 # LATIN CAPITAL LETTER X - \x59 |0 # LATIN CAPITAL LETTER Y - \x5A |0 # LATIN CAPITAL LETTER Z - \x5B |0 # LEFT SQUARE BRACKET - \x5C |0 # REVERSE SOLIDUS - \x5D |0 # RIGHT SQUARE BRACKET - \x5E |0 # CIRCUMFLEX ACCENT - \x5F |0 # LOW LINE - \x60 |0 # GRAVE ACCENT - \x61 |0 # LATIN SMALL LETTER A - \x62 |0 # LATIN SMALL LETTER B - \x63 |0 # LATIN SMALL LETTER C - \x64 |0 # LATIN SMALL LETTER D - \x65 |0 # LATIN SMALL LETTER E - \x66 |0 # LATIN SMALL LETTER F - \x67 |0 # LATIN SMALL LETTER G - \x68 |0 # LATIN SMALL LETTER H - \x69 |0 # LATIN SMALL LETTER I - \x6A |0 # LATIN SMALL LETTER J - \x6B |0 # LATIN SMALL LETTER K - \x6C |0 # LATIN SMALL LETTER L - \x6D |0 # LATIN SMALL LETTER M - \x6E |0 # LATIN SMALL LETTER N - \x6F |0 # LATIN SMALL LETTER O - \x70 |0 # LATIN SMALL LETTER P - \x71 |0 # LATIN SMALL LETTER Q - \x72 |0 # LATIN SMALL LETTER R - \x73 |0 # LATIN SMALL LETTER S - \x74 |0 # LATIN SMALL LETTER T - \x75 |0 # LATIN SMALL LETTER U - \x76 |0 # LATIN SMALL LETTER V - \x77 |0 # LATIN SMALL LETTER W - \x78 |0 # LATIN SMALL LETTER X - \x79 |0 # LATIN SMALL LETTER Y - \x7A |0 # LATIN SMALL LETTER Z - \x7B |0 # LEFT CURLY BRACKET - \x7C |0 # VERTICAL LINE - \x7D |0 # RIGHT CURLY BRACKET - \x7E |0 # TILDE - \x88 |0 # COPYRIGHT SIGN - \x89 |0 # REGISTERED SIGN - \x80 |0 # MULTIPLICATION SIGN - \xEA |0 # DEVANAGARI DANDA - \xA2 |0 # GURMUKHI SIGN BINDI - \xA4 |0 # GURMUKHI LETTER A - \xA5 |0 # GURMUKHI LETTER AA - \xA6 |0 # GURMUKHI LETTER I - \xA7 |0 # GURMUKHI LETTER II - \xA8 |0 # GURMUKHI LETTER U - \xA9 |0 # GURMUKHI LETTER UU - \xAC |0 # GURMUKHI LETTER EE - \xAD |0 # GURMUKHI LETTER AI - \xB0 |0 # GURMUKHI LETTER OO - \xB1 |0 # GURMUKHI LETTER AU - \xB3 |0 # GURMUKHI LETTER KA - \xB4 |0 # GURMUKHI LETTER KHA - \xB5 |0 # GURMUKHI LETTER GA - \xB6 |0 # GURMUKHI LETTER GHA - \xB7 |0 # GURMUKHI LETTER NGA - \xB8 |0 # GURMUKHI LETTER CA - \xB9 |0 # GURMUKHI LETTER CHA - \xBA |0 # GURMUKHI LETTER JA - \xBB |0 # GURMUKHI LETTER JHA - \xBC |0 # GURMUKHI LETTER NYA - \xBD |0 # GURMUKHI LETTER TTA - \xBE |0 # GURMUKHI LETTER TTHA - \xBF |0 # GURMUKHI LETTER DDA - \xC0 |0 # GURMUKHI LETTER DDHA - \xC1 |0 # GURMUKHI LETTER NNA - \xC2 |0 # GURMUKHI LETTER TA - \xC3 |0 # GURMUKHI LETTER THA - \xC4 |0 # GURMUKHI LETTER DA - \xC5 |0 # GURMUKHI LETTER DHA - \xC6 |0 # GURMUKHI LETTER NA - \xC8 |0 # GURMUKHI LETTER PA - \xC9 |0 # GURMUKHI LETTER PHA - \xCA |0 # GURMUKHI LETTER BA - \xCB |0 # GURMUKHI LETTER BHA - \xCC |0 # GURMUKHI LETTER MA - \xCD |0 # GURMUKHI LETTER YA - \xCF |0 # GURMUKHI LETTER RA - \xD1 |0 # GURMUKHI LETTER LA - \xD4 |0 # GURMUKHI LETTER VA - \xD5 |0 # GURMUKHI LETTER SHA - \xD7 |0 # GURMUKHI LETTER SA - \xD8 |0 # GURMUKHI LETTER HA - \xE9 |0 # GURMUKHI SIGN NUKTA - \xDA |0 # GURMUKHI VOWEL SIGN AA - \xDB |0 # GURMUKHI VOWEL SIGN I - \xDC |0 # GURMUKHI VOWEL SIGN II - \xDD |0 # GURMUKHI VOWEL SIGN U - \xDE |0 # GURMUKHI VOWEL SIGN UU - \xE1 |0 # GURMUKHI VOWEL SIGN EE - \xE2 |0 # GURMUKHI VOWEL SIGN AI - \xE5 |0 # GURMUKHI VOWEL SIGN OO - \xE6 |0 # GURMUKHI VOWEL SIGN AU - \xE8 |0 # GURMUKHI SIGN VIRAMA # halant - \xE8\xE8 |1 # GURMUKHI SIGN VIRAMA + ZWNJ # explicit halant - \xE8\xE9 |1 # GURMUKHI SIGN VIRAMA + ZWJ # soft halant - \xF1 |0 # GURMUKHI DIGIT ZERO - \xF2 |0 # GURMUKHI DIGIT ONE - \xF3 |0 # GURMUKHI DIGIT TWO - \xF4 |0 # GURMUKHI DIGIT THREE - \xF5 |0 # GURMUKHI DIGIT FOUR - \xF6 |0 # GURMUKHI DIGIT FIVE - \xF7 |0 # GURMUKHI DIGIT SIX - \xF8 |0 # GURMUKHI DIGIT SEVEN - \xF9 |0 # GURMUKHI DIGIT EIGHT - \xFA |0 # GURMUKHI DIGIT NINE - \x90 |0 # GURMUKHI ADDAK - \x93 |0 # GURMUKHI IRI - \x92 |0 # GURMUKHI URA - \x94 |0 # GURMUKHI EK ONKAR - \xD9 |0 # LEFT-TO-RIGHT MARK # invisible consonant - \x82 |0 # EN DASH - \x83 |0 # EM DASH - \x84 |0 # LEFT SINGLE QUOTATION MARK - \x85 |0 # RIGHT SINGLE QUOTATION MARK - \x87 |0 # BULLET - \x86 |0 # HORIZONTAL ELLIPSIS - \x8A |0 # TRADE MARK SIGN - \x81 |0 # MINUS SIGN - \x91 |3 # GURMUKHI LETTER RRA, alternate -END CHARMAP diff --git a/ext/Encode/ucm/macROMnn.ucm b/ext/Encode/ucm/macROMnn.ucm index 242dd43..90f7a6b 100644 --- a/ext/Encode/ucm/macROMnn.ucm +++ b/ext/Encode/ucm/macROMnn.ucm @@ -1,5 +1,5 @@ # -# $Id: macROMnn.ucm,v 1.20 2002/04/04 19:50:55 dankogai Exp $ +# $Id: macROMnn.ucm,v 1.21 2002/04/09 20:06:15 dankogai Exp dankogai $ # # Original table can be obtained at # http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMANIAN.TXT @@ -93,9 +93,9 @@ CHARMAP \x50 |0 # LATIN CAPITAL LETTER P \x51 |0 # LATIN CAPITAL LETTER Q \x52 |0 # LATIN CAPITAL LETTER R - \xAF |3 # LATIN CAPITAL LETTER S + COMBINING COMMA BELOW + \xAF |3 # LATIN CAPITAL LETTER S + COMBINING COMMA BELOW \x53 |0 # LATIN CAPITAL LETTER S - \xDE |3 # LATIN CAPITAL LETTER T + COMBINING COMMA BELOW + \xDE |3 # LATIN CAPITAL LETTER T + COMBINING COMMA BELOW \x54 |0 # LATIN CAPITAL LETTER T \x55 |0 # LATIN CAPITAL LETTER U \x56 |0 # LATIN CAPITAL LETTER V @@ -127,9 +127,9 @@ CHARMAP \x70 |0 # LATIN SMALL LETTER P \x71 |0 # LATIN SMALL LETTER Q \x72 |0 # LATIN SMALL LETTER R - \xBF |3 # LATIN SMALL LETTER S + COMBINING COMMA BELOW + \xBF |3 # LATIN SMALL LETTER S + COMBINING COMMA BELOW \x73 |0 # LATIN SMALL LETTER S - \xDF |3 # LATIN SMALL LETTER T + COMBINING COMMA BELOW + \xDF |3 # LATIN SMALL LETTER T + COMBINING COMMA BELOW \x74 |0 # LATIN SMALL LETTER T \x75 |0 # LATIN SMALL LETTER U \x76 |0 # LATIN SMALL LETTER V