From: Jarkko Hietaniemi Date: Mon, 3 Mar 2003 06:45:59 +0000 (+0000) Subject: Upgrade to Unicode::Normalize 0.20. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=6c941e0cc3f0f3bcc43af75928ecabd63f9b41f6;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Unicode::Normalize 0.20. p4raw-id: //depot/perl@18817 --- diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes index b4c4d97..30f5c4a 100644 --- a/ext/Unicode/Normalize/Changes +++ b/ext/Unicode/Normalize/Changes @@ -1,5 +1,12 @@ Revision history for Perl extension Unicode::Normalize. +0.20 Sun Mar 02 13:29:25 2003 + - decompose Hangul syllables in a decomposition mapping. + +0.18 ... unreleased + - synchronization with bleadperl. + - Change 16262: by me + 0.17 Sun Apr 28 23:13:32 2002 - now normalize('NFC',$1) should work. - Some croak()'s are added in mkheader. diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm index 33aeb6e..e0232d3 100644 --- a/ext/Unicode/Normalize/Normalize.pm +++ b/ext/Unicode/Normalize/Normalize.pm @@ -11,7 +11,7 @@ use strict; use warnings; use Carp; -our $VERSION = '0.17'; +our $VERSION = '0.20'; our $PACKAGE = __PACKAGE__; require Exporter; @@ -173,7 +173,7 @@ you can get its NFC/NFKC string, saying =head2 Quick Check -(see Annex 8, UAX #15; F) +(see Annex 8, UAX #15, and F) The following functions check whether the string is in that normalization form. @@ -275,7 +275,7 @@ is a composition exclusion. Returns a boolean whether the character of the specified codepoint is a singleton. -=item C<$is_non_startar_decomposition = isNonStDecomp($codepoint)> +=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)> Returns a boolean whether the canonical decomposition of the character of the specified codepoint @@ -302,10 +302,10 @@ SADAHIRO Tomoyuki, ESADAHIRO@cpan.orgE http://homepage1.nifty.com/nomenclator/perl/ - Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved. + Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved. - This program is free software; you can redistribute it and/or - modify it under the same terms as Perl itself. + This module is free software; you can redistribute it + and/or modify it under the same terms as Perl itself. =head1 SEE ALSO diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index 93cb471..a75f74d 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -13,12 +13,12 @@ /* Perl 5.6.1 ? */ #ifndef uvuni_to_utf8 #define uvuni_to_utf8 uv_to_utf8 -#endif /* uvuni_to_utf8 */ +#endif /* uvuni_to_utf8 */ /* Perl 5.6.1 ? */ #ifndef utf8n_to_uvuni #define utf8n_to_uvuni utf8_to_uv -#endif /* utf8n_to_uvuni */ +#endif /* utf8n_to_uvuni */ /* At present, char > 0x10ffff are unaffected without complaint, right? */ #define VALID_UTF_MAX (0x10ffff) @@ -58,13 +58,15 @@ typedef struct { STRLEN pos; /* position */ } UNF_cc; -int compare_cc(const void *a, const void *b) +int compare_cc (const void *a, const void *b) { int ret_cc; - ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc; + ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; if (ret_cc) return ret_cc; - return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos; + + return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) + - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); } U8* dec_canonical (UV uv) @@ -460,13 +462,10 @@ checkNFC(arg) isMAYBE = TRUE; else if (ix) { char *canon, *compat; - /* - * NFKC_NO when having compatibility mapping; - * i.e. dec_compat(uv) defined & different with dec_canonical(uv). - */ + /* NFKC_NO when having compatibility mapping. */ canon = (char *) dec_canonical(uv); compat = (char *) dec_compat(uv); - if (compat && (!canon || strNE(canon, compat))) + if (compat && !(canon && strEQ(canon, compat))) XSRETURN_NO; } /* end of get NFC/NFKC property */ diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README index c664b6a..f1b1754 100644 --- a/ext/Unicode/Normalize/README +++ b/ext/Unicode/Normalize/README @@ -1,4 +1,4 @@ -Unicode/Normalize version 0.17 +Unicode/Normalize version 0.20 =================================== Unicode::Normalize - Unicode Normalization Forms @@ -12,10 +12,8 @@ SYNOPSIS $NFKD_string = NFKD($string); # Normalization Form KD $NFKC_string = NFKC($string); # Normalization Form KC - or - use Unicode::Normalize 'normalize'; $NFD_string = normalize('D', $string); # Normalization Form D @@ -70,13 +68,13 @@ CAVEAT (1) In the perl-current, unicore/CompExcl.txt is renamed unicore/CompositionExclusions.txt. -(2) When these unicore/*.* files are updated; +(2) After these unicore/*.* files are updated. - in the case of an XS version: + In the case of an XS version: You must rebuild the module, as the data will be compiled on building. - in the case of a NoXS version: + In the case of a NoXS version: Rebuilding is not necessary, as the data will be read on requirement. @@ -88,7 +86,7 @@ COPYRIGHT AND LICENCE http://homepage1.nifty.com/nomenclator/perl/ - Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved. + Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved. - This program is free software; you can redistribute it and/or - modify it under the same terms as Perl itself. + This module is free software; you can redistribute it + and/or modify it under the same terms as Perl itself. diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader index 8dc47a3..6cac390 100644 --- a/ext/Unicode/Normalize/mkheader +++ b/ext/Unicode/Normalize/mkheader @@ -13,6 +13,9 @@ use 5.006; use strict; use warnings; use Carp; +use File::Spec; + +our $IsEBCDIC = ord("A") != 0x41; our $PACKAGE = 'Unicode::Normalize, mkheader'; @@ -25,8 +28,9 @@ our $Decomp = do "unicore/Decomposition.pl" || croak "$PACKAGE: Decomposition.pl not found"; our %Combin; # $codepoint => $number : combination class -our %Canon; # $codepoint => $hexstring : canonical decomp. -our %Compat; # $codepoint => $hexstring : compat. decomp. +our %Canon; # $codepoint => \@codepoints : canonical decomp. +our %Compat; # $codepoint => \@codepoints : compat. decomp. +# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat our %Exclus; # $codepoint => 1 : composition exclusions our %Single; # $codepoint => 1 : singletons our %NonStD; # $codepoint => 1 : non-starter decompositions @@ -38,10 +42,39 @@ our %CompList; # $listname,$2nd => $codepoint : composite our $prefix = "UNF_"; our $structname = "${prefix}complist"; +########## definition of Hangul constants ########## +use constant SBase => 0xAC00; +use constant SFinal => 0xD7A3; # SBase -1 + SCount +use constant SCount => 11172; # LCount * NCount +use constant NCount => 588; # VCount * TCount +use constant LBase => 0x1100; +use constant LFinal => 0x1112; +use constant LCount => 19; +use constant VBase => 0x1161; +use constant VFinal => 0x1175; +use constant VCount => 21; +use constant TBase => 0x11A7; +use constant TFinal => 0x11C2; +use constant TCount => 28; + +sub decomposeHangul { + my $SIndex = $_[0] - SBase; + my $LIndex = int( $SIndex / NCount); + my $VIndex = int(($SIndex % NCount) / TCount); + my $TIndex = $SIndex % TCount; + my @ret = ( + LBase + $LIndex, + VBase + $VIndex, + $TIndex ? (TBase + $TIndex) : (), + ); + wantarray ? @ret : pack('U*', @ret); + # any element in @ret greater than 0xFF, so no need of u2n conversion. +} + +########## getting full decomposion ########## { my($f, $fh); foreach my $d (@INC) { - use File::Spec; $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt"); last if open($fh, $f); $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); @@ -136,14 +169,20 @@ foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { sub getCanonList { my @src = @_; - my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src; + my @dec = map { + (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) + : $Canon{$_} ? @{ $Canon{$_} } : $_ + } @src; return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); # condition @src == @dec is not ok. } sub getCompatList { my @src = @_; - my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src; + my @dec = map { + (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) + : $Compat{$_} ? @{ $Compat{$_} } : $_ + } @src; return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); # condition @src == @dec is not ok. } @@ -160,10 +199,11 @@ foreach my $key (keys %Compat) { sub _U_stringify { sprintf '"%s"', join '', - map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_; + map sprintf("\\x%02x", $_), unpack 'C*', + $IsEBCDIC + ? pack('U*', map utf8::unicode_to_native($_), @_) + : pack('U*', @_); } -# Do we need say -# instead of for EBCDIC? foreach my $hash (\%Canon, \%Compat) { foreach my $key (keys %$hash) { @@ -171,7 +211,7 @@ foreach my $hash (\%Canon, \%Compat) { } } -#################################### +########## writing header files ########## my @boolfunc = ( { diff --git a/ext/Unicode/Normalize/t/func.t b/ext/Unicode/Normalize/t/func.t index f45e111..d540d99 100644 --- a/ext/Unicode/Normalize/t/func.t +++ b/ext/Unicode/Normalize/t/func.t @@ -1,5 +1,3 @@ -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' BEGIN { if (ord("A") == 193) { @@ -8,6 +6,13 @@ BEGIN { } } +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = qw(../lib); + } +} + ######################### use Test; @@ -17,6 +22,20 @@ BEGIN { plan tests => 13 }; use Unicode::Normalize qw(:all); ok(1); # If we made it this far, we're ok. +our $IsEBCDIC = ord("A") != 0x41; + +sub _pack_U { + return $IsEBCDIC + ? pack('U*', map utf8::unicode_to_native($_), @_) + : pack('U*', @_); +} + +sub _unpack_U { + return $IsEBCDIC + ? map(utf8::native_to_unicode($_), unpack 'U*', shift) + : unpack('U*', shift); +} + ######################### print getCombinClass( 0) == 0 @@ -27,32 +46,32 @@ print getCombinClass( 0) == 0 print ! defined getCanon( 0) && ! defined getCanon(41) - && getCanon(0x00C0) eq pack('U*', 0x0041, 0x0300) - && getCanon(0x00EF) eq pack('U*', 0x0069, 0x0308) - && getCanon(0x304C) eq pack('U*', 0x304B, 0x3099) - && getCanon(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301) + && getCanon(0x00C0) eq _pack_U(0x0041, 0x0300) + && getCanon(0x00EF) eq _pack_U(0x0069, 0x0308) + && getCanon(0x304C) eq _pack_U(0x304B, 0x3099) + && getCanon(0x1EA4) eq _pack_U(0x0041, 0x0302, 0x0301) && getCanon(0x1F82) eq "\x{03B1}\x{0313}\x{0300}\x{0345}" - && getCanon(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345) - && getCanon(0xAC00) eq pack('U*', 0x1100, 0x1161) - && getCanon(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF) + && getCanon(0x1FAF) eq _pack_U(0x03A9, 0x0314, 0x0342, 0x0345) + && getCanon(0xAC00) eq _pack_U(0x1100, 0x1161) + && getCanon(0xAE00) eq _pack_U(0x1100, 0x1173, 0x11AF) && ! defined getCanon(0x212C) && ! defined getCanon(0x3243) - && getCanon(0xFA2D) eq pack('U*', 0x9DB4) + && getCanon(0xFA2D) eq _pack_U(0x9DB4) ? "ok" : "not ok", " 3\n"; print ! defined getCompat( 0) && ! defined getCompat(41) - && getCompat(0x00C0) eq pack('U*', 0x0041, 0x0300) - && getCompat(0x00EF) eq pack('U*', 0x0069, 0x0308) - && getCompat(0x304C) eq pack('U*', 0x304B, 0x3099) - && getCompat(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301) - && getCompat(0x1F82) eq pack('U*', 0x03B1, 0x0313, 0x0300, 0x0345) - && getCompat(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345) - && getCompat(0x212C) eq pack('U*', 0x0042) - && getCompat(0x3243) eq pack('U*', 0x0028, 0x81F3, 0x0029) - && getCompat(0xAC00) eq pack('U*', 0x1100, 0x1161) - && getCompat(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF) - && getCompat(0xFA2D) eq pack('U*', 0x9DB4) + && getCompat(0x00C0) eq _pack_U(0x0041, 0x0300) + && getCompat(0x00EF) eq _pack_U(0x0069, 0x0308) + && getCompat(0x304C) eq _pack_U(0x304B, 0x3099) + && getCompat(0x1EA4) eq _pack_U(0x0041, 0x0302, 0x0301) + && getCompat(0x1F82) eq _pack_U(0x03B1, 0x0313, 0x0300, 0x0345) + && getCompat(0x1FAF) eq _pack_U(0x03A9, 0x0314, 0x0342, 0x0345) + && getCompat(0x212C) eq _pack_U(0x0042) + && getCompat(0x3243) eq _pack_U(0x0028, 0x81F3, 0x0029) + && getCompat(0xAC00) eq _pack_U(0x1100, 0x1161) + && getCompat(0xAE00) eq _pack_U(0x1100, 0x1173, 0x11AF) + && getCompat(0xFA2D) eq _pack_U(0x9DB4) ? "ok" : "not ok", " 4\n"; print ! defined getComposite( 0, 0) @@ -85,11 +104,11 @@ print ! isSingleton( 0) ? "ok" : "not ok", " 7\n"; print reorder("") eq "" - && reorder(pack("U*", 0x0041, 0x0300, 0x0315, 0x0313, 0x031b, 0x0061)) - eq pack("U*", 0x0041, 0x031b, 0x0300, 0x0313, 0x0315, 0x0061) - && reorder(pack("U*", 0x00C1, 0x0300, 0x0315, 0x0313, 0x031b, + && reorder(_pack_U(0x0041, 0x0300, 0x0315, 0x0313, 0x031b, 0x0061)) + eq _pack_U(0x0041, 0x031b, 0x0300, 0x0313, 0x0315, 0x0061) + && reorder(_pack_U(0x00C1, 0x0300, 0x0315, 0x0313, 0x031b, 0x0061, 0x309A, 0x3099)) - eq pack("U*", 0x00C1, 0x031b, 0x0300, 0x0313, 0x0315, + eq _pack_U(0x00C1, 0x031b, 0x0300, 0x0313, 0x0315, 0x0061, 0x309A, 0x3099) ? "ok" : "not ok", " 8\n"; @@ -115,15 +134,15 @@ print answer(checkNFD("")) eq "YES" ? "ok" : "not ok", " 9\n"; print 1 - && answer(checkNFD(NFD(pack('U*', 0xC1, 0x1100, 0x1173, 0x11AF)))) eq "YES" - && answer(checkNFD(pack('U*', 0x20, 0xC1, 0x1100, 0x1173, 0x11AF))) eq "NO" - && answer(checkNFC(pack('U*', 0x20, 0xC1, 0x1173, 0x11AF))) eq "MAYBE" - && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xAE00, 0x1100))) eq "YES" - && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xAE00, 0x1100, 0x300))) eq "MAYBE" - && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xFF71, 0x2025))) eq "YES" - && answer(check("NFC", pack('U*', 0x20, 0xC1, 0x212B, 0x300))) eq "NO" - && answer(checkNFKD(pack('U*', 0x20, 0xC1, 0xFF71, 0x2025))) eq "NO" - && answer(checkNFKC(pack('U*', 0x20, 0xC1, 0xAE00, 0x2025))) eq "NO" + && answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))) eq "YES" + && answer(checkNFD(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF))) eq "NO" + && answer(checkNFC(_pack_U(0x20, 0xC1, 0x1173, 0x11AF))) eq "MAYBE" + && answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100))) eq "YES" + && answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300))) eq "MAYBE" + && answer(checkNFC(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))) eq "YES" + && answer(check("NFC", _pack_U(0x20, 0xC1, 0x212B, 0x300))) eq "NO" + && answer(checkNFKD(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))) eq "NO" + && answer(checkNFKC(_pack_U(0x20, 0xC1, 0xAE00, 0x2025))) eq "NO" ? "ok" : "not ok", " 10\n"; "012ABC" =~ /(\d+)(\w+)/; diff --git a/ext/Unicode/Normalize/t/norm.t b/ext/Unicode/Normalize/t/norm.t index b32bf03..77ca218 100644 --- a/ext/Unicode/Normalize/t/norm.t +++ b/ext/Unicode/Normalize/t/norm.t @@ -1,5 +1,3 @@ -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' BEGIN { if (ord("A") == 193) { @@ -8,6 +6,13 @@ BEGIN { } } +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = qw(../lib); + } +} + ######################### use Test; @@ -17,6 +22,20 @@ BEGIN { plan tests => 18 }; use Unicode::Normalize qw(normalize); ok(1); # If we made it this far, we're ok. +our $IsEBCDIC = ord("A") != 0x41; + +sub _pack_U { + return $IsEBCDIC + ? pack('U*', map utf8::unicode_to_native($_), @_) + : pack('U*', @_); +} + +sub _unpack_U { + return $IsEBCDIC + ? map(utf8::native_to_unicode($_), unpack 'U*', shift) + : unpack('U*', shift); +} + ######################### ok(normalize('C', ""), ""); @@ -24,11 +43,11 @@ ok(normalize('D', ""), ""); sub hexNFC { join " ", map sprintf("%04X", $_), - unpack 'U*', normalize 'C', pack 'U*', map hex(), split ' ', shift; + _unpack_U normalize 'C', _pack_U map hex, split ' ', shift; } sub hexNFD { join " ", map sprintf("%04X", $_), - unpack 'U*', normalize 'NFD', pack 'U*', map hex(), split ' ', shift; + _unpack_U normalize 'D', _pack_U map hex, split ' ', shift; } ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t index fe42aae..db1a536 100644 --- a/ext/Unicode/Normalize/t/test.t +++ b/ext/Unicode/Normalize/t/test.t @@ -1,5 +1,3 @@ -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' BEGIN { if (ord("A") == 193) { @@ -8,6 +6,13 @@ BEGIN { } } +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = qw(../lib); + } +} + ######################### use Test; @@ -17,6 +22,20 @@ BEGIN { plan tests => 20 }; use Unicode::Normalize; ok(1); # If we made it this far, we're ok. +our $IsEBCDIC = ord("A") != 0x41; + +sub _pack_U { + return $IsEBCDIC + ? pack('U*', map utf8::unicode_to_native($_), @_) + : pack('U*', @_); +} + +sub _unpack_U { + return $IsEBCDIC + ? map(utf8::native_to_unicode($_), unpack 'U*', shift) + : unpack('U*', shift); +} + ######################### ok(NFC(""), ""); @@ -24,11 +43,11 @@ ok(NFD(""), ""); sub hexNFC { join " ", map sprintf("%04X", $_), - unpack 'U*', NFC pack 'U*', map hex(), split ' ', shift; + _unpack_U NFC _pack_U map hex, split ' ', shift; } sub hexNFD { join " ", map sprintf("%04X", $_), - unpack 'U*', NFD pack 'U*', map hex(), split ' ', shift; + _unpack_U NFD _pack_U map hex, split ' ', shift; } ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); @@ -49,10 +68,11 @@ ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); # should be unary. -my $str11 = pack('U*', 0x41, 0x0302, 0x0301, 0x62); -my $str12 = pack('U*', 0x1EA4, 0x62); +my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62); +my $str12 = _pack_U(0x1EA4, 0x62); ok(NFC $str11 eq $str12); -my $str21 = pack('U*', 0xE0, 0xAC00); -my $str22 = pack('U*', 0x61, 0x0300, 0x1100, 0x1161); +my $str21 = _pack_U(0xE0, 0xAC00); +my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161); ok(NFD $str21 eq $str22); +