From: Jarkko Hietaniemi Date: Mon, 29 Apr 2002 01:11:10 +0000 (+0000) Subject: Upgrade to Unicode::Normalize 0.76, X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f027f50205c813d86c890c5f2f60eb2d68bf2fad;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Unicode::Normalize 0.76, from SADAHIRO Tomoyuki. p4raw-id: //depot/perl@16260 --- diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes index 37f7dee..b4c4d97 100644 --- a/ext/Unicode/Normalize/Changes +++ b/ext/Unicode/Normalize/Changes @@ -1,5 +1,12 @@ Revision history for Perl extension Unicode::Normalize. +0.17 Sun Apr 28 23:13:32 2002 + - now normalize('NFC',$1) should work. + - Some croak()'s are added in mkheader. + - synchronization with bleadperl. + - Change 15596: by me + - Change 16136: by pudge + 0.16 Thu Mar 21 13:36:14 2002 - synchronization with bleadperl. - Change 15318: by jhi diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm index 8a31614..33aeb6e 100644 --- a/ext/Unicode/Normalize/Normalize.pm +++ b/ext/Unicode/Normalize/Normalize.pm @@ -11,7 +11,7 @@ use strict; use warnings; use Carp; -our $VERSION = '0.16'; +our $VERSION = '0.17'; our $PACKAGE = __PACKAGE__; require Exporter; @@ -45,24 +45,26 @@ sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } sub normalize($$) { my $form = shift; + my $str = shift; $form =~ s/^NF//; return - $form eq 'D' ? NFD ($_[0]) : - $form eq 'C' ? NFC ($_[0]) : - $form eq 'KD' ? NFKD($_[0]) : - $form eq 'KC' ? NFKC($_[0]) : + $form eq 'D' ? NFD ($str) : + $form eq 'C' ? NFC ($str) : + $form eq 'KD' ? NFKD($str) : + $form eq 'KC' ? NFKC($str) : croak $PACKAGE."::normalize: invalid form name: $form"; } sub check($$) { my $form = shift; + my $str = shift; $form =~ s/^NF//; return - $form eq 'D' ? checkNFD ($_[0]) : - $form eq 'C' ? checkNFC ($_[0]) : - $form eq 'KD' ? checkNFKD($_[0]) : - $form eq 'KC' ? checkNFKC($_[0]) : + $form eq 'D' ? checkNFD ($str) : + $form eq 'C' ? 
checkNFC ($str) : + $form eq 'KD' ? checkNFKD($str) : + $form eq 'KC' ? checkNFKC($str) : croak $PACKAGE."::check: invalid form name: $form"; } @@ -71,7 +73,7 @@ __END__ =head1 NAME -Unicode::Normalize - normalized forms of Unicode text +Unicode::Normalize - Unicode Normalization Forms =head1 SYNOPSIS @@ -185,23 +187,23 @@ The result returned will be: =item C<$result = checkNFD($string)> -returns YES (1) or NO (empty string). +returns C<YES> (C<1>) or C<NO> (C<empty string>). =item C<$result = checkNFC($string)> -returns YES (1), NO (empty string), or MAYBE (undef). +returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). =item C<$result = checkNFKD($string)> -returns YES (1) or NO (empty string). +returns C<YES> (C<1>) or C<NO> (C<empty string>). =item C<$result = checkNFKC($string)> -returns YES (1), NO (empty string), or MAYBE (undef). +returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). =item C<$result = check($form_name, $string)> -returns YES (1), NO (empty string), or MAYBE (undef). +returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). C<$form_name> is alike to that for C<normalize()>. @@ -218,7 +220,7 @@ For example, C<COMBINING ACUTE ACCENT> has the MAYBE_NFC/MAYBE_NFKC property. Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. -Though, C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC +C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. @@ -241,7 +243,7 @@ If the character of the specified codepoint is canonically decomposable (including Hangul Syllables), returns the B<completely decomposed> string canonically equivalent to it. -If it is not decomposable, returns undef. +If it is not decomposable, returns C<undef>. =item C<$compatibility_decomposed = getCompat($codepoint)> @@ -249,7 +251,7 @@ If the character of the specified codepoint is compatibility decomposable (including Hangul Syllables), returns the B<completely decomposed> string compatibility equivalent to it. -If it is not decomposable, returns undef. +If it is not decomposable, returns C<undef>.
=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)> @@ -257,7 +259,7 @@ If two characters here and next (as codepoints) are composable (including Hangul Jamo/Syllables and Composition Exclusions), returns the codepoint of the composite. -If they are not composable, returns undef. +If they are not composable, returns C<undef>. =item C<$combining_class = getCombinClass($codepoint)> diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README index d24f0c7..c664b6a 100644 --- a/ext/Unicode/Normalize/README +++ b/ext/Unicode/Normalize/README @@ -1,7 +1,7 @@ -Unicode/Normalize version 0.16 +Unicode/Normalize version 0.17 =================================== -Unicode::Normalize - normalized forms of Unicode text +Unicode::Normalize - Unicode Normalization Forms SYNOPSIS @@ -67,8 +67,8 @@ unicore/CompositionExclusions.txt (or unicode/CompExcl.txt) CAVEAT -(1) In bleadperl, unicore/CompExcl.txt is renamed - unicore/CompositionExclusions.txt. +(1) In the perl-current, unicore/CompExcl.txt + is renamed unicore/CompositionExclusions.txt. (2) When these unicore/*.* files are updated; diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader index 67a132b..339f866 100644 --- a/ext/Unicode/Normalize/mkheader +++ b/ext/Unicode/Normalize/mkheader @@ -1,11 +1,14 @@ #!perl # # This script generates "unfcan.h", "unfcpt.h", "unfcmb.h", -# "unfcmp.h", and "unfexc.h" +# "unfcmp.h", and "unfexc.h" # from CombiningClass.pl, Decomposition.pl, CompExcl.txt # in lib/unicore or unicode directory # for Unicode::Normalize.xs. (cf. Makefile.PL) # +# Usage: <perl mkheader> in command line +# or <do 'mkheader'> in perl +# use 5.006; use strict; use warnings; @@ -24,7 +27,7 @@ our $Decomp = do "unicore/Decomposition.pl" our %Combin; # $codepoint => $number : combination class our %Canon; # $codepoint => $hexstring : canonical decomp. our %Compat; # $codepoint => $hexstring : compat. decomp.
-our %Compos; # $string => $codepoint : composite +our %Compos; # $1st,$2nd => $codepoint : composite our %Exclus; # $codepoint => 1 : composition exclusions our %Single; # $codepoint => 1 : singletons our %NonStD; # $codepoint => 1 : non-starter decompositions @@ -62,7 +65,6 @@ while ($Decomp =~ /(.+)/g) { my @tab = split /\t/, $1; my $compat = $tab[2] =~ s/<[^>]+>//; my $dec = [ _getHexArray($tab[2]) ]; # decomposition - my $com = pack('U*', @$dec); # composable sequence my $ini = hex($tab[0]); # initial decomposable character if ($tab[1] eq '') { $Compat{ $ini } = $dec; @@ -70,14 +72,16 @@ while ($Decomp =~ /(.+)/g) { if (! $compat) { $Canon{ $ini } = $dec; - if (@$dec > 1) { + if (@$dec == 2) { if ($Combin{ $dec->[0] }) { $NonStD{ $ini } = 1; } else { - $Compos{ $com } = $ini; + $Compos{ $dec->[0] }{ $dec->[1] } = $ini; } - } else { + } elsif (@$dec == 1) { $Single{ $ini } = 1; + } else { + croak("Weird Canonical Decomposition of U+$tab[0]"); } } } else { @@ -86,14 +90,16 @@ while ($Decomp =~ /(.+)/g) { if (! $compat) { $Canon{ $u } = $dec; - if (@$dec > 1) { + if (@$dec == 2) { if ($Combin{ $dec->[0] }) { $NonStD{ $u } = 1; } else { - $Compos{ $com } = $u; + $Compos{ $dec->[0] }{ $dec->[1] } = $u; } - } else { + } elsif (@$dec == 1) { $Single{ $u } = 1; + } else { + croak("Weird Canonical Decomposition of U+$tab[0]"); } } } @@ -130,6 +136,8 @@ sub _U_stringify { sprintf '"%s"', join '', map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_; } +# Do we need say +# instead of for EBCDIC? foreach my $hash (\%Canon, \%Compat) { foreach my $key (keys %$hash) { @@ -142,18 +150,20 @@ my $structname = "${prefix}complist"; our (%Comp1st, %Comp2nd, %CompList); -foreach (sort keys %Compos) { - my @a = unpack('U*', $_); - my $val = $Compos{$_}; - my $name = sprintf "${structname}_%06x", $a[0]; - $Comp1st{$a[0]} = $name; - $Comp2nd{$a[1]} = 1 if ! $Exclus{$Compos{$_}} && ! 
$Combin{$a[0]}; - $CompList{$name}{$a[1]} = $val; +foreach my $c1 (keys %Compos) { + my $name = sprintf "${structname}_%06x", $c1; + $Comp1st{$c1} = $name; + + foreach my $c2 (keys %{ $Compos{$c1} }) { + my $composite = $Compos{$c1}{$c2}; + $Comp2nd{$c2} = 1 if ! $Exclus{$composite} && ! $Combin{$c1}; + $CompList{$name}{$c2} = $composite; + } } # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo -foreach (0x1161..0x1175, 0x11A8..0x11C2) { - $Comp2nd{$_} = 1; +foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { + $Comp2nd{$j} = 1; } my $compinit = @@ -285,6 +295,8 @@ EOF print $init if defined $init; foreach my $uv (keys %$hash) { + croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) + unless $uv <= 0x10FFFF; my @c = unpack 'CCCC', pack 'N', $uv; $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; } diff --git a/ext/Unicode/Normalize/t/func.t b/ext/Unicode/Normalize/t/func.t index 86320cc..f45e111 100644 --- a/ext/Unicode/Normalize/t/func.t +++ b/ext/Unicode/Normalize/t/func.t @@ -3,8 +3,8 @@ BEGIN { if (ord("A") == 193) { - print "1..0 # Unicode::Normalize not ported to EBCDIC\n"; - exit 0; + print "1..0 # Unicode::Normalize not ported to EBCDIC\n"; + exit 0; } } @@ -13,7 +13,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 10 }; +BEGIN { plan tests => 13 }; use Unicode::Normalize qw(:all); ok(1); # If we made it this far, we're ok. @@ -22,7 +22,7 @@ ok(1); # If we made it this far, we're ok. print getCombinClass( 0) == 0 && getCombinClass( 768) == 230 && getCombinClass(1809) == 36 -# && getCombinClass(119143) == 1 # U+1D167, a Unicode 3.1 character + && ($] < 5.007003 || getCombinClass(0x1D167) == 1) # Unicode 3.1 ? "ok" : "not ok", " 2\n"; print ! defined getCanon( 0) @@ -125,3 +125,15 @@ print 1 && answer(checkNFKD(pack('U*', 0x20, 0xC1, 0xFF71, 0x2025))) eq "NO" && answer(checkNFKC(pack('U*', 0x20, 0xC1, 0xAE00, 0x2025))) eq "NO" ? "ok" : "not ok", " 10\n"; + +"012ABC" =~ /(\d+)(\w+)/; +print "012" eq NFC $1 && "ABC" eq NFC $2 + ? 
"ok" : "not ok", " 11\n"; + +print "012" eq normalize('C', $1) && "ABC" eq normalize('C', $2) + ? "ok" : "not ok", " 12\n"; + +print "012" eq normalize('NFC', $1) && "ABC" eq normalize('NFC', $2) + ? "ok" : "not ok", " 13\n"; + # s/^NF// in normalize() must not prevent using $1, $&, etc. +