From: Rafael Garcia-Suarez Date: Tue, 5 Apr 2005 15:23:47 +0000 (+0000) Subject: Upgrade to Unicode::Normalize 0.32 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=628bbff0c15c13c4704c1f63e1b8dac4c47eb639;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Unicode::Normalize 0.32 p4raw-id: //depot/perl@24166 --- diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes index bb1b693..9c0271b 100644 --- a/ext/Unicode/Normalize/Changes +++ b/ext/Unicode/Normalize/Changes @@ -1,8 +1,18 @@ Revision history for Perl extension Unicode::Normalize. +0.32 Tue Apr 5 22:47:09 2005 + - Some literal and grammatical errors in POD are fixed. + +0.31 Tue Apr 5 21:43:20 2005 + - CAVEATS in POD is added. + - Some test cases from Unicode Public Review Issue #29 + (Normalization Issue) are added to norm.t and test.t. + - do 'mkheader' returns true so that Makefile.PL will catch error. + - META.yml is added. + 0.30 Sun May 2 14:35:00 2004 - - XSUB: (perl 5.8.1 or later) improved utf8 upgrade of non-POK (private POK) - values like tied scalars, overloaded objects, etc. + - XSUB: (perl 5.8.1 or later) improved utf8 upgrade of non-POK + (private POK) values like tied scalars, overloaded objects, etc. 0.28 Sat Nov 22 23:46:24 2003 - XSUB: even if string contains a malformed, "short" Unicode character, diff --git a/ext/Unicode/Normalize/Makefile.PL b/ext/Unicode/Normalize/Makefile.PL index 2f37b62..ae920dc 100644 --- a/ext/Unicode/Normalize/Makefile.PL +++ b/ext/Unicode/Normalize/Makefile.PL @@ -1,3 +1,4 @@ +require 5.006001; use ExtUtils::MakeMaker; my $clean = {}; @@ -5,13 +6,25 @@ my $clean = {}; if (-f "Normalize.xs") { print STDERR "Making header files for XS...\n"; - do "mkheader"; + do 'mkheader' or die $@ || "mkheader: $!"; + $clean = { FILES => 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h' }; } WriteMakefile( - 'INSTALLDIRS' => $] >= 5.007 ? 'perl' : 'site', + 'INSTALLDIRS' => $] >= 5.007002 ? 
'perl' : 'site', 'NAME' => 'Unicode::Normalize', 'VERSION_FROM' => 'Normalize.pm', # finds $VERSION 'clean' => $clean, + 'PREREQ_PM' => { + Carp => 0, + constant => 0, + DynaLoader => 0, + Exporter => 0, + File::Copy => 0, + File::Spec => 0, + strict => 0, + Test => 0, + warnings => 0, + }, ); diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm index 09ef371..8f5f4cc 100644 --- a/ext/Unicode/Normalize/Normalize.pm +++ b/ext/Unicode/Normalize/Normalize.pm @@ -13,7 +13,7 @@ use Carp; no warnings 'utf8'; -our $VERSION = '0.30'; +our $VERSION = '0.32'; our $PACKAGE = __PACKAGE__; require Exporter; @@ -81,7 +81,7 @@ sub normalize($$) { my $form = shift; my $str = shift; - return exists $formNorm{$form} + return exists $formNorm{$form} ? $formNorm{$form}->($str) : croak $PACKAGE."::normalize: invalid form name: $form"; } @@ -103,7 +103,7 @@ sub check($$) { my $form = shift; my $str = shift; - return exists $formCheck{$form} + return exists $formCheck{$form} ? $formCheck{$form}->($str) : croak $PACKAGE."::check: invalid form name: $form"; } @@ -145,9 +145,9 @@ C<$string> is used as a string under character semantics C<$codepoint> should be an unsigned integer representing a Unicode code point. -Note: Between XS edition and pure Perl edition, -interpretation of C<$codepoint> as a decimal number has incompatibility. -XS converts C<$codepoint> to an unsigned integer, but pure Perl does not. +Note: Between XSUB and pure Perl, there is an incompatibility +about the interpretation of C<$codepoint> as a decimal number. +XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not. Do not use a floating point nor a negative sign in C<$codepoint>. =head2 Normalization Forms @@ -259,36 +259,48 @@ The result returned will be: =item C<$result = checkNFD($string)> -returns C (C<1>) or C (C). +returns true (C<1>) if C; false (C) if C. =item C<$result = checkNFC($string)> -returns C (C<1>), C (C), or C (C). 
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; +C<undef> if C<MAYBE>. =item C<$result = checkNFKD($string)> -returns C<YES> (C<1>) or C<NO> (C<empty string>). +returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. =item C<$result = checkNFKC($string)> -returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). +returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; +C<undef> if C<MAYBE>. =item C<$result = checkFCD($string)> -returns C<YES> (C<1>) or C<NO> (C<empty string>). +returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. =item C<$result = checkFCC($string)> -returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). +returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; +C<undef> if C<MAYBE>. If a string is not in FCD, it must not be in FCC. So C<checkFCC($not_FCD_string)> should return C<NO>. =item C<$result = check($form_name, $string)> -returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). +returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; +C<undef> if C<MAYBE>. -C<$form_name> is alike to that for C<normalize()>. +As C<$form_name>, one of the following names must be given. + + 'C' or 'NFC' for Normalization Form C (UAX #15) + 'D' or 'NFD' for Normalization Form D (UAX #15) + 'KC' or 'NFKC' for Normalization Form KC (UAX #15) + 'KD' or 'NFKD' for Normalization Form KD (UAX #15) + + 'FCD' for "Fast C or D" Form (UTN #5) + 'FCC' for "Fast C Contiguous" (UTN #5) =back @@ -308,12 +320,19 @@ C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. -If you want to check exactly, compare the string with its NFC/NFKC/FCC; -i.e., +If you want to check exactly, compare the string with its NFC/NFKC/FCC. + + if ($string eq NFC($string)) { + # $string is exactly normalized in NFC; + } else { + # $string is not normalized in NFC; + } - $string eq NFC($string) # thorough than checkNFC($string) - $string eq NFKC($string) # thorough than checkNFKC($string) - $string eq FCC($string) # thorough than checkFCC($string) + if ($string eq NFKC($string)) { + # $string is exactly normalized in NFKC; + } else { + # $string is not normalized in NFKC; + } =head2 Character Data @@ -376,22 +395,60 @@ Composition Exclusions and Non-Starter Decompositions). 
=back -=head2 EXPORT +=head1 EXPORT C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. C<normalize> and other some functions: on request. +=head1 CAVEATS + +=over 4 + +=item Perl's version vs. Unicode version + +Since this module refers to perl core's Unicode database in the directory +F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of +normalization implemented by this module depends on your perl's version. + + perl's version implemented Unicode version + 5.6.1 3.0.1 + 5.7.2 3.1.0 + 5.7.3 3.1.1 (same normalized form as that of 3.1.0) + 5.8.0 3.2.0 + 5.8.1-5.8.3 4.0.0 + 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0) + +=item Correction of decomposition mapping + +In older Unicode versions, a small number of characters (all of which are +CJK compatibility ideographs as far as they have been found) may have +an erroneous decomposition mapping (see F<NormalizationCorrections.txt>). +Anyhow, this module will neither refer to F<NormalizationCorrections.txt> +nor provide any specific version of normalization. Therefore this module +running on an older perl with an older Unicode database may use +the erroneous decomposition mapping blindly conforming to the Unicode database. + +=item Revised definition of canonical composition + +In Unicode 4.1.0, the definition D2 of canonical composition (which +affects NFC and NFKC) has been changed (see Public Review Issue #29 +and recent UAX #15). This module has used the newer definition +since the version 0.07 (Oct 31, 2001). +This module does not support normalization according to the older +definition, even if the Unicode version implemented by perl is +lower than 4.1.0. + +=back + =head1 AUTHOR SADAHIRO Tomoyuki - http://homepage1.nifty.com/nomenclator/perl/ - - Copyright(C) 2001-2004, SADAHIRO Tomoyuki. Japan. All rights reserved. +Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved. - This module is free software; you can redistribute it - and/or modify it under the same terms as Perl itself. 
+This module is free software; you can redistribute it +and/or modify it under the same terms as Perl itself. =head1 SEE ALSO @@ -405,6 +462,14 @@ Unicode Normalization Forms - UAX #15 Derived Normalization Properties +=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt + +Normalization Corrections + +=item http://www.unicode.org/review/pr-29.html + +Public Review Issue #29: Normalization Issue + =item http://www.unicode.org/notes/tn5/ Canonical Equivalence in Applications - UTN #5 @@ -412,4 +477,3 @@ Canonical Equivalence in Applications - UTN #5 =back =cut - diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index 13544c9..7398ce0 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -23,7 +23,7 @@ /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ #ifdef UTF8_ALLOW_BOM #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF) -#else +#else #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) #endif diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader index 4da4d07..ff30759 100644 --- a/ext/Unicode/Normalize/mkheader +++ b/ext/Unicode/Normalize/mkheader @@ -1,13 +1,22 @@ #!perl # -# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h", -# "unfcmp.h", and "unfexc.h" -# from CombiningClass.pl, Decomposition.pl, CompositionExclusions.txt -# in lib/unicore or unicode directory -# for Unicode::Normalize.xs. (cf. Makefile.PL) +# This auxiliary script makes five header files +# used for building XSUB of Unicode::Normalize. 
# -# Usage: in command line -# or in perl +# Usage: +# in perl, or in command line +# +# Input files: +# unicore/CombiningClass.pl (or unicode/CombiningClass.pl) +# unicore/Decomposition.pl (or unicode/Decomposition.pl) +# unicore/CompositionExclusions.txt (or unicode/CompExcl.txt) +# +# Output files: +# unfcan.h +# unfcpt.h +# unfcmb.h +# unfcmp.h +# unfexc.h # use 5.006; use strict; @@ -197,7 +206,7 @@ foreach my $key (keys %Canon) { } # exhaustive decomposition -foreach my $key (keys %Compat) { +foreach my $key (keys %Compat) { $Compat{$key} = [ getCompatList($key) ]; } @@ -387,4 +396,5 @@ EOF close FH; } +1; __END__ diff --git a/ext/Unicode/Normalize/t/form.t b/ext/Unicode/Normalize/t/form.t index 4e9b885..27cd177 100644 --- a/ext/Unicode/Normalize/t/form.t +++ b/ext/Unicode/Normalize/t/form.t @@ -9,8 +9,8 @@ BEGIN { BEGIN { if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); } } diff --git a/ext/Unicode/Normalize/t/norm.t b/ext/Unicode/Normalize/t/norm.t index 90c037a..a939907 100644 --- a/ext/Unicode/Normalize/t/norm.t +++ b/ext/Unicode/Normalize/t/norm.t @@ -19,7 +19,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 18 }; +BEGIN { plan tests => 29 }; use Unicode::Normalize qw(normalize); ok(1); # If we made it this far, we're ok. @@ -57,3 +57,19 @@ ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); +ok(hexNFC("AC00 11A7"), "AC00 11A7"); +ok(hexNFC("AC00 11A8"), "AC01"); +ok(hexNFC("AC00 11A9"), "AC02"); +ok(hexNFC("AC00 11C2"), "AC1B"); +ok(hexNFC("AC00 11C3"), "AC00 11C3"); + +# Test Cases from Public Review Issue #29: Normalization Issue +# cf. 
http://www.unicode.org/review/pr-29.html +ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E"); +ok(hexNFC("1100 0300 1161"), "1100 0300 1161"); + +ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300"); +ok(hexNFC("1100 1161 0300"), "AC00 0300"); + +ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327"); +ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327"); diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t index c5ebf3a..8e3369f 100644 --- a/ext/Unicode/Normalize/t/test.t +++ b/ext/Unicode/Normalize/t/test.t @@ -19,7 +19,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 20 }; +BEGIN { plan tests => 31 }; use Unicode::Normalize; ok(1); # If we made it this far, we're ok. @@ -57,11 +57,29 @@ ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); -# should be unary. +ok(hexNFC("AC00 11A7"), "AC00 11A7"); +ok(hexNFC("AC00 11A8"), "AC01"); +ok(hexNFC("AC00 11A9"), "AC02"); +ok(hexNFC("AC00 11C2"), "AC1B"); +ok(hexNFC("AC00 11C3"), "AC00 11C3"); + +# Test Cases from Public Review Issue #29: Normalization Issue +# cf. http://www.unicode.org/review/pr-29.html +ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E"); +ok(hexNFC("1100 0300 1161"), "1100 0300 1161"); + +ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300"); +ok(hexNFC("1100 1161 0300"), "AC00 0300"); + +ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327"); +ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327"); + +# NFC() should be unary. my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62); my $str12 = _pack_U(0x1EA4, 0x62); ok(NFC $str11 eq $str12); +# NFD() should be unary. my $str21 = _pack_U(0xE0, 0xAC00); my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161); ok(NFD $str21 eq $str22);