From: Steve Peters Date: Tue, 13 Jun 2006 01:00:02 +0000 (+0000) Subject: Upgrade to Unicode-Normalize-1.00 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=fe067ad959549a513d3f99948bd05deb85d6e222;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Unicode-Normalize-1.00 p4raw-id: //depot/perl@28389 --- diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes index 9c0271b..8d05a3d 100644 --- a/ext/Unicode/Normalize/Changes +++ b/ext/Unicode/Normalize/Changes @@ -1,5 +1,16 @@ Revision history for Perl extension Unicode::Normalize. +1.00 Thu May 25 20:35:06 2006 + - Pure Perl: compose($not_canonically_reordered) works like that in XSUB, + where an intervening character with higher combining class blocks + the composition. (This change doesn't affect any normalization forms.) + - XSUB: NFD(), NFC(), NFKD(), NFC(), and FCC() are now in XSUB, then + internal subroutine calls are avoided. + - The functions isComp_Ex(), isNFD_NO(), isNFC_NO(), isNFC_MAYBE(), + isNFKD_NO(), isNFKC_NO(), and isNFKC_MAYBE() are documented. + - Tests are more amplified and documentations are more clarified. + - Makefile.PL: Change 26295 is incorporated. + 0.32 Tue Apr 5 22:47:09 2005 - Some literal and grammatical errors in POD are fixed. @@ -62,13 +73,13 @@ Revision history for Perl extension Unicode::Normalize. 0.18 ... unreleased - synchronization with bleadperl. - - Change 16262: by me + - Change 16262: by sadahiro 0.17 Sun Apr 28 23:13:32 2002 - now normalize('NFC',$1) should work. - Some croak()'s are added in mkheader. - synchronization with bleadperl. - - Change 15596: by me + - Change 15596: by sadahiro - Change 16136: by pudge 0.16 Thu Mar 21 13:36:14 2002 @@ -85,7 +96,7 @@ Revision history for Perl extension Unicode::Normalize. - synchronization with bleadperl. - Change 14128: by Arthur - Change 14129: by jhi - - Change 14156: + - Change 14156: by sadahiro - Change 14199: by Nikola Knezevic - Change 14308: by Benjamin Goldberg - Change 14370: by jhi diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm index 8f5f4cc..16d7664 100644 --- a/ext/Unicode/Normalize/Normalize.pm +++ b/ext/Unicode/Normalize/Normalize.pm @@ -13,7 +13,7 @@ use Carp; no warnings 'utf8'; -our $VERSION = '0.32'; +our $VERSION = '1.00'; our $PACKAGE = __PACKAGE__; require Exporter; @@ -43,12 +43,16 @@ bootstrap Unicode::Normalize $VERSION; ###### +## +## utilites for tests +## + sub pack_U { return pack('U*', @_); } sub unpack_U { - return unpack('U*', pack('U*').shift); + return unpack('U*', shift(@_).pack('U*')); } @@ -56,18 +60,10 @@ sub unpack_U { ## normalization forms ## -use constant COMPAT => 1; - -sub NFD ($) { reorder(decompose($_[0])) } -sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } -sub NFC ($) { compose(reorder(decompose($_[0]))) } -sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } - sub FCD ($) { my $str = shift; return checkFCD($str) ? $str : NFD($str); } -sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) } our %formNorm = ( NFC => \&NFC, C => \&NFC, @@ -81,9 +77,10 @@ sub normalize($$) { my $form = shift; my $str = shift; - return exists $formNorm{$form} - ? $formNorm{$form}->($str) - : croak $PACKAGE."::normalize: invalid form name: $form"; + if (exists $formNorm{$form}) { + return $formNorm{$form}->($str); + } + croak($PACKAGE."::normalize: invalid form name: $form"); } @@ -103,9 +100,10 @@ sub check($$) { my $form = shift; my $str = shift; - return exists $formCheck{$form} - ? $formCheck{$form}->($str) - : croak $PACKAGE."::check: invalid form name: $form"; + if (exists $formCheck{$form}) { + return $formCheck{$form}->($str); + } + croak($PACKAGE."::check: invalid form name: $form"); } 1; @@ -139,16 +137,14 @@ Unicode::Normalize - Unicode Normalization Forms Parameters: -C<$string> is used as a string under character semantics -(see F). +C<$string> is used as a string under character semantics (see F). -C<$codepoint> should be an unsigned integer -representing a Unicode code point. +C<$code_point> should be an unsigned integer representing a Unicode code point. Note: Between XSUB and pure Perl, there is an incompatibility -about the interpretation of C<$codepoint> as a decimal number. -XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not. -Do not use a floating point nor a negative sign in C<$codepoint>. +about the interpretation of C<$code_point> as a decimal number. +XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not. +Do not use a floating point nor a negative sign in C<$code_point>. =head2 Normalization Forms @@ -156,38 +152,40 @@ Do not use a floating point nor a negative sign in C<$codepoint>. =item C<$NFD_string = NFD($string)> -returns the Normalization Form D (formed by canonical decomposition). +It returns the Normalization Form D (formed by canonical decomposition). =item C<$NFC_string = NFC($string)> -returns the Normalization Form C (formed by canonical decomposition +It returns the Normalization Form C (formed by canonical decomposition followed by canonical composition). =item C<$NFKD_string = NFKD($string)> -returns the Normalization Form KD (formed by compatibility decomposition). +It returns the Normalization Form KD (formed by compatibility decomposition). =item C<$NFKC_string = NFKC($string)> -returns the Normalization Form KC (formed by compatibility decomposition +It returns the Normalization Form KC (formed by compatibility decomposition followed by B composition). =item C<$FCD_string = FCD($string)> If the given string is in FCD ("Fast C or D" form; cf. UTN #5), -returns it without modification; otherwise returns an FCD string. +it returns the string without modification; otherwise it returns an FCD string. Note: FCD is not always unique, then plural forms may be equivalent each other. C will return one of these equivalent forms. =item C<$FCC_string = FCC($string)> -returns the FCC form ("Fast C Contiguous"; cf. UTN #5). +It returns the FCC form ("Fast C Contiguous"; cf. UTN #5). Note: FCC is unique, as well as four normalization forms (NF*). =item C<$normalized_string = normalize($form_name, $string)> +It returns the normalization form of C<$form_name>. + As C<$form_name>, one of the following names must be given. 'C' or 'NFC' for Normalization Form C (UAX #15) @@ -204,39 +202,39 @@ As C<$form_name>, one of the following names must be given. =over 4 -=item C<$decomposed_string = decompose($string)> +=item C<$decomposed_string = decompose($string [, $useCompatMapping])> -=item C<$decomposed_string = decompose($string, $useCompatMapping)> +It returns the concatenation of the decomposition of each character +in the string. -Decomposes the specified string and returns the result. +If the second parameter (a boolean) is omitted or false, +the decomposition is canonical decomposition; +if the second parameter (a boolean) is true, +the decomposition is compatibility decomposition. -If the second parameter (a boolean) is omitted or false, decomposes it -using the Canonical Decomposition Mapping. -If true, decomposes it using the Compatibility Decomposition Mapping. - -The string returned is not always in NFD/NFKD. -Reordering may be required. +The string returned is not always in NFD/NFKD. Reordering may be required. $NFD_string = reorder(decompose($string)); # eq. to NFD() $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() -=item C<$reordered_string = reorder($string)> +=item C<$reordered_string = reorder($string)> -Reorders the combining characters and the like in the canonical ordering -and returns the result. +It returns the result of reordering the combining characters +according to Canonical Ordering Behavior. -E.g., when you have a list of NFD/NFKD strings, -you can get the concatenated NFD/NFKD string from them, saying +For example, when you have a list of NFD/NFKD strings, +you can get the concatenated NFD/NFKD string from them, by saying $concat_NFD = reorder(join '', @NFD_strings); $concat_NFKD = reorder(join '', @NFKD_strings); -=item C<$composed_string = compose($string)> +=item C<$composed_string = compose($string)> -Returns the string where composable pairs are composed. +It returns the result of canonical composition +without applying any decomposition. -E.g., when you have a NFD/NFKD string, -you can get its NFC/NFKC string, saying +For example, when you have a NFD/NFKD string, +you can get its NFC/NFKC string, by saying $NFC_string = compose($NFD_string); $NFKC_string = compose($NFKD_string); @@ -249,7 +247,7 @@ you can get its NFC/NFKC string, saying The following functions check whether the string is in that normalization form. -The result returned will be: +The result returned will be one of the following: YES The string is in that normalization form. NO The string is not in that normalization form. @@ -259,37 +257,37 @@ The result returned will be: =item C<$result = checkNFD($string)> -returns true (C<1>) if C; false (C) if C. +It returns true (C<1>) if C; false (C) if C. =item C<$result = checkNFC($string)> -returns true (C<1>) if C; false (C) if C; +It returns true (C<1>) if C; false (C) if C; C if C. =item C<$result = checkNFKD($string)> -returns true (C<1>) if C; false (C) if C. +It returns true (C<1>) if C; false (C) if C. =item C<$result = checkNFKC($string)> -returns true (C<1>) if C; false (C) if C; +It returns true (C<1>) if C; false (C) if C; C if C. =item C<$result = checkFCD($string)> -returns true (C<1>) if C; false (C) if C. +It returns true (C<1>) if C; false (C) if C. =item C<$result = checkFCC($string)> -returns true (C<1>) if C; false (C) if C; +It returns true (C<1>) if C; false (C) if C; C if C. -If a string is not in FCD, it must not be in FCC. +Note: If a string is not in FCD, it must not be in FCC. So C should return C. =item C<$result = check($form_name, $string)> -returns true (C<1>) if C; false (C) if C; +It returns true (C<1>) if C; false (C) if C; C if C. As C<$form_name>, one of the following names must be given. @@ -342,56 +340,92 @@ call them yourself. =over 4 -=item C<$canonical_decomposed = getCanon($codepoint)> +=item C<$canonical_decomposition = getCanon($code_point)> -If the character of the specified codepoint is canonically -decomposable (including Hangul Syllables), -returns the B string canonically equivalent to it. +If the character is canonically decomposable (including Hangul Syllables), +it returns the (full) canonical decomposition as a string. +Otherwise it returns C. -If it is not decomposable, returns C. +B According to the Unicode standard, the canonical decomposition +of the character that is not canonically decomposable is same as +the character itself. -=item C<$compatibility_decomposed = getCompat($codepoint)> +=item C<$compatibility_decomposition = getCompat($code_point)> -If the character of the specified codepoint is compatibility -decomposable (including Hangul Syllables), -returns the B string compatibility equivalent to it. +If the character is compatibility decomposable (including Hangul Syllables), +it returns the (full) compatibility decomposition as a string. +Otherwise it returns C. -If it is not decomposable, returns C. +B According to the Unicode standard, the compatibility decomposition +of the character that is not compatibility decomposable is same as +the character itself. -=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)> +=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)> -If two characters here and next (as codepoints) are composable +If two characters here and next (as code points) are composable (including Hangul Jamo/Syllables and Composition Exclusions), -returns the codepoint of the composite. +it returns the code point of the composite. + +If they are not composable, it returns C. -If they are not composable, returns C. +=item C<$combining_class = getCombinClass($code_point)> -=item C<$combining_class = getCombinClass($codepoint)> +It returns the combining class (as an integer) of the character. -Returns the combining class of the character as an integer. +=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)> -=item C<$is_exclusion = isExclusion($codepoint)> +It returns a boolean whether the character of the specified codepoint +may be composed with the previous one in a certain composition +(including Hangul Compositions, but excluding +Composition Exclusions and Non-Starter Decompositions). -Returns a boolean whether the character of the specified codepoint -is a composition exclusion. +=item C<$is_exclusion = isExclusion($code_point)> -=item C<$is_singleton = isSingleton($codepoint)> +It returns a boolean whether the code point is a composition exclusion. -Returns a boolean whether the character of the specified codepoint is -a singleton. +=item C<$is_singleton = isSingleton($code_point)> -=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)> +It returns a boolean whether the code point is a singleton -Returns a boolean whether the canonical decomposition -of the character of the specified codepoint -is a Non-Starter Decomposition. +=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)> -=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)> +It returns a boolean whether the code point has Non-Starter Decomposition. -Returns a boolean whether the character of the specified codepoint -may be composed with the previous one in a certain composition -(including Hangul Compositions, but excluding -Composition Exclusions and Non-Starter Decompositions). +=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)> + +It returns a boolean of the derived property Comp_Ex +(Full_Composition_Exclusion). This property is generated from +Composition Exclusions + Singletons + Non-Starter Decompositions. + +=item C<$NFD_is_NO = isNFD_NO($code_point)> + +It returns a boolean of the derived property NFD_NO +(NFD_Quick_Check=No). + +=item C<$NFC_is_NO = isNFC_NO($code_point)> + +It returns a boolean of the derived property NFC_NO +(NFC_Quick_Check=No). + +=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)> + +It returns a boolean of the derived property NFC_MAYBE +(NFC_Quick_Check=Maybe). + +=item C<$NFKD_is_NO = isNFKD_NO($code_point)> + +It returns a boolean of the derived property NFKD_NO +(NFKD_Quick_Check=No). + +=item C<$NFKC_is_NO = isNFKC_NO($code_point)> + +It returns a boolean of the derived property NFKC_NO +(NFKC_Quick_Check=No). + +=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)> + +It returns a boolean of the derived property NFKC_MAYBE +(NFKC_Quick_Check=Maybe). =back @@ -411,13 +445,14 @@ Since this module refers to perl core's Unicode database in the directory F (or formerly F), the Unicode version of normalization implemented by this module depends on your perl's version. - perl's version implemented Unicode version - 5.6.1 3.0.1 - 5.7.2 3.1.0 - 5.7.3 3.1.1 (same normalized form as that of 3.1.0) - 5.8.0 3.2.0 - 5.8.1-5.8.3 4.0.0 - 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0) + perl's version implemented Unicode version + 5.6.1 3.0.1 + 5.7.2 3.1.0 + 5.7.3 3.1.1 (normalization is same as 3.1.0) + 5.8.0 3.2.0 + 5.8.1-5.8.3 4.0.0 + 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0) + 5.8.7-5.8.8 4.1.0 =item Correction of decomposition mapping @@ -445,7 +480,7 @@ lower than 4.1.0. SADAHIRO Tomoyuki -Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved. +Copyright(C) 2001-2006, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. @@ -458,6 +493,10 @@ and/or modify it under the same terms as Perl itself. Unicode Normalization Forms - UAX #15 +=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt + +Composition Exclusion Table + =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt Derived Normalization Properties diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index 7398ce0..e48ead9 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -28,16 +28,29 @@ #endif /* if utf8n_to_uvuni() sets retlen to 0 (?) */ -#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character" +#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" /* utf8_hop() hops back before start. Maybe broken UTF-8 */ #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" +/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC; + according to Versioning and Stability in UAX#15, no new composition + should come in future. */ +#define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source" + +/* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */ +#define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough" + /* At present, char > 0x10ffff are unaffected without complaint, right? */ #define VALID_UTF_MAX (0x10ffff) #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) -/* HANGUL_H */ +/* size of array for combining characters */ +/* enough as an initial value? */ +#define CC_SEQ_SIZE (10) +#define CC_SEQ_STEP (5) + +/* HANGUL begin */ #define Hangul_SBase 0xAC00 #define Hangul_SFinal 0xD7A3 #define Hangul_SCount 11172 @@ -62,7 +75,7 @@ #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) -/* HANGUL_H */ +/* HANGUL end */ /* this is used for canonical ordering of combining characters (c.c.). */ typedef struct { @@ -71,7 +84,7 @@ typedef struct { STRLEN pos; /* position */ } UNF_cc; -static int compare_cc (const void *a, const void *b) +static int compare_cc(const void *a, const void *b) { int ret_cc; ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; @@ -82,7 +95,7 @@ static int compare_cc (const void *a, const void *b) - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); } -static U8* dec_canonical (UV uv) +static U8* dec_canonical(UV uv) { U8 ***plane, **row; if (OVER_UTF_MAX(uv)) @@ -94,7 +107,7 @@ static U8* dec_canonical (UV uv) return row ? row[uv & 0xff] : NULL; } -static U8* dec_compat (UV uv) +static U8* dec_compat(UV uv) { U8 ***plane, **row; if (OVER_UTF_MAX(uv)) @@ -106,21 +119,22 @@ static U8* dec_compat (UV uv) return row ? row[uv & 0xff] : NULL; } -static UV composite_uv (UV uv, UV uv2) +static UV composite_uv(UV uv, UV uv2) { UNF_complist ***plane, **row, *cell, *i; - if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) + if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0; if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { - uv -= Hangul_LBase; /* lindex */ - uv2 -= Hangul_VBase; /* vindex */ - return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount); + UV lindex = uv - Hangul_LBase; + UV vindex = uv2 - Hangul_VBase; + return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * + Hangul_TCount); } if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { - uv2 -= Hangul_TBase; /* tindex */ - return(uv + uv2); + UV tindex = uv2 - Hangul_TBase; + return(uv + tindex); } plane = UNF_compos[uv >> 16]; if (! plane) @@ -138,7 +152,7 @@ static UV composite_uv (UV uv, UV uv2) return 0; } -static U8 getCombinClass (UV uv) +static U8 getCombinClass(UV uv) { U8 **plane, *row; if (OVER_UTF_MAX(uv)) @@ -150,36 +164,21 @@ static U8 getCombinClass (UV uv) return row ? row[uv & 0xff] : 0; } -static void sv_cat_decompHangul (SV* sv, UV uv) +static U8* pv_cat_decompHangul(U8* d, UV uv) { - UV sindex, lindex, vindex, tindex; - U8 *t, tmp[3 * UTF8_MAXLEN + 1]; + UV sindex = uv - Hangul_SBase; + UV lindex = sindex / Hangul_NCount; + UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; + UV tindex = sindex % Hangul_TCount; if (! Hangul_IsS(uv)) - return; + return d; - sindex = uv - Hangul_SBase; - lindex = sindex / Hangul_NCount; - vindex = (sindex % Hangul_NCount) / Hangul_TCount; - tindex = sindex % Hangul_TCount; - - t = tmp; - t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); - t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); + d = uvuni_to_utf8(d, (lindex + Hangul_LBase)); + d = uvuni_to_utf8(d, (vindex + Hangul_VBase)); if (tindex) - t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); - *t = '\0'; - sv_catpvn(sv, (char *)tmp, t - tmp); -} - -static void sv_cat_uvuni (SV* sv, UV uv) -{ - U8 *t, tmp[UTF8_MAXLEN + 1]; - - t = tmp; - t = uvuni_to_utf8(t, uv); - *t = '\0'; - sv_catpvn(sv, (char *)tmp, t - tmp); + d = uvuni_to_utf8(d, (tindex + Hangul_TBase)); + return d; } static char * sv_2pvunicode(SV *sv, STRLEN *lp) @@ -194,140 +193,305 @@ static char * sv_2pvunicode(SV *sv, STRLEN *lp) sv_utf8_upgrade(tmpsv); s = (char*)SvPV(tmpsv,len); } - *lp = len; + if (lp) + *lp = len; return s; } -MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize - -SV* -decompose(src, compat = &PL_sv_no) - SV * src - SV * compat - PROTOTYPE: $;$ - PREINIT: - SV *dst; - STRLEN srclen, retlen; - U8 *s, *e, *p, *r; - UV uv; - bool iscompat; - CODE: - iscompat = SvTRUE(compat); - s = (U8*)sv_2pvunicode(src,&srclen); - e = s + srclen; - - dst = newSV(1); - (void)SvPOK_only(dst); - SvUTF8_on(dst); - - for (p = s; p < e; p += retlen) { - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); +static +U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) +{ + U8* p = s; + U8* e = s + slen; + U8* dstart = *dp; + U8* d = dstart; + + while (p < e) { + STRLEN retlen; + UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); if (!retlen) - croak(ErrRetlenIsZero); + croak(ErrRetlenIsZero, "decompose"); + p += retlen; + + if (Hangul_IsS(uv)) { + STRLEN cur = d - dstart; - if (Hangul_IsS(uv)) - sv_cat_decompHangul(dst, uv); + if (dlen < cur + UTF8_MAXLEN * 3) { + dlen += UTF8_MAXLEN * 3; + Renew(dstart, dlen+1, U8); + d = dstart + cur; + } + d = pv_cat_decompHangul(d, uv); + } else { - r = iscompat ? dec_compat(uv) : dec_canonical(uv); - if (r) - sv_catpv(dst, (char *)r); - else - sv_cat_uvuni(dst, uv); + U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); + + if (r) { + STRLEN len = (STRLEN)strlen((char *)r); + STRLEN cur = d - dstart; + if (dlen < cur + len) { + dlen += len; + Renew(dstart, dlen+1, U8); + d = dstart + cur; + } + while (len--) + *d++ = *r++; + } + else { + STRLEN cur = d - dstart; + + if (dlen < cur + UTF8_MAXLEN) { + dlen += UTF8_MAXLEN; + Renew(dstart, dlen+1, U8); + d = dstart + cur; + } + d = uvuni_to_utf8(d, uv); + } } } - RETVAL = dst; - OUTPUT: - RETVAL + *dp = dstart; + return d; +} +static +U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen) +{ + U8* p = s; + U8* e = s + slen; + U8* dend = d + dlen; + + UNF_cc seq_ary[CC_SEQ_SIZE]; + UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ + UNF_cc* seq_ext = NULL; /* extend if need */ + STRLEN seq_max = CC_SEQ_SIZE; + STRLEN cc_pos = 0; + + if (dlen < slen || dlen < slen + UTF8_MAXLEN) + croak(ErrTargetNotEnough, "reorder"); + dend -= UTF8_MAXLEN; /* safety */ + + while (p < e) { + U8 curCC; + STRLEN retlen; + UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero, "reorder"); + p += retlen; + curCC = getCombinClass(uv); -SV* -reorder(src) - SV * src - PROTOTYPE: $ - PREINIT: - SV *dst; - STRLEN srclen, dstlen, retlen, stk_cc_max; - U8 *s, *e, *p, *d, curCC; - UV uv, uvlast; - UNF_cc * stk_cc; - STRLEN i, cc_pos; - bool valid_uvlast; - CODE: - s = (U8*)sv_2pvunicode(src,&srclen); - e = s + srclen; + if (curCC != 0) { + if (seq_max < cc_pos + 1) { /* extend if need */ + seq_max = cc_pos + CC_SEQ_STEP; /* new size */ + if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ + STRLEN i; + New(0, seq_ext, seq_max, UNF_cc); + for (i = 0; i < cc_pos; i++) + seq_ext[i] = seq_ary[i]; + } + else { + Renew(seq_ext, seq_max, UNF_cc); + } + seq_ptr = seq_ext; /* till now use seq_ext */ + } - dstlen = srclen + 1; - dst = newSV(dstlen); - (void)SvPOK_only(dst); - SvUTF8_on(dst); - d = (U8*)SvPVX(dst); + seq_ptr[cc_pos].cc = curCC; + seq_ptr[cc_pos].uv = uv; + seq_ptr[cc_pos].pos = cc_pos; + ++cc_pos; - stk_cc_max = 10; /* enough as an initial value? */ - New(0, stk_cc, stk_cc_max, UNF_cc); + if (p < e) + continue; + } - for (p = s; p < e;) { - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero); - p += retlen; + if (cc_pos) { + STRLEN i; + + if (cc_pos > 1) /* reordered if there are two c.c.'s */ + qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); + + for (i = 0; i < cc_pos; i++) { + d = uvuni_to_utf8(d, seq_ptr[i].uv); + if (dend < d) /* real end is dend + UTF8_MAXLEN */ + croak(ErrLongerThanSrc, "reorder"); + } + cc_pos = 0; + } - curCC = getCombinClass(uv); if (curCC == 0) { d = uvuni_to_utf8(d, uv); - continue; + if (dend < d) /* real end is dend + UTF8_MAXLEN */ + croak(ErrLongerThanSrc, "reorder"); } + } + if (seq_ext) + Safefree(seq_ext); + return d; +} - cc_pos = 0; - stk_cc[cc_pos].cc = curCC; - stk_cc[cc_pos].uv = uv; - stk_cc[cc_pos].pos = cc_pos; +static +U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) +{ + U8* p = s; + U8* e = s + slen; + U8* dend = d + dlen; + + UV uvS; /* code point of the starter */ + bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ + U8 preCC = 0; + + UV seq_ary[CC_SEQ_SIZE]; + UV* seq_ptr = seq_ary; /* use array at the beginning */ + UV* seq_ext = NULL; /* extend if need */ + STRLEN seq_max = CC_SEQ_SIZE; + STRLEN cc_pos = 0; + + if (dlen < slen || dlen < slen + UTF8_MAXLEN) + croak(ErrTargetNotEnough, "compose"); + dend -= UTF8_MAXLEN; /* safety */ + + while (p < e) { + U8 curCC; + STRLEN retlen; + UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero, "compose"); + p += retlen; - valid_uvlast = FALSE; - while (p < e) { - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero); - p += retlen; + curCC = getCombinClass(uv); - curCC = getCombinClass(uv); + if (!valid_uvS) { if (curCC == 0) { - uvlast = uv; - valid_uvlast = TRUE; - break; + uvS = uv; /* the first Starter is found */ + valid_uvS = TRUE; + if (p < e) + continue; } + else { + d = uvuni_to_utf8(d, uv); + if (dend < d) /* real end is dend + UTF8_MAXLEN */ + croak(ErrLongerThanSrc, "compose"); + continue; + } + } + else { + bool composed; + + /* blocked */ + if (iscontig && cc_pos || /* discontiguous combination */ + curCC != 0 && preCC == curCC || /* blocked by same CC */ + preCC > curCC) /* blocked by higher CC: revised D2 */ + composed = FALSE; + + /* not blocked: + iscontig && cc_pos == 0 -- contiguous combination + curCC == 0 && preCC == 0 -- starter + starter + curCC != 0 && preCC < curCC -- lower CC */ + else { + /* try composition */ + UV uvComp = composite_uv(uvS, uv); + + if (uvComp && !isExclusion(uvComp)) { + uvS = uvComp; + composed = TRUE; - cc_pos++; - if (stk_cc_max <= cc_pos) { /* extend if need */ - stk_cc_max = cc_pos + 1; - Renew(stk_cc, stk_cc_max, UNF_cc); + /* preCC should not be changed to curCC */ + /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ + if (p < e) + continue; + } + else + composed = FALSE; + } + + if (!composed) { + preCC = curCC; + if (curCC != 0 || !(p < e)) { + if (seq_max < cc_pos + 1) { /* extend if need */ + seq_max = cc_pos + CC_SEQ_STEP; /* new size */ + if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ + New(0, seq_ext, seq_max, UV); + Copy(seq_ary, seq_ext, cc_pos, UV); + } + else { + Renew(seq_ext, seq_max, UV); + } + seq_ptr = seq_ext; /* till now use seq_ext */ + } + seq_ptr[cc_pos] = uv; + ++cc_pos; + } + if (curCC != 0 && p < e) + continue; } - stk_cc[cc_pos].cc = curCC; - stk_cc[cc_pos].uv = uv; - stk_cc[cc_pos].pos = cc_pos; } - /* reordered if there are two c.c.'s */ + d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ + if (dend < d) /* real end is dend + UTF8_MAXLEN */ + croak(ErrLongerThanSrc, "compose"); + if (cc_pos) { - qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc); - } + STRLEN i; - for (i = 0; i <= cc_pos; i++) { - d = uvuni_to_utf8(d, stk_cc[i].uv); - } - if (valid_uvlast) - { - d = uvuni_to_utf8(d, uvlast); + for (i = 0; i < cc_pos; i++) { + d = uvuni_to_utf8(d, seq_ptr[i]); + if (dend < d) /* real end is dend + UTF8_MAXLEN */ + croak(ErrLongerThanSrc, "compose"); + } + cc_pos = 0; } + + uvS = uv; } - *d = '\0'; - SvCUR_set(dst, d - (U8*)SvPVX(dst)); - Safefree(stk_cc); + if (seq_ext) + Safefree(seq_ext); + return d; +} + +MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize + +SV* +decompose(src, compat = &PL_sv_no) + SV * src + SV * compat + PROTOTYPE: $;$ + PREINIT: + SV* dst; + U8 *s, *d, *dend; + STRLEN slen, dlen; + CODE: + s = (U8*)sv_2pvunicode(src,&slen); + dst = newSVpvn("", 0); + dlen = slen; + New(0, d, dlen+1, U8); + dend = pv_utf8_decompose(s, slen, &d, dlen, SvTRUE(compat)); + sv_setpvn(dst, d, dend - d); + SvUTF8_on(dst); + Safefree(d); RETVAL = dst; OUTPUT: RETVAL - +SV* +reorder(src) + SV * src + PROTOTYPE: $ + PREINIT: + SV* dst; + U8 *s, *d, *dend; + STRLEN slen, dlen; + CODE: + s = (U8*)sv_2pvunicode(src,&slen); + dst = newSVpvn("", 0); + dlen = slen + UTF8_MAXLEN; + d = (U8*)SvGROW(dst,dlen+1); + SvUTF8_on(dst); + dend = pv_utf8_reorder(s, slen, d, dlen); + *dend = '\0'; + SvCUR_set(dst, dend - d); + RETVAL = dst; + OUTPUT: + RETVAL SV* compose(src) @@ -336,96 +500,99 @@ compose(src) ALIAS: composeContiguous = 1 PREINIT: - SV *dst, *tmp; - U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; - UV uv, uvS, uvComp; - STRLEN srclen, dstlen, tmplen, retlen; - bool beginning = TRUE; + SV* dst; + U8 *s, *d, *dend; + STRLEN slen, dlen; CODE: - s = (U8*)sv_2pvunicode(src,&srclen); - e = s + srclen; - - dstlen = srclen + 1; - dst = newSV(dstlen); - (void)SvPOK_only(dst); + s = (U8*)sv_2pvunicode(src,&slen); + dst = newSVpvn("", 0); + dlen = slen + UTF8_MAXLEN; + d = (U8*)SvGROW(dst,dlen+1); SvUTF8_on(dst); - d = (U8*)SvPVX(dst); - - /* for uncomposed combining char */ - tmp = sv_2mortal(newSV(dstlen)); - (void)SvPOK_only(tmp); - SvUTF8_on(tmp); - - for (p = s; p < e;) { - if (beginning) { - uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero); - p += retlen; - - if (getCombinClass(uvS)) { /* no Starter found yet */ - d = uvuni_to_utf8(d, uvS); - continue; - } - beginning = FALSE; - } + dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix); + *dend = '\0'; + SvCUR_set(dst, dend - d); + RETVAL = dst; + OUTPUT: + RETVAL - /* Starter */ - t = tmp_start = (U8*)SvPVX(tmp); - preCC = 0; +SV* +NFD(src) + SV * src + PROTOTYPE: $ + ALIAS: + NFKD = 1 + PREINIT: + SV *dst; + U8 *s, *t, *tend, *d, *dend; + STRLEN slen, tlen, dlen; + CODE: + /* decompose */ + s = (U8*)sv_2pvunicode(src,&slen); + tlen = slen; + New(0, t, tlen+1, U8); + tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix); + *tend = '\0'; + tlen = tend - t; /* no longer know real tlen */ + + /* reorder */ + dst = newSVpvn("", 0); + dlen = tlen + UTF8_MAXLEN; + d = (U8*)SvGROW(dst,dlen+1); + SvUTF8_on(dst); + dend = pv_utf8_reorder(t, tlen, d, dlen); + *dend = '\0'; + SvCUR_set(dst, dend - d); - /* to the next Starter */ - while (p < e) { - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero); - p += retlen; + /* return */ + Safefree(t); + RETVAL = dst; + OUTPUT: + RETVAL - curCC = getCombinClass(uv); +SV* +NFC(src) + SV * src + PROTOTYPE: $ + ALIAS: + NFKC = 1 + FCC = 2 + PREINIT: + SV *dst; + U8 *s, *t, *tend, *u, *uend, *d, *dend; + STRLEN slen, tlen, ulen, dlen; + CODE: + /* decompose */ + s = (U8*)sv_2pvunicode(src,&slen); + tlen = slen; + New(0, t, tlen+1, U8); + tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1)); + *tend = '\0'; + tlen = tend - t; /* no longer know real tlen */ + + /* reorder */ + ulen = tlen + UTF8_MAXLEN; + New(0, u, ulen+1, U8); + uend = pv_utf8_reorder(t, tlen, u, ulen); + *uend = '\0'; + ulen = uend - u; + + /* compose */ + dst = newSVpvn("", 0); + dlen = ulen + UTF8_MAXLEN; + d = (U8*)SvGROW(dst,dlen+1); + SvUTF8_on(dst); + dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2)); + *dend = '\0'; + SvCUR_set(dst, dend - d); - if (preCC && preCC == curCC) { - preCC = curCC; - t = uvuni_to_utf8(t, uv); - } else { - uvComp = composite_uv(uvS, uv); - - if (uvComp && ! isExclusion(uvComp) && - (ix ? (t == tmp_start) : (preCC <= curCC))) { - STRLEN leftcur, rightcur, dstcur; - leftcur = UNISKIP(uvComp); - rightcur = UNISKIP(uvS) + UNISKIP(uv); - - if (leftcur > rightcur) { - dstcur = d - (U8*)SvPVX(dst); - dstlen += leftcur - rightcur; - d = (U8*)SvGROW(dst,dstlen) + dstcur; - } - /* preCC not changed to curCC */ - uvS = uvComp; - } else if (! curCC && p < e) { /* blocked */ - break; - } else { - preCC = curCC; - t = uvuni_to_utf8(t, uv); - } - } - } - d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ - tmplen = t - tmp_start; - if (tmplen) { /* uncomposed combining char */ - t = (U8*)SvPVX(tmp); - while (tmplen--) - *d++ = *t++; - } - uvS = uv; - } /* for */ - *d = '\0'; - SvCUR_set(dst, d - (U8*)SvPVX(dst)); + /* return */ + Safefree(t); + Safefree(u); RETVAL = dst; OUTPUT: RETVAL - void checkNFD(src) SV * src @@ -435,16 +602,15 @@ checkNFD(src) PREINIT: STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; - UV uv; CODE: s = (U8*)sv_2pvunicode(src,&srclen); e = s + srclen; preCC = 0; for (p = s; p < e; p += retlen) { - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); + UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); if (!retlen) - croak(ErrRetlenIsZero); + croak(ErrRetlenIsZero, "checkNFD or -NFKD"); curCC = getCombinClass(uv); if (preCC > curCC && curCC != 0) /* canonical ordering violated */ @@ -466,7 +632,6 @@ checkNFC(src) PREINIT: STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; - UV uv; bool isMAYBE; CODE: s = (U8*)sv_2pvunicode(src,&srclen); @@ -475,12 +640,11 @@ checkNFC(src) preCC = 0; isMAYBE = FALSE; for (p = s; p < e; p += retlen) { - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); + UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); if (!retlen) - croak(ErrRetlenIsZero); + croak(ErrRetlenIsZero, "checkNFC or -NFKC"); curCC = getCombinClass(uv); - if (preCC > curCC && curCC != 0) /* canonical ordering violated */ XSRETURN_NO; @@ -516,27 +680,29 @@ checkFCD(src) ALIAS: checkFCC = 1 PREINIT: - STRLEN srclen, retlen, canlen, canret; + STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; - UV uv, uvLead, uvTrail; - U8 *sCan, *pCan, *eCan; bool isMAYBE; CODE: s = (U8*)sv_2pvunicode(src,&srclen); e = s + srclen; - preCC = 0; isMAYBE = FALSE; for (p = s; p < e; p += retlen) { - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); + U8 *sCan; + UV uvLead; + STRLEN canlen, canret; + UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); if (!retlen) - croak(ErrRetlenIsZero); + croak(ErrRetlenIsZero, "checkFCD or -FCC"); sCan = (U8*) dec_canonical(uv); if (sCan) { canlen = (STRLEN)strlen((char *) sCan); uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF); + if (!canret) + croak(ErrRetlenIsZero, "checkFCD or -FCC"); } else { uvLead = uv; @@ -555,11 +721,14 @@ checkFCD(src) } if (sCan) { - eCan = sCan + canlen; - pCan = utf8_hop(eCan, -1); + UV uvTrail; + U8* eCan = sCan + canlen; + U8* pCan = utf8_hop(eCan, -1); if (pCan < sCan) croak(ErrHopBeforeStart); uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF); + if (!canret) + croak(ErrRetlenIsZero, "checkFCD or -FCC"); preCC = getCombinClass(uvTrail); } else { @@ -662,17 +831,14 @@ getCanon(uv) PROTOTYPE: $ ALIAS: getCompat = 1 - PREINIT: - U8 * rstr; CODE: if (Hangul_IsS(uv)) { - SV * dst; - dst = newSV(1); - (void)SvPOK_only(dst); - sv_cat_decompHangul(dst, uv); - RETVAL = dst; + U8 tmp[3 * UTF8_MAXLEN + 1]; + U8 *t = tmp; + U8 *e = pv_cat_decompHangul(t, uv); + RETVAL = newSVpvn((char *)t, e - t); } else { - rstr = ix ? dec_compat(uv) : dec_canonical(uv); + U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); if (!rstr) XSRETURN_UNDEF; RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); @@ -687,18 +853,18 @@ splitOnLastStarter(src) SV * src PREINIT: SV *svp; - STRLEN srclen, retlen; + STRLEN srclen; U8 *s, *e, *p; - UV uv; PPCODE: s = (U8*)sv_2pvunicode(src,&srclen); e = s + srclen; - - for (p = e; s < p; ) { + p = e; + while (s < p) { + UV uv; p = utf8_hop(p, -1); if (p < s) croak(ErrHopBeforeStart); - uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); + uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF); if (getCombinClass(uv) == 0) /* Last Starter found */ break; } diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README index 34e24e3..e70d7ea 100644 --- a/ext/Unicode/Normalize/README +++ b/ext/Unicode/Normalize/README @@ -1,4 +1,4 @@ -Unicode/Normalize version 0.28 +Unicode/Normalize version 1.00 =================================== Unicode::Normalize - Unicode Normalization Forms @@ -90,7 +90,7 @@ COPYRIGHT AND LICENCE http://homepage1.nifty.com/nomenclator/perl/ - Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved. + Copyright(C) 2001-2006, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader index ff30759..165c1c4 100644 --- a/ext/Unicode/Normalize/mkheader +++ b/ext/Unicode/Normalize/mkheader @@ -363,7 +363,7 @@ EOF next if ! $val{ $p }; for (my $r = 0; $r < 256; $r++) { next if ! $val{ $p }{ $r }; - printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r; + printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; for (my $c = 0; $c < 256; $c++) { print "\t", defined $val{$p}{$r}{$c} ? "($type)".$val{$p}{$r}{$c} @@ -376,7 +376,7 @@ EOF } foreach my $p (sort { $a <=> $b } keys %val) { next if ! $val{ $p }; - printf "$type* ${head}_%02x [256] = {\n", $p; + printf "static $type* ${head}_%02x [256] = {\n", $p; for (my $r = 0; $r < 256; $r++) { print $val{ $p }{ $r } ? sprintf("${head}_%02x_%02x", $p, $r) @@ -386,7 +386,7 @@ EOF } print "};\n\n"; } - print "$type** $head [] = {\n"; + print "static $type** $head [] = {\n"; for (my $p = 0; $p <= 0x10; $p++) { print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; print ',' if $p != 0x10; diff --git a/ext/Unicode/Normalize/t/fcdc.t b/ext/Unicode/Normalize/t/fcdc.t index ea10a64..5fc78a5 100644 --- a/ext/Unicode/Normalize/t/fcdc.t +++ b/ext/Unicode/Normalize/t/fcdc.t @@ -19,53 +19,93 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 35 }; +BEGIN { plan tests => 68 }; use Unicode::Normalize qw(:all); ok(1); # If we made it this far, we're ok. -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub _unpack_U { Unicode::Normalize::unpack_U(@_) } +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub hexU { _pack_U map hex, split ' ', shift } sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } ######################### +ok(FCD(''), ""); +ok(FCC(''), ""); +ok(FCD('A'), "A"); +ok(FCC('A'), "A"); + +ok(normalize('FCD', ""), ""); +ok(normalize('FCC', ""), ""); +ok(normalize('FCC', "A"), "A"); +ok(normalize('FCD', "A"), "A"); + +# if checkFCD is YES, the return value from FCD should be same as the original +ok(FCD(hexU("00C5")), hexU("00C5")); # A with ring above +ok(FCD(hexU("0041 030A")), hexU("0041 030A")); # A+ring +ok(FCD(hexU("0041 0327 030A")), hexU("0041 0327 030A")); # A+cedilla+ring +ok(FCD(hexU("AC01 1100 1161")), hexU("AC01 1100 1161")); # hangul +ok(FCD(hexU("212B F900")), hexU("212B F900")); # compat + +ok(normalize('FCD', hexU("00C5")), hexU("00C5")); +ok(normalize('FCD', hexU("0041 030A")), hexU("0041 030A")); +ok(normalize('FCD', hexU("0041 0327 030A")), hexU("0041 0327 030A")); +ok(normalize('FCD', hexU("AC01 1100 1161")), hexU("AC01 1100 1161")); +ok(normalize('FCD', hexU("212B F900")), hexU("212B F900")); + +# if checkFCD is MAYBE or NO, FCD returns NFD (this behavior isn't documented) +ok(FCD(hexU("00C5 0327")), hexU("0041 0327 030A")); +ok(FCD(hexU("0041 030A 0327")), hexU("0041 0327 030A")); +ok(FCD(hexU("00C5 0327")), NFD(hexU("00C5 0327"))); +ok(FCD(hexU("0041 030A 0327")), NFD(hexU("0041 030A 0327"))); + +ok(normalize('FCD', hexU("00C5 0327")), hexU("0041 0327 030A")); +ok(normalize('FCD', hexU("0041 030A 0327")), hexU("0041 0327 030A")); +ok(normalize('FCD', hexU("00C5 0327")), NFD(hexU("00C5 0327"))); +ok(normalize('FCD', hexU("0041 030A 0327")), NFD(hexU("0041 030A 0327"))); + ok(answer(checkFCD('')), 'YES'); ok(answer(checkFCD('A')), 'YES'); ok(answer(checkFCD("\x{030A}")), 'YES'); # 030A;COMBINING RING ABOVE -ok(answer(checkFCD("\x{0327}")), 'YES'); # 0327;COMBINING CEDILLA +ok(answer(checkFCD("\x{0327}")), 'YES'); # 0327;COMBINING CEDILLA ok(answer(checkFCD(_pack_U(0x00C5))), 'YES'); # A with ring above -ok(answer(checkFCD(_pack_U(0x41, 0x30A))), 'YES'); # A+ring -ok(answer(checkFCD(_pack_U(0x41, 0x327, 0x30A))), 'YES'); # A+cedilla+ring -ok(answer(checkFCD(_pack_U(0x41, 0x30A, 0x327))), 'NO'); # A+ring+cedilla -ok(answer(checkFCD(_pack_U(0xC5, 0x0327))), 'NO'); # A-ring+cedilla -ok(answer(checkNFC(_pack_U(0xC5, 0x0327))), 'MAYBE'); # NFC: A-ring+cedilla -ok(answer(check("FCD", _pack_U(0xC5, 0x0327))), 'NO'); -ok(answer(check("NFC", _pack_U(0xC5, 0x0327))), 'MAYBE'); +ok(answer(checkFCD(hexU("0041 030A"))), 'YES'); # A+ring +ok(answer(checkFCD(hexU("0041 0327 030A"))), 'YES'); # A+cedilla+ring +ok(answer(checkFCD(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla +ok(answer(checkFCD(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla +ok(answer(checkNFC(hexU("00C5 0327"))), 'MAYBE'); # NFC: A-ring+cedilla +ok(answer(check("FCD", hexU("00C5 0327"))), 'NO'); +ok(answer(check("NFC", hexU("00C5 0327"))), 'MAYBE'); ok(answer(checkFCD("\x{AC01}\x{1100}\x{1161}")), 'YES'); # hangul ok(answer(checkFCD("\x{212B}\x{F900}")), 'YES'); # compat -ok(FCD(''), ""); -ok(FCC(''), ""); - -ok(FCD('A'), "A"); -ok(FCC('A'), "A"); +ok(answer(checkFCD(hexU("1EA7 05AE 0315 0062"))), "NO"); +ok(answer(checkFCC(hexU("1EA7 05AE 0315 0062"))), "NO"); +ok(answer(check('FCD', hexU("1EA7 05AE 0315 0062"))), "NO"); +ok(answer(check('FCC', hexU("1EA7 05AE 0315 0062"))), "NO"); -ok(answer(checkFCD(_pack_U(0x1EA7, 0x05AE, 0x0315, 0x0062))), "NO"); -ok(answer(checkFCC(_pack_U(0x1EA7, 0x05AE, 0x0315, 0x0062))), "NO"); - -ok(FCC(_pack_U(0xC5, 0x327)), _pack_U(0x41, 0x327, 0x30A)); -ok(FCC(_pack_U(0x45, 0x304, 0x300)), _pack_U(0x1E14)); +ok(FCC(hexU("00C5 0327")), hexU("0041 0327 030A")); +ok(FCC(hexU("0045 0304 0300")), "\x{1E14}"); ok(FCC("\x{1100}\x{1161}\x{1100}\x{1173}\x{11AF}"), "\x{AC00}\x{AE00}"); +ok(normalize('FCC', hexU("00C5 0327")), hexU("0041 0327 030A")); +ok(normalize('FCC', hexU("0045 0304 0300")), "\x{1E14}"); +ok(normalize('FCC', hexU("1100 1161 1100 1173 11AF")), "\x{AC00}\x{AE00}"); + +ok(FCC("\x{0B47}\x{0300}\x{0B3E}"), "\x{0B47}\x{0300}\x{0B3E}"); +ok(FCC("\x{1100}\x{0300}\x{1161}"), "\x{1100}\x{0300}\x{1161}"); +ok(FCC("\x{0B47}\x{0B3E}\x{0300}"), "\x{0B4B}\x{0300}"); +ok(FCC("\x{1100}\x{1161}\x{0300}"), "\x{AC00}\x{0300}"); +ok(FCC("\x{0B47}\x{300}\x{0B3E}\x{327}"), "\x{0B47}\x{300}\x{0B3E}\x{327}"); +ok(FCC("\x{1100}\x{300}\x{1161}\x{327}"), "\x{1100}\x{300}\x{1161}\x{327}"); ok(answer(checkFCC('')), 'YES'); ok(answer(checkFCC('A')), 'YES'); ok(answer(checkFCC("\x{030A}")), 'MAYBE'); # 030A;COMBINING RING ABOVE ok(answer(checkFCC("\x{0327}")), 'MAYBE'); # 0327;COMBINING CEDILLA -ok(answer(checkFCC(_pack_U(0x00C5))), 'YES'); # A with ring above -ok(answer(checkFCC(_pack_U(0x41, 0x30A))), 'MAYBE'); # A+ring -ok(answer(checkFCC(_pack_U(0x41, 0x327, 0x30A))), 'MAYBE'); # A+cedilla+ring -ok(answer(checkFCC(_pack_U(0x41, 0x30A, 0x327))), 'NO'); # A+ring+cedilla -ok(answer(checkFCC(_pack_U(0xC5, 0x0327))), 'NO'); # A-ring+cedilla +ok(answer(checkFCC(hexU("00C5"))), 'YES'); # A with ring above +ok(answer(checkFCC(hexU("0041 030A"))), 'MAYBE'); # A+ring +ok(answer(checkFCC(hexU("0041 0327 030A"))), 'MAYBE'); # A+cedilla+ring +ok(answer(checkFCC(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla +ok(answer(checkFCC(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla ok(answer(checkFCC("\x{AC01}\x{1100}\x{1161}")), 'MAYBE'); # hangul ok(answer(checkFCC("\x{212B}\x{F900}")), 'NO'); # compat diff --git a/ext/Unicode/Normalize/t/func.t b/ext/Unicode/Normalize/t/func.t index 6dbf41b..81421ce 100644 --- a/ext/Unicode/Normalize/t/func.t +++ b/ext/Unicode/Normalize/t/func.t @@ -19,130 +19,295 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 13 }; +BEGIN { plan tests => 202 }; use Unicode::Normalize qw(:all); ok(1); # If we made it this far, we're ok. -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub _unpack_U { Unicode::Normalize::unpack_U(@_) } +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub hexU { _pack_U map hex, split ' ', shift } ######################### -print getCombinClass( 0) == 0 - && getCombinClass( 768) == 230 - && getCombinClass(1809) == 36 - && ($] < 5.007003 || getCombinClass(0x1D167) == 1) # Unicode 3.1 - ? "ok" : "not ok", " 2\n"; - -print ! defined getCanon( 0) - && ! defined getCanon(41) - && getCanon(0x00C0) eq _pack_U(0x0041, 0x0300) - && getCanon(0x00EF) eq _pack_U(0x0069, 0x0308) - && getCanon(0x304C) eq _pack_U(0x304B, 0x3099) - && getCanon(0x1EA4) eq _pack_U(0x0041, 0x0302, 0x0301) - && getCanon(0x1F82) eq _pack_U(0x03B1, 0x0313, 0x0300, 0x0345) - && getCanon(0x1FAF) eq _pack_U(0x03A9, 0x0314, 0x0342, 0x0345) - && getCanon(0xAC00) eq _pack_U(0x1100, 0x1161) - && getCanon(0xAE00) eq _pack_U(0x1100, 0x1173, 0x11AF) - && ! defined getCanon(0x212C) - && ! defined getCanon(0x3243) - && getCanon(0xFA2D) eq _pack_U(0x9DB4) - ? "ok" : "not ok", " 3\n"; - -print ! defined getCompat( 0) - && ! defined getCompat(41) - && getCompat(0x00C0) eq _pack_U(0x0041, 0x0300) - && getCompat(0x00EF) eq _pack_U(0x0069, 0x0308) - && getCompat(0x304C) eq _pack_U(0x304B, 0x3099) - && getCompat(0x1EA4) eq _pack_U(0x0041, 0x0302, 0x0301) - && getCompat(0x1F82) eq _pack_U(0x03B1, 0x0313, 0x0300, 0x0345) - && getCompat(0x1FAF) eq _pack_U(0x03A9, 0x0314, 0x0342, 0x0345) - && getCompat(0x212C) eq _pack_U(0x0042) - && getCompat(0x3243) eq _pack_U(0x0028, 0x81F3, 0x0029) - && getCompat(0xAC00) eq _pack_U(0x1100, 0x1161) - && getCompat(0xAE00) eq _pack_U(0x1100, 0x1173, 0x11AF) - && getCompat(0xFA2D) eq _pack_U(0x9DB4) - ? "ok" : "not ok", " 4\n"; - -print ! defined getComposite( 0, 0) - && ! defined getComposite( 0, 41) - && ! defined getComposite(41, 0) - && ! defined getComposite(41, 41) - && ! defined getComposite(12, 0x0300) - && ! defined getComposite(0x0055, 0xFF00) - && 0x00C0 == getComposite(0x0041, 0x0300) - && 0x00D9 == getComposite(0x0055, 0x0300) - && 0x1E14 == getComposite(0x0112, 0x0300) - && 0xAC00 == getComposite(0x1100, 0x1161) - && 0xADF8 == getComposite(0x1100, 0x1173) - && ! defined getComposite(0x1100, 0x11AF) - && ! defined getComposite(0x1173, 0x11AF) - && ! defined getComposite(0xAC00, 0x11A7) - && 0xAC01 == getComposite(0xAC00, 0x11A8) - && 0xAE00 == getComposite(0xADF8, 0x11AF) - ? "ok" : "not ok", " 5\n"; - -print ! isExclusion( 0) - && ! isExclusion(41) - && isExclusion(2392) # DEVANAGARI LETTER QA - && isExclusion(3907) # TIBETAN LETTER GHA - && isExclusion(64334) # HEBREW LETTER PE WITH RAFE - ? "ok" : "not ok", " 6\n"; - -print ! isSingleton( 0) - && isSingleton(0x212B) # ANGSTROM SIGN - ? "ok" : "not ok", " 7\n"; - -print reorder("") eq "" - && reorder(_pack_U(0x0041, 0x0300, 0x0315, 0x0313, 0x031b, 0x0061)) - eq _pack_U(0x0041, 0x031b, 0x0300, 0x0313, 0x0315, 0x0061) - && reorder(_pack_U(0x00C1, 0x0300, 0x0315, 0x0313, 0x031b, - 0x0061, 0x309A, 0x3099)) - eq _pack_U(0x00C1, 0x031b, 0x0300, 0x0313, 0x0315, - 0x0061, 0x309A, 0x3099) - ? "ok" : "not ok", " 8\n"; +ok(getCombinClass( 0), 0); +ok(getCombinClass( 41), 0); +ok(getCombinClass( 65), 0); +ok(getCombinClass( 768), 230); +ok(getCombinClass(1809), 36); + +ok(getCanon( 0), undef); +ok(getCanon(0x29), undef); +ok(getCanon(0x41), undef); +ok(getCanon(0x00C0), _pack_U(0x0041, 0x0300)); +ok(getCanon(0x00EF), _pack_U(0x0069, 0x0308)); +ok(getCanon(0x304C), _pack_U(0x304B, 0x3099)); +ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); +ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); +ok(getCanon(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); +ok(getCanon(0xAC00), _pack_U(0x1100, 0x1161)); +ok(getCanon(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); +ok(getCanon(0x212C), undef); +ok(getCanon(0x3243), undef); +ok(getCanon(0xFA2D), _pack_U(0x9DB4)); + +ok(getCompat( 0), undef); +ok(getCompat(0x29), undef); +ok(getCompat(0x41), undef); +ok(getCompat(0x00C0), _pack_U(0x0041, 0x0300)); +ok(getCompat(0x00EF), _pack_U(0x0069, 0x0308)); +ok(getCompat(0x304C), _pack_U(0x304B, 0x3099)); +ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); +ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); +ok(getCompat(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); +ok(getCompat(0x212C), _pack_U(0x0042)); +ok(getCompat(0x3243), _pack_U(0x0028, 0x81F3, 0x0029)); +ok(getCompat(0xAC00), _pack_U(0x1100, 0x1161)); +ok(getCompat(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); +ok(getCompat(0xFA2D), _pack_U(0x9DB4)); + +ok(getComposite( 0, 0), undef); +ok(getComposite( 0, 0x29), undef); +ok(getComposite(0x29, 0), undef); +ok(getComposite(0x29, 0x29), undef); +ok(getComposite( 0, 0x41), undef); +ok(getComposite(0x41, 0), undef); +ok(getComposite(0x41, 0x41), undef); +ok(getComposite(12, 0x0300), undef); +ok(getComposite(0x0055, 0xFF00), undef); +ok(getComposite(0x0041, 0x0300), 0x00C0); +ok(getComposite(0x0055, 0x0300), 0x00D9); +ok(getComposite(0x0112, 0x0300), 0x1E14); +ok(getComposite(0x1100, 0x1161), 0xAC00); +ok(getComposite(0x1100, 0x1173), 0xADF8); +ok(getComposite(0x1100, 0x11AF), undef); +ok(getComposite(0x1173, 0x11AF), undef); +ok(getComposite(0xAC00, 0x11A7), undef); +ok(getComposite(0xAC00, 0x11A8), 0xAC01); +ok(getComposite(0xADF8, 0x11AF), 0xAE00); + +sub uprops { + my $uv = shift; + my $r = ""; + $r .= isExclusion($uv) ? 'X' : 'x'; + $r .= isSingleton($uv) ? 'S' : 's'; + $r .= isNonStDecomp($uv) ? 'N' : 'n'; # Non-Starter Decomposition + $r .= isComp_Ex($uv) ? 'F' : 'f'; # Full exclusion (X + S + N) + $r .= isComp2nd($uv) ? 'B' : 'b'; # B = M = Y + $r .= isNFD_NO($uv) ? 'D' : 'd'; + $r .= isNFC_MAYBE($uv) ? 'M' : 'm'; # Maybe + $r .= isNFC_NO($uv) ? 'C' : 'c'; + $r .= isNFKD_NO($uv) ? 'K' : 'k'; + $r .= isNFKC_MAYBE($uv) ? 'Y' : 'y'; # maYbe + $r .= isNFKC_NO($uv) ? 'G' : 'g'; + return $r; +} + +ok(uprops(0x0000), 'xsnfbdmckyg'); +ok(uprops(0x0029), 'xsnfbdmckyg'); +ok(uprops(0x0041), 'xsnfbdmckyg'); +ok(uprops(0x00A0), 'xsnfbdmcKyG'); # NO-BREAK SPACE +ok(uprops(0x00C0), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE +ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT +ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS +ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA +ok(uprops(0x0958), 'XsnFbDmCKyG'); # DEVANAGARI LETTER QA +ok(uprops(0x0F43), 'XsnFbDmCKyG'); # TIBETAN LETTER GHA +ok(uprops(0x1100), 'xsnfbdmckyg'); # HANGUL CHOSEONG KIYEOK +ok(uprops(0x1161), 'xsnfBdMckYg'); # HANGUL JUNGSEONG A +ok(uprops(0x11AF), 'xsnfBdMckYg'); # HANGUL JONGSEONG RIEU +ok(uprops(0x212B), 'xSnFbDmCKyG'); # ANGSTROM SIGN +ok(uprops(0xAC00), 'xsnfbDmcKyg'); # HANGUL SYLLABLE GA +ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900 +ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE +ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A + +ok(decompose(""), ""); +ok(decompose("A"), "A"); +ok(decompose("", 1), ""); +ok(decompose("A", 1), "A"); + +ok(decompose(hexU("1E14 AC01")), hexU("0045 0304 0300 1100 1161 11A8")); +ok(decompose(hexU("AC00 AE00")), hexU("1100 1161 1100 1173 11AF")); +ok(decompose(hexU("304C FF76")), hexU("304B 3099 FF76")); + +ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8")); +ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF")); +ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB")); + +# don't modify the source +my $sDec = "\x{FA19}"; +ok(decompose($sDec), "\x{795E}"); +ok($sDec, "\x{FA19}"); + +ok(reorder(""), ""); +ok(reorder("A"), "A"); +ok(reorder(hexU("0041 0300 0315 0313 031b 0061")), + hexU("0041 031b 0300 0313 0315 0061")); +ok(reorder(hexU("00C1 0300 0315 0313 031b 0061 309A 3099")), + hexU("00C1 031b 0300 0313 0315 0061 309A 3099")); + +# don't modify the source +my $sReord = "\x{3000}\x{300}\x{31b}"; +ok(reorder($sReord), "\x{3000}\x{31b}\x{300}"); +ok($sReord, "\x{3000}\x{300}\x{31b}"); + +ok(compose(""), ""); +ok(compose("A"), "A"); +ok(compose(hexU("0061 0300")), hexU("00E0")); +ok(compose(hexU("0061 0300 031B")), hexU("00E0 031B")); +ok(compose(hexU("0061 0300 0315")), hexU("00E0 0315")); +ok(compose(hexU("0061 0300 0313")), hexU("00E0 0313")); +ok(compose(hexU("0061 031B 0300")), hexU("00E0 031B")); +ok(compose(hexU("0061 0315 0300")), hexU("0061 0315 0300")); +ok(compose(hexU("0061 0313 0300")), hexU("0061 0313 0300")); + +# don't modify the source +my $sCom = "\x{304B}\x{3099}"; +ok(compose($sCom), "\x{304C}"); +ok($sCom, "\x{304B}\x{3099}"); + +ok(composeContiguous(""), ""); +ok(composeContiguous("A"), "A"); +ok(composeContiguous(hexU("0061 0300")), hexU("00E0")); +ok(composeContiguous(hexU("0061 0300 031B")), hexU("00E0 031B")); +ok(composeContiguous(hexU("0061 0300 0315")), hexU("00E0 0315")); +ok(composeContiguous(hexU("0061 0300 0313")), hexU("00E0 0313")); +ok(composeContiguous(hexU("0061 031B 0300")), hexU("0061 031B 0300")); +ok(composeContiguous(hexU("0061 0315 0300")), hexU("0061 0315 0300")); +ok(composeContiguous(hexU("0061 0313 0300")), hexU("0061 0313 0300")); + +# don't modify the source +my $sCtg = "\x{30DB}\x{309A}"; +ok(composeContiguous($sCtg), "\x{30DD}"); +ok($sCtg, "\x{30DB}\x{309A}"); sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } -print answer(checkNFD("")) eq "YES" - && answer(checkNFC("")) eq "YES" - && answer(checkNFKD("")) eq "YES" - && answer(checkNFKC("")) eq "YES" - && answer(check("NFD", "")) eq "YES" - && answer(check("NFC", "")) eq "YES" - && answer(check("NFKD","")) eq "YES" - && answer(check("NFKC","")) eq "YES" +ok(answer(checkNFD("")), "YES"); +ok(answer(checkNFC("")), "YES"); +ok(answer(checkNFKD("")), "YES"); +ok(answer(checkNFKC("")), "YES"); +ok(answer(check("NFD", "")), "YES"); +ok(answer(check("NFC", "")), "YES"); +ok(answer(check("NFKD","")), "YES"); +ok(answer(check("NFKC","")), "YES"); + # U+0000 to U+007F are prenormalized in all the normalization forms. - && answer(checkNFD("AZaz\t12!#`")) eq "YES" - && answer(checkNFC("AZaz\t12!#`")) eq "YES" - && answer(checkNFKD("AZaz\t12!#`")) eq "YES" - && answer(checkNFKC("AZaz\t12!#`")) eq "YES" - && answer(check("D", "AZaz\t12!#`")) eq "YES" - && answer(check("C", "AZaz\t12!#`")) eq "YES" - && answer(check("KD","AZaz\t12!#`")) eq "YES" - && answer(check("KC","AZaz\t12!#`")) eq "YES" - ? "ok" : "not ok", " 9\n"; - -print 1 - && answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))) eq "YES" - && answer(checkNFD(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF))) eq "NO" - && answer(checkNFC(_pack_U(0x20, 0xC1, 0x1173, 0x11AF))) eq "MAYBE" - && answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100))) eq "YES" - && answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300))) eq "MAYBE" - && answer(checkNFC(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))) eq "YES" - && answer(check("NFC", _pack_U(0x20, 0xC1, 0x212B, 0x300))) eq "NO" - && answer(checkNFKD(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))) eq "NO" - && answer(checkNFKC(_pack_U(0x20, 0xC1, 0xAE00, 0x2025))) eq "NO" - ? "ok" : "not ok", " 10\n"; +ok(answer(checkNFD("AZaz\t12!#`")), "YES"); +ok(answer(checkNFC("AZaz\t12!#`")), "YES"); +ok(answer(checkNFKD("AZaz\t12!#`")), "YES"); +ok(answer(checkNFKC("AZaz\t12!#`")), "YES"); +ok(answer(check("D", "AZaz\t12!#`")), "YES"); +ok(answer(check("C", "AZaz\t12!#`")), "YES"); +ok(answer(check("KD","AZaz\t12!#`")), "YES"); +ok(answer(check("KC","AZaz\t12!#`")), "YES"); + +ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES"); +ok(answer(checkNFD(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF))), "NO"); +ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0x1173, 0x11AF))), "MAYBE"); +ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100))), "YES"); +ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300))), "MAYBE"); +ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))), "YES"); +ok(answer(check("NFC", _pack_U(0x20, 0xC1, 0x212B, 0x300))), "NO"); +ok(answer(checkNFKD(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))), "NO"); +ok(answer(checkNFKC(_pack_U(0x20, 0xC1, 0xAE00, 0x2025))), "NO"); "012ABC" =~ /(\d+)(\w+)/; -print "012" eq NFC $1 && "ABC" eq NFC $2 - ? "ok" : "not ok", " 11\n"; +ok("012" eq NFC $1 && "ABC" eq NFC $2); -print "012" eq normalize('C', $1) && "ABC" eq normalize('C', $2) - ? "ok" : "not ok", " 12\n"; +ok(normalize('C', $1), "012"); +ok(normalize('C', $2), "ABC"); -print "012" eq normalize('NFC', $1) && "ABC" eq normalize('NFC', $2) - ? "ok" : "not ok", " 13\n"; +ok(normalize('NFC', $1), "012"); +ok(normalize('NFC', $2), "ABC"); # s/^NF// in normalize() must not prevent using $1, $&, etc. +# a string with initial zero should be treated like a number + +# LATIN CAPITAL LETTER A WITH GRAVE +ok(getCombinClass("0192"), 0); +ok(getCanon ("0192"), _pack_U(0x41, 0x300)); +ok(getCompat("0192"), _pack_U(0x41, 0x300)); +ok(getComposite("065", "0768"), 192); +ok(isNFD_NO ("0192")); +ok(isNFKD_NO("0192")); + +# DEVANAGARI LETTER QA +ok(isExclusion("02392")); +ok(isComp_Ex ("02392")); +ok(isNFC_NO ("02392")); +ok(isNFKC_NO ("02392")); +ok(isNFD_NO ("02392")); +ok(isNFKD_NO ("02392")); + +# ANGSTROM SIGN +ok(isSingleton("08491")); +ok(isComp_Ex ("08491")); +ok(isNFC_NO ("08491")); +ok(isNFKC_NO ("08491")); +ok(isNFD_NO ("08491")); +ok(isNFKD_NO ("08491")); + +# COMBINING GREEK DIALYTIKA TONOS +ok(isNonStDecomp("0836")); +ok(isComp_Ex ("0836")); +ok(isNFC_NO ("0836")); +ok(isNFKC_NO ("0836")); +ok(isNFD_NO ("0836")); +ok(isNFKD_NO ("0836")); + +# COMBINING GRAVE ACCENT +ok(getCombinClass("0768"), 230); +ok(isComp2nd ("0768")); +ok(isNFC_MAYBE ("0768")); +ok(isNFKC_MAYBE("0768")); + +# HANGUL SYLLABLE GA +ok(getCombinClass("044032"), 0); +ok(getCanon("044032"), _pack_U(0x1100, 0x1161)); +ok(getCompat("044032"), _pack_U(0x1100, 0x1161)); +ok(getComposite("04352", "04449"), 0xAC00); + +# string with 22 combining characters: (0x300..0x315) +my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042); +ok(decompose($str_cc22), $str_cc22); +ok(reorder($str_cc22), $str_cc22); +ok(compose($str_cc22), $str_cc22); +ok(composeContiguous($str_cc22), $str_cc22); +ok(NFD($str_cc22), $str_cc22); +ok(NFC($str_cc22), $str_cc22); +ok(NFKD($str_cc22), $str_cc22); +ok(NFKC($str_cc22), $str_cc22); +ok(FCD($str_cc22), $str_cc22); +ok(FCC($str_cc22), $str_cc22); + +# string with 40 combining characters of the same class: (0x300..0x313)x2 +my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042); +ok(decompose($str_cc40), $str_cc40); +ok(reorder($str_cc40), $str_cc40); +ok(compose($str_cc40), $str_cc40); +ok(composeContiguous($str_cc40), $str_cc40); +ok(NFD($str_cc40), $str_cc40); +ok(NFC($str_cc40), $str_cc40); +ok(NFKD($str_cc40), $str_cc40); +ok(NFKC($str_cc40), $str_cc40); +ok(FCD($str_cc40), $str_cc40); +ok(FCC($str_cc40), $str_cc40); + +my $precomp = hexU("304C 304E 3050 3052 3054"); +my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099"); +ok(decompose($precomp x 5), $combseq x 5); +ok(decompose($precomp x 10), $combseq x 10); +ok(decompose($precomp x 20), $combseq x 20); + +my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8"); +my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161"); +ok(decompose($hangsyl x 5), $jamoseq x 5); +ok(decompose($hangsyl x 10), $jamoseq x 10); +ok(decompose($hangsyl x 20), $jamoseq x 20); + +my $notcomp = hexU("304B 304D 304F 3051 3053"); +ok(decompose($precomp . $notcomp), $combseq . $notcomp); +ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5); +ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10); + + diff --git a/ext/Unicode/Normalize/t/norm.t b/ext/Unicode/Normalize/t/norm.t index a939907..5d93747 100644 --- a/ext/Unicode/Normalize/t/norm.t +++ b/ext/Unicode/Normalize/t/norm.t @@ -19,7 +19,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 29 }; +BEGIN { plan tests => 64 }; use Unicode::Normalize qw(normalize); ok(1); # If we made it this far, we're ok. @@ -28,8 +28,42 @@ sub _unpack_U { Unicode::Normalize::unpack_U(@_) } ######################### -ok(normalize('C', ""), ""); ok(normalize('D', ""), ""); +ok(normalize('C', ""), ""); +ok(normalize('KD',""), ""); +ok(normalize('KC',""), ""); + +ok(normalize('D', "A"), "A"); +ok(normalize('C', "A"), "A"); +ok(normalize('KD',"A"), "A"); +ok(normalize('KC',"A"), "A"); + +ok(normalize('NFD', ""), ""); +ok(normalize('NFC', ""), ""); +ok(normalize('NFKD',""), ""); +ok(normalize('NFKC',""), ""); + +ok(normalize('NFD', "A"), "A"); +ok(normalize('NFC', "A"), "A"); +ok(normalize('NFKD',"A"), "A"); +ok(normalize('NFKC',"A"), "A"); + +# don't modify the source +my $sNFD = "\x{FA19}"; +ok(normalize('NFD', $sNFD), "\x{795E}"); +ok($sNFD, "\x{FA19}"); + +my $sNFC = "\x{FA1B}"; +ok(normalize('NFC', $sNFC), "\x{798F}"); +ok($sNFC, "\x{FA1B}"); + +my $sNFKD = "\x{FA1E}"; +ok(normalize('NFKD', $sNFKD), "\x{7FBD}"); +ok($sNFKD, "\x{FA1E}"); + +my $sNFKC = "\x{FA26}"; +ok(normalize('NFKC', $sNFKC), "\x{90FD}"); +ok($sNFKC, "\x{FA26}"); sub hexNFC { join " ", map sprintf("%04X", $_), @@ -40,6 +74,9 @@ sub hexNFD { _unpack_U normalize 'D', _pack_U map hex, split ' ', shift; } +ok(hexNFD("1E14 AC01"), "0045 0304 0300 1100 1161 11A8"); +ok(hexNFD("AC00 AE00"), "1100 1161 1100 1173 11AF"); + ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); @@ -73,3 +110,16 @@ ok(hexNFC("1100 1161 0300"), "AC00 0300"); ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327"); ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327"); + +ok(hexNFC("0300 0041"), "0300 0041"); +ok(hexNFC("0300 0301 0041"), "0300 0301 0041"); +ok(hexNFC("0301 0300 0041"), "0301 0300 0041"); +ok(hexNFC("0000 0300 0000 0301"), "0000 0300 0000 0301"); +ok(hexNFC("0000 0301 0000 0300"), "0000 0301 0000 0300"); + +ok(hexNFC("0327 0061 0300"), "0327 00E0"); +ok(hexNFC("0301 0061 0300"), "0301 00E0"); +ok(hexNFC("0315 0061 0300"), "0315 00E0"); +ok(hexNFC("0000 0327 0061 0300"), "0000 0327 00E0"); +ok(hexNFC("0000 0301 0061 0300"), "0000 0301 00E0"); +ok(hexNFC("0000 0315 0061 0300"), "0000 0315 00E0"); diff --git a/ext/Unicode/Normalize/t/null.t b/ext/Unicode/Normalize/t/null.t index ae75752..6067da4 100644 --- a/ext/Unicode/Normalize/t/null.t +++ b/ext/Unicode/Normalize/t/null.t @@ -20,7 +20,7 @@ use strict; use warnings; use Unicode::Normalize qw(:all); -print "1..8\n"; +print "1..24\n"; print "ok 1\n"; @@ -47,3 +47,51 @@ print /c$/ ? "ok" : "not ok", " 7\n"; $_ = NFKC('abc'); print /c$/ ? "ok" : "not ok", " 8\n"; +$_ = FCC('abc'); +print /c$/ ? "ok" : "not ok", " 9\n"; + +$_ = decompose("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 10\n"; + +$_ = decompose("\x{304B}\x{3099}abc"); +print /c$/ ? "ok" : "not ok", " 11\n"; + +$_ = reorder("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 12\n"; + +$_ = reorder("\x{304B}\x{3099}abc"); +print /c$/ ? "ok" : "not ok", " 13\n"; + +$_ = compose("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 14\n"; + +$_ = compose("\x{304B}\x{3099}abc"); +print /c$/ ? "ok" : "not ok", " 15\n"; + +$_ = NFD("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 16\n"; + +$_ = NFC("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 17\n"; + +$_ = NFKD("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 18\n"; + +$_ = NFKC("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 19\n"; + +$_ = FCC("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 20\n"; + +$_ = getCanon(0x100); +print s/.$// ? "ok" : "not ok", " 21\n"; + +$_ = getCompat(0x100); +print s/.$// ? "ok" : "not ok", " 22\n"; + +$_ = getCanon(0xAC00); +print s/.$// ? "ok" : "not ok", " 23\n"; + +$_ = getCompat(0xAC00); +print s/.$// ? "ok" : "not ok", " 24\n"; + diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t index 8e3369f..e07f6f0 100644 --- a/ext/Unicode/Normalize/t/test.t +++ b/ext/Unicode/Normalize/t/test.t @@ -19,7 +19,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 31 }; +BEGIN { plan tests => 58 }; use Unicode::Normalize; ok(1); # If we made it this far, we're ok. @@ -28,8 +28,34 @@ sub _unpack_U { Unicode::Normalize::unpack_U(@_) } ######################### -ok(NFC(""), ""); ok(NFD(""), ""); +ok(NFC(""), ""); +ok(NFKD(""), ""); +ok(NFKC(""), ""); + +ok(NFD("A"), "A"); +ok(NFC("A"), "A"); +ok(NFKD("A"), "A"); +ok(NFKC("A"), "A"); + +# don't modify the source +# don't modify the source +my $sNFD = "\x{FA19}"; +ok(NFD($sNFD), "\x{795E}"); +ok($sNFD, "\x{FA19}"); + +my $sNFC = "\x{FA1B}"; +ok(NFC($sNFC), "\x{798F}"); +ok($sNFC, "\x{FA1B}"); + +my $sNFKD = "\x{FA1E}"; +ok(NFKD($sNFKD), "\x{7FBD}"); +ok($sNFKD, "\x{FA1E}"); + +my $sNFKC = "\x{FA26}"; +ok(NFKC($sNFKC), "\x{90FD}"); +ok($sNFKC, "\x{FA26}"); + sub hexNFC { join " ", map sprintf("%04X", $_), @@ -40,6 +66,9 @@ sub hexNFD { _unpack_U NFD _pack_U map hex, split ' ', shift; } +ok(hexNFD("1E14 AC01"), "0045 0304 0300 1100 1161 11A8"); +ok(hexNFD("AC00 AE00"), "1100 1161 1100 1173 11AF"); + ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); @@ -67,13 +96,24 @@ ok(hexNFC("AC00 11C3"), "AC00 11C3"); # cf. http://www.unicode.org/review/pr-29.html ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E"); ok(hexNFC("1100 0300 1161"), "1100 0300 1161"); - ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300"); ok(hexNFC("1100 1161 0300"), "AC00 0300"); - ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327"); ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327"); +ok(hexNFC("0300 0041"), "0300 0041"); +ok(hexNFC("0300 0301 0041"), "0300 0301 0041"); +ok(hexNFC("0301 0300 0041"), "0301 0300 0041"); +ok(hexNFC("0000 0300 0000 0301"), "0000 0300 0000 0301"); +ok(hexNFC("0000 0301 0000 0300"), "0000 0301 0000 0300"); + +ok(hexNFC("0327 0061 0300"), "0327 00E0"); +ok(hexNFC("0301 0061 0300"), "0301 00E0"); +ok(hexNFC("0315 0061 0300"), "0315 00E0"); +ok(hexNFC("0000 0327 0061 0300"), "0000 0327 00E0"); +ok(hexNFC("0000 0301 0061 0300"), "0000 0301 00E0"); +ok(hexNFC("0000 0315 0061 0300"), "0000 0315 00E0"); + # NFC() should be unary. my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62); my $str12 = _pack_U(0x1EA4, 0x62);