Revision history for Perl extension Unicode::Normalize.
+0.15 Tue Mar 19 22:04:07 2002
+ - Quick check is implemented.
+ - decompose(), reorder(), and compose() are documented.
+ - The Non-XS version is also independent of Lingua::KO::Hangul::Util.
+
0.14 Sat Feb 02 20:40:14 2002
- synchronization with bleadperl.
- Change 14128: by Arthur
BEGIN {
if (ord("A") == 193) {
- die "Unicode::Normalize not ported to EBCDIC\n";
+ die "Unicode::Normalize not ported to EBCDIC\n";
}
}
use warnings;
use Carp;
-our $VERSION = '0.14';
+our $VERSION = '0.15';
our $PACKAGE = __PACKAGE__;
require Exporter;
our @EXPORT = qw( NFC NFD NFKC NFKD );
our @EXPORT_OK = qw(
normalize decompose reorder compose
- getCanon getCompat getComposite getCombinClass isExclusion
+ checkNFD checkNFKD checkNFC checkNFKC check
+ getCanon getCompat getComposite getCombinClass
+ isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
+ isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
+);
+our %EXPORT_TAGS = (
+ all => [ @EXPORT, @EXPORT_OK ],
+ normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
+ check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
);
-our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
bootstrap Unicode::Normalize $VERSION;
sub NFD ($) { reorder(decompose($_[0])) }
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
-
sub NFC ($) { compose(reorder(decompose($_[0]))) }
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
croak $PACKAGE."::normalize: invalid form name: $form";
}
+sub check($$)
+{
+ my $form = shift;
+ $form =~ s/^NF//;
+ return
+ $form eq 'D' ? checkNFD ($_[0]) :
+ $form eq 'C' ? checkNFC ($_[0]) :
+ $form eq 'KD' ? checkNFKD($_[0]) :
+ $form eq 'KC' ? checkNFKC($_[0]) :
+ croak $PACKAGE."::check: invalid form name: $form";
+}
+
1;
__END__
use Unicode::Normalize;
- $string_NFD = NFD($raw_string); # Normalization Form D
- $string_NFC = NFC($raw_string); # Normalization Form C
- $string_NFKD = NFKD($raw_string); # Normalization Form KD
- $string_NFKC = NFKC($raw_string); # Normalization Form KC
+ $NFD_string = NFD($string); # Normalization Form D
+ $NFC_string = NFC($string); # Normalization Form C
+ $NFKD_string = NFKD($string); # Normalization Form KD
+ $NFKC_string = NFKC($string); # Normalization Form KC
or
use Unicode::Normalize 'normalize';
- $string_NFD = normalize('D', $raw_string); # Normalization Form D
- $string_NFC = normalize('C', $raw_string); # Normalization Form C
- $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
- $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
+ $NFD_string = normalize('D', $string); # Normalization Form D
+ $NFC_string = normalize('C', $string); # Normalization Form C
+ $NFKD_string = normalize('KD', $string); # Normalization Form KD
+ $NFKC_string = normalize('KC', $string); # Normalization Form KC
=head1 DESCRIPTION
=over 4
-=item C<$string_NFD = NFD($raw_string)>
+=item C<$NFD_string = NFD($string)>
returns the Normalization Form D (formed by canonical decomposition).
-
-=item C<$string_NFC = NFC($raw_string)>
+=item C<$NFC_string = NFC($string)>
returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).
-=item C<$string_NFKD = NFKD($raw_string)>
+=item C<$NFKD_string = NFKD($string)>
returns the Normalization Form KD (formed by compatibility decomposition).
-=item C<$string_NFKC = NFKC($raw_string)>
+=item C<$NFKC_string = NFKC($string)>
returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).
-=item C<$normalized_string = normalize($form_name, $raw_string)>
+=item C<$normalized_string = normalize($form_name, $string)>
As C<$form_name>, one of the following names must be given.
=back
+=head2 Decomposition and Composition
+
+=over 4
+
+=item C<$decomposed_string = decompose($string)>
+
+=item C<$decomposed_string = decompose($string, $useCompatMapping)>
+
+Decompose the specified string and returns the result.
+
+If the second parameter (a boolean) is omitted or false, decomposes it
+using the Canonical Decomposition Mapping.
+If true, decomposes it using the Compatibility Decomposition Mapping.
+
+The string returned is not always in NFD/NFKD.
+Reordering may be required.
+
+ $NFD_string = reorder(decompose($string)); # eq. to NFD()
+ $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
+
+=item C<$reordered_string = reorder($string)>
+
+Reorder the combining characters and the like in the canonical ordering
+and returns the result.
+
+E.g., when you have a list of NFD/NFKD strings,
+you can get the concatenated NFD/NFKD string from them, saying
+
+ $concat_NFD = reorder(join '', @NFD_strings);
+ $concat_NFKD = reorder(join '', @NFKD_strings);
+
+=item C<$composed_string = compose($string)>
+
+Returns the string where composable pairs are composed.
+
+E.g., when you have a NFD/NFKD string,
+you can get its NFC/NFKC string, saying
+
+ $NFC_string = compose($NFD_string);
+ $NFKC_string = compose($NFKD_string);
+
+=back
+
+=head2 Quick Check
+
+(see Annex 8, UAX #15; F<DerivedNormalizationProperties.txt>)
+
+The following functions check whether the string is in that normalization form.
+
+The result returned will be:
+
+ YES The string is in that normalization form.
+ NO The string is not in that normalization form.
+ MAYBE Dubious. Maybe yes, maybe no.
+
+=over 4
+
+=item C<$result = checkNFD($string)>
+
+returns YES (1) or NO (empty string).
+
+=item C<$result = checkNFC($string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+=item C<$result = checkNFKD($string)>
+
+returns YES (1) or NO (empty string).
+
+=item C<$result = checkNFKC($string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+=item C<$result = check($form_name, $string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+C<$form_name> is alike to that for C<normalize()>.
+
+=back
+
+B<Note>
+
+In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
+The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
+
+A MAYBE-NFC/NFKC string should contain at least
+one combining character or the like.
+For example, C<COMBINING ACUTE ACCENT> has
+the MAYBE_NFC/MAYBE_NFKC property.
+Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
+and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
+Though, C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
+(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
+while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
+
+If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
+
+ $string eq NFC($string) # more thorough than checkNFC($string)
+ $string eq NFKC($string) # more thorough than checkNFKC($string)
+
=head2 Character Data
These functions are interface of character data used internally.
=item C<$canonical_decomposed = getCanon($codepoint)>
+If the character of the specified codepoint is canonically
+decomposable (including Hangul Syllables),
+returns the B<completely decomposed> string canonically equivalent to it.
+
+If it is not decomposable, returns undef.
+
=item C<$compatibility_decomposed = getCompat($codepoint)>
-If the character of the specified codepoint is canonically or
-compatibility decomposable (including Hangul Syllables),
-returns the B<completely decomposed> string equivalent to it.
+If the character of the specified codepoint is compatibility
+decomposable (including Hangul Syllables),
+returns the B<completely decomposed> string compatibility equivalent to it.
If it is not decomposable, returns undef.
-=item C<$uv_composite = getComposite($uv_here, $uv_next)>
+=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
If two characters here and next (as codepoints) are composable
-(including Hangul Jamo/Syllables and Exclusions),
+(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.
If they are not composable, returns undef.
=item C<$combining_class = getCombinClass($codepoint)>
-Returns the combining class as integer of the character.
+Returns the combining class of the character as an integer.
=item C<$is_exclusion = isExclusion($codepoint)>
+Returns a boolean whether the character of the specified codepoint
+is a composition exclusion.
+
+=item C<$is_singleton = isSingleton($codepoint)>
+
Returns a boolean whether the character of the specified codepoint is
-a composition exclusion.
+a singleton.
+
+=item C<$is_non_startar_decomposition = isNonStDecomp($codepoint)>
+
+Returns a boolean whether the canonical decomposition
+of the character of the specified codepoint
+is a Non-Starter Decomposition.
+
+=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
+
+Returns a boolean whether the character of the specified codepoint
+may be composed with the previous one in a certain composition
+(including Hangul Compositions, but excluding
+Composition Exclusions and Non-Starter Decompositions).
=back
C<normalize> and other some functions: on request.
-=head2 TODO
-
-Unicode::Normalize has not been ported to EBCDIC. The code mostly
-would work just fine but a decision needs to be made: how the module
-should work in EBCDIC? Should the low 256 characters be understood as
-Unicode or as EBCDIC code points? Should one be chosen or should
-there be a way to do either? Or should such translation be left
-outside the module for the user to do, for example by using
-Encode::from_to()?
-
=head1 AUTHOR
SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
Unicode Normalization Forms - UAX #15
+=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProperties.txt
+
+Derived Normalization Properties
+
=back
=cut
{
int ret_cc;
ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
- if(ret_cc) return ret_cc;
+ if (ret_cc)
+ return ret_cc;
return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
}
U8* dec_canonical (UV uv)
{
U8 ***plane, **row;
- if(OVER_UTF_MAX(uv)) return NULL;
+ if (OVER_UTF_MAX(uv))
+ return NULL;
plane = (U8***)UNF_canon[uv >> 16];
- if(! plane) return NULL;
+ if (! plane)
+ return NULL;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : NULL;
}
U8* dec_compat (UV uv)
{
U8 ***plane, **row;
- if(OVER_UTF_MAX(uv)) return NULL;
+ if (OVER_UTF_MAX(uv))
+ return NULL;
plane = (U8***)UNF_compat[uv >> 16];
- if(! plane) return NULL;
+ if (! plane)
+ return NULL;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : NULL;
}
{
UNF_complist ***plane, **row, *cell, *i;
- if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0;
+ if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
+ return 0;
- if(Hangul_IsL(uv) && Hangul_IsV(uv2)) {
+ if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
uv -= Hangul_LBase; /* lindex */
uv2 -= Hangul_VBase; /* vindex */
return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
}
- if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
+ if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
uv2 -= Hangul_TBase; /* tindex */
return(uv + uv2);
}
plane = UNF_compos[uv >> 16];
- if(! plane) return 0;
+ if (! plane)
+ return 0;
row = plane[(uv >> 8) & 0xff];
- if(! row) return 0;
+ if (! row)
+ return 0;
cell = row[uv & 0xff];
- if(! cell) return 0;
- for(i = cell; i->nextchar; i++) {
- if(uv2 == i->nextchar) return i->composite;
+ if (! cell)
+ return 0;
+ for (i = cell; i->nextchar; i++) {
+ if (uv2 == i->nextchar)
+ return i->composite;
}
return 0;
}
U8 getCombinClass (UV uv)
{
U8 **plane, *row;
- if(OVER_UTF_MAX(uv)) return 0;
+ if (OVER_UTF_MAX(uv))
+ return 0;
plane = (U8**)UNF_combin[uv >> 16];
- if(! plane) return 0;
+ if (! plane)
+ return 0;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : 0;
}
UV sindex, lindex, vindex, tindex;
U8 *t, tmp[3 * UTF8_MAXLEN + 1];
- if(! Hangul_IsS(uv)) return;
+ if (! Hangul_IsS(uv))
+ return;
sindex = uv - Hangul_SBase;
lindex = sindex / Hangul_NCount;
t = tmp;
t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
- if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
+ if (tindex)
+ t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
*t = '\0';
sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
}
U8 *s, *e, *p, *r;
bool iscompat;
CODE:
- if(SvUTF8(arg)) {
+ if (SvUTF8(arg)) {
src = arg;
} else {
src = sv_mortalcopy(arg);
s = (U8*)SvPV(src,srclen);
e = s + srclen;
- for(p = s; p < e;){
+ for (p = s; p < e;) {
uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
p += retlen;
- if(Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv);
+ if (Hangul_IsS(uv))
+ sv_cat_decompHangul(dst, uv);
else {
r = iscompat ? dec_compat(uv) : dec_canonical(uv);
- if(r) sv_catpv(dst, (char *)r);
- else sv_catpvn(dst, (char *)p - retlen, retlen);
+ if (r)
+ sv_catpv(dst, (char *)r);
+ else
+ sv_catpvn(dst, (char *)p - retlen, retlen);
}
}
RETVAL = dst;
SV * arg
PROTOTYPE: $
PREINIT:
- SV *src;
- STRLEN srclen, retlen, stk_cc_max;
- U8 *s, *e, *p, curCC;
+ SV *src, *dst;
+ STRLEN srclen, dstlen, retlen, stk_cc_max;
+ U8 *s, *e, *p, *d, curCC;
UV uv;
UNF_cc * stk_cc;
CODE:
- src = newSVsv(arg);
- if(! SvUTF8(arg)) sv_utf8_upgrade(src);
+ if (SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src, srclen);
+
+ dstlen = srclen + 1;
+ dst = newSV(dstlen);
+ sv_setpvn(dst,s,srclen);
+ SvUTF8_on(dst);
stk_cc_max = 10; /* enough as an initial value? */
New(0, stk_cc, stk_cc_max, UNF_cc);
- s = (U8*)SvPV(src,srclen);
- e = s + srclen;
+ d = (U8*)SvPV(dst,dstlen);
+ e = d + dstlen;
- for(p = s; p < e;){
+ for (p = d; p < e;) {
U8 *cc_in;
STRLEN cc_len, cc_iter, cc_pos;
curCC = getCombinClass(uv);
p += retlen;
- if(! (curCC && p < e)) continue; else cc_in = p - retlen;
+ if (! (curCC && p < e))
+ continue;
+ else
+ cc_in = p - retlen;
cc_pos = 0;
stk_cc[cc_pos].cc = curCC;
stk_cc[cc_pos].uv = uv;
stk_cc[cc_pos].pos = cc_pos;
- while(p < e) {
+ while (p < e) {
uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
curCC = getCombinClass(uv);
- if(!curCC) break;
+ if (!curCC)
+ break;
p += retlen;
cc_pos++;
- if(stk_cc_max <= cc_pos) { /* extend if need */
+ if (stk_cc_max <= cc_pos) { /* extend if need */
stk_cc_max = cc_pos + 1;
Renew(stk_cc, stk_cc_max, UNF_cc);
}
}
/* only one c.c. in cc_len from cc_in, no need of reordering */
- if(!cc_pos) continue;
+ if (!cc_pos)
+ continue;
qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
cc_len = p - cc_in;
p = cc_in;
- for(cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
+ for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
}
}
Safefree(stk_cc);
- RETVAL = src;
+ RETVAL = dst;
OUTPUT:
RETVAL
STRLEN srclen, dstlen, tmplen, retlen;
bool beginning = TRUE;
CODE:
- if(SvUTF8(arg)) {
+ if (SvUTF8(arg)) {
src = arg;
} else {
src = sv_mortalcopy(arg);
(void)SvPOK_only(tmp);
SvUTF8_on(tmp);
- for(p = s; p < e;){
- if(beginning) {
+ for (p = s; p < e;) {
+ if (beginning) {
uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
p += retlen;
- if (getCombinClass(uvS)){ /* no Starter found yet */
+ if (getCombinClass(uvS)) { /* no Starter found yet */
d = uvuni_to_utf8(d, uvS);
continue;
}
preCC = 0;
/* to the next Starter */
- while(p < e) {
+ while (p < e) {
uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
p += retlen;
curCC = getCombinClass(uv);
- if(preCC && preCC == curCC) {
+ if (preCC && preCC == curCC) {
preCC = curCC;
t = uvuni_to_utf8(t, uv);
} else {
uvComp = composite_uv(uvS, uv);
- /* S + C + S => S-S + C would be also blocked. */
- if( uvComp && ! isExclusion(uvComp) && preCC <= curCC)
- {
+ if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
STRLEN leftcur, rightcur, dstcur;
leftcur = UNISKIP(uvComp);
rightcur = UNISKIP(uvS) + UNISKIP(uv);
dstlen += leftcur - rightcur;
d = (U8*)SvGROW(dst,dstlen) + dstcur;
}
-
/* preCC not changed to curCC */
uvS = uvComp;
} else if (! curCC && p < e) { /* blocked */
tmplen = t - tmp_start;
if (tmplen) { /* uncomposed combining char */
t = (U8*)SvPVX(tmp);
- while(tmplen--) *d++ = *t++;
+ while (tmplen--)
+ *d++ = *t++;
}
uvS = uv;
} /* for */
+void
+checkNFD(arg)
+ SV * arg
+ PROTOTYPE: $
+ ALIAS:
+ checkNFKD = 1
+ PREINIT:
+ UV uv;
+ SV *src;
+ STRLEN srclen, retlen;
+ U8 *s, *e, *p, curCC, preCC;
+ PPCODE:
+ if (SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src,srclen);
+ e = s + srclen;
+
+ preCC = 0;
+ for (p = s; p < e; p += retlen) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ curCC = getCombinClass(uv);
+ if (preCC > curCC && curCC != 0) /* canonical ordering violated */
+ XSRETURN_NO;
+ if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
+ XSRETURN_NO;
+ preCC = curCC;
+ }
+ XSRETURN_YES;
+
+
+
+void
+checkNFC(arg)
+ SV * arg
+ PROTOTYPE: $
+ ALIAS:
+ checkNFKC = 1
+ PREINIT:
+ UV uv;
+ SV *src;
+ STRLEN srclen, retlen;
+ U8 *s, *e, *p, curCC, preCC;
+ bool isMAYBE;
+ PPCODE:
+ if (SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src,srclen);
+ e = s + srclen;
+
+ preCC = 0;
+ isMAYBE = FALSE;
+ for (p = s; p < e; p += retlen) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ curCC = getCombinClass(uv);
+
+ if (preCC > curCC && curCC != 0) /* canonical ordering violated */
+ XSRETURN_NO;
+
+ /* get NFC/NFKC property */
+ if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
+ ; /* YES */
+ else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
+ XSRETURN_NO;
+ else if (isComp2nd(uv))
+ isMAYBE = TRUE;
+ else if (ix) {
+ char *canon, *compat;
+ /*
+ * NFKC_NO when having compatibility mapping;
+ * i.e. dec_compat(uv) defined & different with dec_canonical(uv).
+ */
+ canon = (char *) dec_canonical(uv);
+ compat = (char *) dec_compat(uv);
+ if (compat && (!canon || strNE(canon, compat)))
+ XSRETURN_NO;
+ } /* end of get NFC/NFKC property */
+
+ preCC = curCC;
+ }
+ if (isMAYBE)
+ XSRETURN_UNDEF;
+ else
+ XSRETURN_YES;
+
+
+
U8
getCombinClass(uv)
UV uv
+ PROTOTYPE: $
bool
isExclusion(uv)
UV uv
+ PROTOTYPE: $
+
+bool
+isSingleton(uv)
+ UV uv
+ PROTOTYPE: $
+
+bool
+isNonStDecomp(uv)
+ UV uv
+ PROTOTYPE: $
+
+bool
+isComp2nd(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFC_MAYBE = 1
+ isNFKC_MAYBE = 2
+
+
+
+void
+isNFD_NO(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFKD_NO = 1
+ PPCODE:
+ if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
+ XSRETURN_YES; /* NFD_NO or NFKD_NO */
+ else
+ XSRETURN_NO;
+
+
+
+void
+isComp_Ex(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFC_NO = 0
+ isNFKC_NO = 1
+ PPCODE:
+ if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
+ XSRETURN_YES; /* NFC_NO or NFKC_NO */
+ else if (ix) {
+ char *canon, *compat;
+ canon = (char *) dec_canonical(uv);
+ compat = (char *) dec_compat(uv);
+ if (compat && (!canon || strNE(canon, compat)))
+ XSRETURN_YES; /* NFC_NO or NFKC_NO */
+ else
+ XSRETURN_NO;
+ }
+ else
+ XSRETURN_NO;
+
+
SV*
getComposite(uv, uv2)
OUTPUT:
RETVAL
+
+
SV*
getCanon(uv)
UV uv
PREINIT:
U8 * rstr;
CODE:
- if(Hangul_IsS(uv)) {
+ if (Hangul_IsS(uv)) {
SV * dst;
dst = newSV(1);
(void)SvPOK_only(dst);
RETVAL = dst;
} else {
rstr = ix ? dec_compat(uv) : dec_canonical(uv);
- if(!rstr) XSRETURN_UNDEF;
+ if (!rstr)
+ XSRETURN_UNDEF;
RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
}
SvUTF8_on(RETVAL);
-Unicode/Normalize version 0.14
+Unicode/Normalize version 0.15
===================================
Unicode::Normalize - normalized forms of Unicode text
use Unicode::Normalize;
- $string_NFD = NFD($string); # Normalization Form D
- $string_NFC = NFC($string); # Normalization Form C
- $string_NFKD = NFKD($string); # Normalization Form KD
- $string_NFKC = NFKC($string); # Normalization Form KC
+ $NFD_string = NFD($string); # Normalization Form D
+ $NFC_string = NFC($string); # Normalization Form C
+ $NFKD_string = NFKD($string); # Normalization Form KD
+ $NFKC_string = NFKC($string); # Normalization Form KC
+
+
+ or
- or
use Unicode::Normalize 'normalize';
- $string_NFD = normalize('D', $string); # Normalization Form D
- $string_NFC = normalize('C', $string); # Normalization Form C
- $string_NFKD = normalize('KD', $string); # Normalization Form KD
- $string_NFKC = normalize('KC', $string); # Normalization Form KC
+ $NFD_string = normalize('D', $string); # Normalization Form D
+ $NFC_string = normalize('C', $string); # Normalization Form C
+ $NFKD_string = normalize('KD', $string); # Normalization Form KD
+ $NFKC_string = normalize('KC', $string); # Normalization Form KC
+
INSTALLATION
-Perl 5.006 or later
+Perl 5.6 or later
To install this module type the following:
File::Copy
File::Spec
-unicore/CombiningClass.pl
- (or unicode/CombiningClass.pl)
-unicore/Decomposition.pl
- (or unicode/Decomposition.pl)
-unicore/CompositionExclusions.txt
- (or unicode/CompExcl.txt)
+unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
+unicore/Decomposition.pl (or unicode/Decomposition.pl)
+unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
-# CAVEAT:
-# In bleadperl, unicore/CompExcl.txt is renamed
-# unicore/CompositionExclusions.txt.
+CAVEAT
-And for the NoXS version, in addition to the above,
+(1) In bleadperl, unicore/CompExcl.txt is renamed
+ unicore/CompositionExclusions.txt.
- Lingua::KO::Hangul::Util 0.06
+(2) When these unicore/*.* files are updated;
-is required.
+ in the case of an XS version:
+ You must rebuild the module,
+ as the data will be compiled on building.
+
+ in the case of a NoXS version:
+ Rebuilding is not necessary,
+ as the data will be read on requirement.
COPYRIGHT AND LICENCE
-SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
+ SADAHIRO Tomoyuki
+
+ SADAHIRO@cpan.org
http://homepage1.nifty.com/nomenclator/perl/
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
-
our $PACKAGE = 'Unicode::Normalize, mkheader';
our $Combin = do "unicore/CombiningClass.pl"
- || do "unicode/CombiningClass.pl"
- || croak "$PACKAGE: CombiningClass.pl not found";
+ || do "unicode/CombiningClass.pl"
+ || croak "$PACKAGE: CombiningClass.pl not found";
our $Decomp = do "unicore/Decomposition.pl"
- || do "unicode/Decomposition.pl"
- || croak "$PACKAGE: Decomposition.pl not found";
+ || do "unicode/Decomposition.pl"
+ || croak "$PACKAGE: Decomposition.pl not found";
our %Combin; # $codepoint => $number : combination class
our %Canon; # $codepoint => $hexstring : canonical decomp.
our %Compat; # $codepoint => $hexstring : compat. decomp.
our %Compos; # $string => $codepoint : composite
-
our %Exclus; # $codepoint => 1 : composition exclusions
+our %Single; # $codepoint => 1 : singletons
+our %NonStD; # $codepoint => 1 : non-starter decompositions
{
- my($f, $fh);
- foreach my $d (@INC) {
- use File::Spec;
- $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
- last if open($fh, $f);
- $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
- last if open($fh, $f);
- $f = undef;
- }
- croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f;
- while(<$fh>) {
- next if /^#/ or /^$/;
- s/#.*//;
- $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/;
- }
- close $fh;
+ my($f, $fh);
+ foreach my $d (@INC) {
+ use File::Spec;
+ $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
+ last if open($fh, $f);
+ $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
+ last if open($fh, $f);
+ $f = undef;
+ }
+ croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f;
+ while (<$fh>) {
+ next if /^#/ or /^$/;
+ s/#.*//;
+ $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/;
+ }
+ close $fh;
}
-while($Combin =~ /(.+)/g) {
- my @tab = split /\t/, $1;
- my $ini = hex $tab[0];
- if($tab[1] eq '') {
- $Combin{ $ini } = $tab[2];
- } else {
- $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
- }
+while ($Combin =~ /(.+)/g) {
+ my @tab = split /\t/, $1;
+ my $ini = hex $tab[0];
+ if ($tab[1] eq '') {
+ $Combin{ $ini } = $tab[2];
+ } else {
+ $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
+ }
}
-while($Decomp =~ /(.+)/g) {
- my @tab = split /\t/, $1;
- my $compat = $tab[2] =~ s/<[^>]+>//;
- my $dec = [ _getHexArray($tab[2]) ]; # decomposition
- my $com = pack('U*', @$dec); # composable sequence
- my $ini = hex($tab[0]);
- if($tab[1] eq '') {
- $Compat{ $ini } = $dec;
- if(! $compat) {
- $Canon{ $ini } = $dec;
- $Compos{ $com } = $ini if @$dec > 1;
+while ($Decomp =~ /(.+)/g) {
+ my @tab = split /\t/, $1;
+ my $compat = $tab[2] =~ s/<[^>]+>//;
+ my $dec = [ _getHexArray($tab[2]) ]; # decomposition
+ my $com = pack('U*', @$dec); # composable sequence
+ my $ini = hex($tab[0]); # initial decomposable character
+ if ($tab[1] eq '') {
+ $Compat{ $ini } = $dec;
+
+ if (! $compat) {
+ $Canon{ $ini } = $dec;
+
+ if (@$dec > 1) {
+ if ($Combin{ $dec->[0] }) {
+ $NonStD{ $ini } = 1;
+ } else {
+ $Compos{ $com } = $ini;
+ }
+ } else {
+ $Single{ $ini } = 1;
+ }
+ }
+ } else {
+ foreach my $u ($ini .. hex($tab[1])){
+ $Compat{ $u } = $dec;
+ if (! $compat) {
+ $Canon{ $u } = $dec;
+
+ if (@$dec > 1) {
+ if ($Combin{ $dec->[0] }) {
+ $NonStD{ $u } = 1;
+ } else {
+ $Compos{ $com } = $u;
+ }
+ } else {
+ $Single{ $u } = 1;
+ }
+ }
+ }
}
- } else {
- foreach my $u ($ini .. hex($tab[1])){
- $Compat{ $u } = $dec;
- if(! $compat){
- $Canon{ $u } = $dec;
- $Compos{ $com } = $ini if @$dec > 1;
- }
- }
- }
}
# exhaustive decomposition
foreach my $key (keys %Canon) {
- $Canon{$key} = [ getCanonList($key) ];
+ $Canon{$key} = [ getCanonList($key) ];
}
# exhaustive decomposition
foreach my $key (keys %Compat) {
- $Compat{$key} = [ getCompatList($key) ];
+ $Compat{$key} = [ getCompatList($key) ];
}
sub getCanonList {
- my @src = @_;
- my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
- join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
- # condition @src == @dec is not ok.
+ my @src = @_;
+ my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
+ return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
+ # condition @src == @dec is not ok.
}
sub getCompatList {
- my @src = @_;
- my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
- join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
- # condition @src == @dec is not ok.
+ my @src = @_;
+ my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
+ return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
+ # condition @src == @dec is not ok.
}
-sub _getHexArray {
- my $str = shift;
- map hex(), $str =~ /([0-9A-Fa-f]+)/g;
-}
+sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
sub _U_stringify {
- sprintf '"%s"', join '',
- map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
+ sprintf '"%s"', join '',
+ map sprintf("\\x%02x", $_), unpack 'C*', pack 'U*', @_;
}
foreach my $hash (\%Canon, \%Compat) {
- foreach my $key (keys %$hash) {
- $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
- }
+ foreach my $key (keys %$hash) {
+ $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
+ }
}
my $prefix = "UNF_";
-
my $structname = "${prefix}complist";
-our (%Comp1st, %CompList);
+our (%Comp1st, %Comp2nd, %CompList);
-foreach(sort keys %Compos) {
- my @a = unpack('U*', $_);
- my $val = $Compos{$_};
- my $name = sprintf "${structname}_%06x", $a[0];
- $Comp1st{ $a[0] } = $name;
- $CompList{ $name }{ $a[1] } = $val;
+foreach (sort keys %Compos) {
+ my @a = unpack('U*', $_);
+ my $val = $Compos{$_};
+ my $name = sprintf "${structname}_%06x", $a[0];
+ $Comp1st{$a[0]} = $name;
+ $Comp2nd{$a[1]} = 1 if ! $Exclus{$Compos{$_}} && ! $Combin{$a[0]};
+ $CompList{$name}{$a[1]} = $val;
+}
+
+# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
+foreach (0x1161..0x1175, 0x11A8..0x11C2) {
+ $Comp2nd{$_} = 1;
}
my $compinit =
- "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
+ "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
foreach my $i (sort keys %CompList) {
- $compinit .= "$structname $i [] = {\n";
- $compinit .= join ",\n",
- map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
- sort {$a <=> $b } keys %{ $CompList{$i} };
- $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
+ $compinit .= "$structname $i [] = {\n";
+ $compinit .= join ",\n",
+ map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
+ sort {$a <=> $b } keys %{ $CompList{$i} };
+ $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
}
####################################
-my @Exclus = sort {$a <=> $b} keys %Exclus;
+my @boolfunc = (
+ {
+ name => "Exclusion",
+ type => "bool",
+ hash => \%Exclus,
+ },
+ {
+ name => "Singleton",
+ type => "bool",
+ hash => \%Single,
+ },
+ {
+ name => "NonStDecomp",
+ type => "bool",
+ hash => \%NonStD,
+ },
+ {
+ name => "Comp2nd",
+ type => "bool",
+ hash => \%Comp2nd,
+ },
+);
my $file = "unfexc.h";
open FH, ">$file" or croak "$PACKAGE: $file can't be made";
binmode FH; select FH;
-print "bool isExclusion (UV uv) \n{\nreturn\n\t";
+ print << 'EOF';
+/*
+ * This file is auto-generated by mkheader.
+ * Any changes here will be lost!
+ */
+EOF
-while(@Exclus) {
- my $cur = shift @Exclus;
- if(@Exclus && $cur + 1 == $Exclus[0]) {
- print "($cur <= uv && uv <= ";
- while(@Exclus && $cur + 1 == $Exclus[0]) {
- $cur = shift @Exclus;
+foreach my $tbl (@boolfunc) {
+ my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
+ my $type = $tbl->{type};
+ my $name = $tbl->{name};
+ print "$type is$name (UV uv)\n{\nreturn\n\t";
+
+ while (@temp) {
+ my $cur = shift @temp;
+ if (@temp && $cur + 1 == $temp[0]) {
+ print "($cur <= uv && uv <= ";
+ while (@temp && $cur + 1 == $temp[0]) {
+ $cur = shift @temp;
+ }
+ print "$cur)";
+ print "\n\t|| " if @temp;
+ } else {
+ print "uv == $cur";
+ print "\n\t|| " if @temp;
+ }
}
- print "$cur)";
- print "\n\t|| " if @Exclus;
- } else {
- print "uv == $cur";
- print "\n\t|| " if @Exclus;
- }
+ print "\n\t? TRUE : FALSE;\n}\n\n";
}
-print "\n\t? TRUE : FALSE;\n}\n\n";
close FH;
+
####################################
my @tripletable = (
- {
- file => "unfcmb",
- name => "combin",
- type => "STDCHAR",
- hash => \%Combin,
- null => 0,
- },
- {
- file => "unfcan",
- name => "canon",
- type => "char*",
- hash => \%Canon,
- null => "NULL",
- },
- {
- file => "unfcpt",
- name => "compat",
- type => "char*",
- hash => \%Compat,
- null => "NULL",
- },
- {
- file => "unfcmp",
- name => "compos",
- type => "$structname *",
- hash => \%Comp1st,
- null => "NULL",
- init => $compinit,
- },
+ {
+ file => "unfcmb",
+ name => "combin",
+ type => "STDCHAR",
+ hash => \%Combin,
+ null => 0,
+ },
+ {
+ file => "unfcan",
+ name => "canon",
+ type => "char*",
+ hash => \%Canon,
+ null => "NULL",
+ },
+ {
+ file => "unfcpt",
+ name => "compat",
+ type => "char*",
+ hash => \%Compat,
+ null => "NULL",
+ },
+ {
+ file => "unfcmp",
+ name => "compos",
+ type => "$structname *",
+ hash => \%Comp1st,
+ null => "NULL",
+ init => $compinit,
+ },
);
foreach my $tbl (@tripletable) {
- my $file = "$tbl->{file}.h";
- my $head = "${prefix}$tbl->{name}";
- my $type = $tbl->{type};
- my $hash = $tbl->{hash};
- my $null = $tbl->{null};
- my $init = $tbl->{init};
-
- open FH, ">$file" or croak "$PACKAGE: $file can't be made";
- binmode FH; select FH;
- my %val;
-
- print FH << 'EOF';
+ my $file = "$tbl->{file}.h";
+ my $head = "${prefix}$tbl->{name}";
+ my $type = $tbl->{type};
+ my $hash = $tbl->{hash};
+ my $null = $tbl->{null};
+ my $init = $tbl->{init};
+
+ open FH, ">$file" or croak "$PACKAGE: $file can't be made";
+ binmode FH; select FH;
+ my %val;
+
+ print FH << 'EOF';
/*
* This file is auto-generated by mkheader.
* Any changes here will be lost!
*/
EOF
- print $init if defined $init;
-
- foreach my $uv (keys %$hash) {
- my @c = unpack 'CCCC', pack 'N', $uv;
- $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
- }
-
- foreach my $p (sort { $a <=> $b } keys %val) {
- next if ! $val{ $p };
- for(my $r = 0; $r < 256; $r++){
- next if ! $val{ $p }{ $r };
- printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
- for(my $c = 0; $c < 256; $c++){
- print "\t", defined $val{$p}{$r}{$c}
- ? "($type)".$val{$p}{$r}{$c} : $null;
- print ',' if $c != 255;
- print "\n" if $c % 8 == 7;
- }
- print "};\n\n";
+ print $init if defined $init;
+
+ foreach my $uv (keys %$hash) {
+ my @c = unpack 'CCCC', pack 'N', $uv;
+ $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
+ }
+
+ foreach my $p (sort { $a <=> $b } keys %val) {
+ next if ! $val{ $p };
+ for (my $r = 0; $r < 256; $r++) {
+ next if ! $val{ $p }{ $r };
+ printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
+ for (my $c = 0; $c < 256; $c++) {
+ print "\t", defined $val{$p}{$r}{$c}
+ ? "($type)".$val{$p}{$r}{$c}
+ : $null;
+ print ',' if $c != 255;
+ print "\n" if $c % 8 == 7;
+ }
+ print "};\n\n";
+ }
+ }
+ foreach my $p (sort { $a <=> $b } keys %val) {
+ next if ! $val{ $p };
+ printf "$type* ${head}_%02x [256] = {\n", $p;
+ for (my $r = 0; $r < 256; $r++) {
+ print $val{ $p }{ $r }
+ ? sprintf("${head}_%02x_%02x", $p, $r)
+ : "NULL";
+ print ',' if $r != 255;
+ print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
+ }
+ print "};\n\n";
}
- }
- foreach my $p (sort { $a <=> $b } keys %val) {
- next if ! $val{ $p };
- printf "$type* ${head}_%02x [256] = {\n", $p;
- for(my $r = 0; $r < 256; $r++){
- print $val{ $p }{ $r } ? sprintf("${head}_%02x_%02x", $p, $r) : "NULL";
- print ',' if $r != 255;
- print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
+ print "$type** $head [] = {\n";
+ for (my $p = 0; $p <= 0x10; $p++) {
+ print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
+ print ',' if $p != 0x10;
+ print "\n";
}
print "};\n\n";
- }
- print "$type** $head [] = {\n";
- for(my $p = 0; $p <= 0x10; $p++){
- print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
- print ',' if $p != 0x10;
- print "\n";
- }
- print "};\n\n";
- close FH;
+ close FH;
}
__END__
BEGIN {
if (ord("A") == 193) {
- print "1..0 # Unicode::Normalize not ported to EBCDIC\n";
- exit 0;
+ print "1..0 # Unicode::Normalize not ported to EBCDIC\n";
+ exit 0;
}
}
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 6 };
+BEGIN { plan tests => 10 };
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
print getCombinClass( 0) == 0
&& getCombinClass( 768) == 230
&& getCombinClass(1809) == 36
-# && getCombinClass(119143) == 1
+# && getCombinClass(119143) == 1 # U+1D167, a Unicode 3.1 character
? "ok" : "not ok", " 2\n";
print ! defined getCanon( 0)
&& 0xADF8 == getComposite(0x1100, 0x1173)
&& ! defined getComposite(0x1100, 0x11AF)
&& ! defined getComposite(0x1173, 0x11AF)
+ && ! defined getComposite(0xAC00, 0x11A7)
+ && 0xAC01 == getComposite(0xAC00, 0x11A8)
&& 0xAE00 == getComposite(0xADF8, 0x11AF)
? "ok" : "not ok", " 5\n";
print ! isExclusion( 0)
&& ! isExclusion(41)
- && isExclusion(2392)
- && isExclusion(3907)
- && isExclusion(64334)
+ && isExclusion(2392) # DEVANAGARI LETTER QA
+ && isExclusion(3907) # TIBETAN LETTER GHA
+ && isExclusion(64334) # HEBREW LETTER PE WITH RAFE
? "ok" : "not ok", " 6\n";
+print ! isSingleton( 0)
+ && isSingleton(0x212B) # ANGSTROM SIGN
+ ? "ok" : "not ok", " 7\n";
+
+print reorder("") eq ""
+ && reorder(pack("U*", 0x0041, 0x0300, 0x0315, 0x0313, 0x031b, 0x0061))
+ eq pack("U*", 0x0041, 0x031b, 0x0300, 0x0313, 0x0315, 0x0061)
+ && reorder(pack("U*", 0x00C1, 0x0300, 0x0315, 0x0313, 0x031b,
+ 0x0061, 0x309A, 0x3099))
+ eq pack("U*", 0x00C1, 0x031b, 0x0300, 0x0313, 0x0315,
+ 0x0061, 0x309A, 0x3099)
+ ? "ok" : "not ok", " 8\n";
+
+sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" }
+
+print answer(checkNFD("")) eq "YES"
+ && answer(checkNFC("")) eq "YES"
+ && answer(checkNFKD("")) eq "YES"
+ && answer(checkNFKC("")) eq "YES"
+ && answer(check("NFD", "")) eq "YES"
+ && answer(check("NFC", "")) eq "YES"
+ && answer(check("NFKD","")) eq "YES"
+ && answer(check("NFKC","")) eq "YES"
+# U+0000 to U+007F are prenormalized in all the normalization forms.
+ && answer(checkNFD("AZaz\t12!#`")) eq "YES"
+ && answer(checkNFC("AZaz\t12!#`")) eq "YES"
+ && answer(checkNFKD("AZaz\t12!#`")) eq "YES"
+ && answer(checkNFKC("AZaz\t12!#`")) eq "YES"
+ && answer(check("D", "AZaz\t12!#`")) eq "YES"
+ && answer(check("C", "AZaz\t12!#`")) eq "YES"
+ && answer(check("KD","AZaz\t12!#`")) eq "YES"
+ && answer(check("KC","AZaz\t12!#`")) eq "YES"
+ ? "ok" : "not ok", " 9\n";
+
+print 1
+ && answer(checkNFD(NFD(pack('U*', 0xC1, 0x1100, 0x1173, 0x11AF)))) eq "YES"
+ && answer(checkNFD(pack('U*', 0x20, 0xC1, 0x1100, 0x1173, 0x11AF))) eq "NO"
+ && answer(checkNFC(pack('U*', 0x20, 0xC1, 0x1173, 0x11AF))) eq "MAYBE"
+ && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xAE00, 0x1100))) eq "YES"
+ && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xAE00, 0x1100, 0x300))) eq "MAYBE"
+ && answer(checkNFC(pack('U*', 0x20, 0xC1, 0xFF71, 0x2025))) eq "YES"
+ && answer(check("NFC", pack('U*', 0x20, 0xC1, 0x212B, 0x300))) eq "NO"
+ && answer(checkNFKD(pack('U*', 0x20, 0xC1, 0xFF71, 0x2025))) eq "NO"
+ && answer(checkNFKC(pack('U*', 0x20, 0xC1, 0xAE00, 0x2025))) eq "NO"
+ ? "ok" : "not ok", " 10\n";