Revision history for Perl extension Unicode::Normalize.
+1.02 Tue Jun 5 22:46:45 2007
+ - XSUB: mkheader, _U_stringify() - avoid unpack('C*') on unicode.
+ - test: short.t removed - pure perl is not inapprotiate for test of
+ unicode edge cases.
+
1.01 Tue Jun 13 22:01:53 2006
- XSUB: sv_setpvn() needs cast to (char*).
- XSUB: avoid double FETCH for tied scalar variables.
no warnings 'utf8';
-our $VERSION = '1.01';
+our $VERSION = '1.02';
our $PACKAGE = __PACKAGE__;
require Exporter;
affects NFC and NFKC) has been changed (see Public Review Issue #29
and recent UAX #15). This module has used the newer definition
since the version 0.07 (Oct 31, 2001).
-This module does not support normalization according to the older
+This module will not support the normalization according to the older
definition, even if the Unicode version implemented by perl is
lower than 4.1.0.
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
-Copyright(C) 2001-2006, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.
else {
Renew(seq_ext, seq_max, UV);
}
- seq_ptr = seq_ext; /* till now use seq_ext */
+ seq_ptr = seq_ext; /* use seq_ext from now */
}
seq_ptr[cc_pos] = uv;
++cc_pos;
OUTPUT:
RETVAL
-void
+SV*
checkNFD(src)
SV * src
PROTOTYPE: $
PREINIT:
STRLEN srclen, retlen;
U8 *s, *e, *p, curCC, preCC;
+ bool result = TRUE;
CODE:
s = (U8*)sv_2pvunicode(src,&srclen);
e = s + srclen;
croak(ErrRetlenIsZero, "checkNFD or -NFKD");
curCC = getCombinClass(uv);
- if (preCC > curCC && curCC != 0) /* canonical ordering violated */
- XSRETURN_NO;
- if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
- XSRETURN_NO;
+ if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
+ result = FALSE;
+ break;
+ }
+ if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
+ result = FALSE;
+ break;
+ }
preCC = curCC;
}
- XSRETURN_YES;
-
+ RETVAL = boolSV(result);
+ OUTPUT:
+ RETVAL
-void
+SV*
checkNFC(src)
SV * src
PROTOTYPE: $
PREINIT:
STRLEN srclen, retlen;
U8 *s, *e, *p, curCC, preCC;
- bool isMAYBE;
+ bool result = TRUE;
+ bool isMAYBE = FALSE;
CODE:
s = (U8*)sv_2pvunicode(src,&srclen);
e = s + srclen;
preCC = 0;
- isMAYBE = FALSE;
for (p = s; p < e; p += retlen) {
UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero, "checkNFC or -NFKC");
curCC = getCombinClass(uv);
- if (preCC > curCC && curCC != 0) /* canonical ordering violated */
- XSRETURN_NO;
+ if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
+ result = FALSE;
+ break;
+ }
/* get NFC/NFKC property */
if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
; /* YES */
- else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
- XSRETURN_NO;
+ else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
+ result = FALSE;
+ break;
+ }
else if (isComp2nd(uv))
isMAYBE = TRUE;
else if (ix) {
/* NFKC_NO when having compatibility mapping. */
canon = (char *) dec_canonical(uv);
compat = (char *) dec_compat(uv);
- if (compat && !(canon && strEQ(canon, compat)))
- XSRETURN_NO;
+ if (compat && !(canon && strEQ(canon, compat))) {
+ result = FALSE;
+ break;
+ }
} /* end of get NFC/NFKC property */
preCC = curCC;
}
- if (isMAYBE)
+ if (isMAYBE && result) /* NO precedes MAYBE */
XSRETURN_UNDEF;
- else
- XSRETURN_YES;
-
+ RETVAL = boolSV(result);
+ OUTPUT:
+ RETVAL
-void
+SV*
checkFCD(src)
SV * src
PROTOTYPE: $
PREINIT:
STRLEN srclen, retlen;
U8 *s, *e, *p, curCC, preCC;
- bool isMAYBE;
+ bool result = TRUE;
+ bool isMAYBE = FALSE;
CODE:
s = (U8*)sv_2pvunicode(src,&srclen);
e = s + srclen;
preCC = 0;
- isMAYBE = FALSE;
for (p = s; p < e; p += retlen) {
U8 *sCan;
UV uvLead;
curCC = getCombinClass(uvLead);
- if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
- XSRETURN_NO;
+ if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
+ result = FALSE;
+ break;
+ }
if (ix) {
- if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
- XSRETURN_NO;
+ if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
+ result = FALSE;
+ break;
+ }
else if (isComp2nd(uv))
isMAYBE = TRUE;
}
preCC = curCC;
}
}
- if (isMAYBE)
+ if (isMAYBE && result) /* NO precedes MAYBE */
XSRETURN_UNDEF;
- else
- XSRETURN_YES;
-
+ RETVAL = boolSV(result);
+ OUTPUT:
+ RETVAL
U8
-void
+SV*
isNFD_NO(uv)
UV uv
PROTOTYPE: $
ALIAS:
isNFKD_NO = 1
+ PREINIT:
+ bool result = FALSE;
CODE:
if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
- XSRETURN_YES; /* NFD_NO or NFKD_NO */
- else
- XSRETURN_NO;
-
+ result = TRUE; /* NFD_NO or NFKD_NO */
+ RETVAL = boolSV(result);
+ OUTPUT:
+ RETVAL
-void
+SV*
isComp_Ex(uv)
UV uv
PROTOTYPE: $
ALIAS:
isNFC_NO = 0
isNFKC_NO = 1
+ PREINIT:
+ bool result = FALSE;
CODE:
if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
- XSRETURN_YES; /* NFC_NO or NFKC_NO */
+ result = TRUE; /* NFC_NO or NFKC_NO */
else if (ix) {
char *canon, *compat;
canon = (char *) dec_canonical(uv);
compat = (char *) dec_compat(uv);
if (compat && (!canon || strNE(canon, compat)))
- XSRETURN_YES; /* NFC_NO or NFKC_NO */
- else
- XSRETURN_NO;
+ result = TRUE; /* NFC_NO or NFKC_NO */
}
- else
- XSRETURN_NO;
-
-
+ RETVAL = boolSV(result);
+ OUTPUT:
+ RETVAL
SV*
getComposite(uv, uv2)
-Unicode/Normalize version 1.01
+Unicode/Normalize version 1.02
===================================
Unicode::Normalize - Unicode Normalization Forms
- unicore/Decomposition.pl (or unicode/Decomposition.pl)
- unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
-CAVEATS
+NOTES
(1) After these unicore/*.* files are updated:
Rebuilding is not necessary,
as the data will be read on requirement.
-(2) Normalize.pmN (pure perl module) may work without any other file
- in this distribution (it must be renamed Normalize.pm, though)
+(2) Normalize.pmN (a pure perl module) may work without any other file
+ in this distribution. It must be renamed Normalize.pm, though.
COPYRIGHT AND LICENSE
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
-Copyright(C) 2001-2006, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
VBase + $VIndex,
$TIndex ? (TBase + $TIndex) : (),
);
- wantarray ? @ret : pack('U*', @ret);
- # any element in @ret greater than 0xFF, so no need of u2n conversion.
+ return @ret;
}
########## getting full decomposion ##########
return pack('U*', @_);
}
+sub split_into_char {
+ use bytes;
+ my $uni = shift;
+ my $len = length($uni);
+ my @ary;
+ for(my $i = 0; $i < $len; ++$i) {
+ push @ary, ord(substr($uni,$i,1));
+ }
+ return @ary;
+}
+
sub _U_stringify {
sprintf '"%s"', join '',
- map sprintf("\\x%02x", $_), unpack 'U0C*', _pack_U(@_);
+ map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_));
}
foreach my $hash (\%Canon, \%Compat) {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 68 };
+BEGIN { plan tests => 70 };
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
ok(answer(checkFCC(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla
ok(answer(checkFCC("\x{AC01}\x{1100}\x{1161}")), 'MAYBE'); # hangul
ok(answer(checkFCC("\x{212B}\x{F900}")), 'NO'); # compat
+ok(answer(checkFCC("\x{212B}\x{0327}")), 'NO'); # compat
+ok(answer(checkFCC("\x{0327}\x{212B}")), 'NO'); # compat
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 202 };
+BEGIN { plan tests => 211 };
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
ok(answer(check("KC","AZaz\t12!#`")), "YES");
ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES");
-ok(answer(checkNFD(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF))), "NO");
-ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0x1173, 0x11AF))), "MAYBE");
-ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100))), "YES");
-ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300))), "MAYBE");
-ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))), "YES");
-ok(answer(check("NFC", _pack_U(0x20, 0xC1, 0x212B, 0x300))), "NO");
-ok(answer(checkNFKD(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))), "NO");
-ok(answer(checkNFKC(_pack_U(0x20, 0xC1, 0xAE00, 0x2025))), "NO");
+ok(answer(checkNFD(hexU("20 C1 1100 1173 11AF"))), "NO");
+ok(answer(checkNFC(hexU("20 C1 1173 11AF"))), "MAYBE");
+ok(answer(checkNFC(hexU("20 C1 AE00 1100"))), "YES");
+ok(answer(checkNFC(hexU("20 C1 AE00 1100 0300"))), "MAYBE");
+ok(answer(checkNFC(hexU("212B 1100 0300"))), "NO");
+ok(answer(checkNFC(hexU("1100 0300 212B"))), "NO");
+ok(answer(checkNFC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring
+ok(answer(checkNFC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla
+ok(answer(checkNFC(hexU("20 C1 FF71 2025"))),"YES");
+ok(answer(check("NFC", hexU("20 C1 212B 300"))), "NO");
+ok(answer(checkNFKD(hexU("20 C1 FF71 2025"))), "NO");
+ok(answer(checkNFKC(hexU("20 C1 AE00 2025"))), "NO");
+ok(answer(checkNFKC(hexU("212B 1100 0300"))), "NO");
+ok(answer(checkNFKC(hexU("1100 0300 212B"))), "NO");
+ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring
+ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla
+ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO");
"012ABC" =~ /(\d+)(\w+)/;
ok("012" eq NFC $1 && "ABC" eq NFC $2);
ok(NFKC("A"), "A");
# don't modify the source
-# don't modify the source
my $sNFD = "\x{FA19}";
ok(NFD($sNFD), "\x{795E}");
ok($sNFD, "\x{FA19}");