From: Rafael Garcia-Suarez Date: Wed, 6 Jun 2007 12:03:00 +0000 (+0000) Subject: Upgrade to Unicode::Normalize 1.02 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2b8d773d4eceb2953bfb94958c430a08dbb56032;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Unicode::Normalize 1.02 p4raw-id: //depot/perl@31338 --- diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes index b7eb1d5..1f1a693 100644 --- a/ext/Unicode/Normalize/Changes +++ b/ext/Unicode/Normalize/Changes @@ -1,5 +1,10 @@ Revision history for Perl extension Unicode::Normalize. +1.02 Tue Jun 5 22:46:45 2007 + - XSUB: mkheader, _U_stringify() - avoid unpack('C*') on unicode. + - test: short.t removed - pure perl is not inapprotiate for test of + unicode edge cases. + 1.01 Tue Jun 13 22:01:53 2006 - XSUB: sv_setpvn() needs cast to (char*). - XSUB: avoid double FETCH for tied scalar variables. diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm index b51177d..cb63fbf 100644 --- a/ext/Unicode/Normalize/Normalize.pm +++ b/ext/Unicode/Normalize/Normalize.pm @@ -13,7 +13,7 @@ use Carp; no warnings 'utf8'; -our $VERSION = '1.01'; +our $VERSION = '1.02'; our $PACKAGE = __PACKAGE__; require Exporter; @@ -470,7 +470,7 @@ In Unicode 4.1.0, the definition D2 of canonical composition (which affects NFC and NFKC) has been changed (see Public Review Issue #29 and recent UAX #15). This module has used the newer definition since the version 0.07 (Oct 31, 2001). -This module does not support normalization according to the older +This module will not support the normalization according to the older definition, even if the Unicode version implemented by perl is lower than 4.1.0. @@ -480,7 +480,7 @@ lower than 4.1.0. SADAHIRO Tomoyuki -Copyright(C) 2001-2006, SADAHIRO Tomoyuki. Japan. All rights reserved. +Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index b051e2a..792f4a3 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -416,7 +416,7 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) else { Renew(seq_ext, seq_max, UV); } - seq_ptr = seq_ext; /* till now use seq_ext */ + seq_ptr = seq_ext; /* use seq_ext from now */ } seq_ptr[cc_pos] = uv; ++cc_pos; @@ -593,7 +593,7 @@ NFC(src) OUTPUT: RETVAL -void +SV* checkNFD(src) SV * src PROTOTYPE: $ @@ -602,6 +602,7 @@ checkNFD(src) PREINIT: STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; + bool result = TRUE; CODE: s = (U8*)sv_2pvunicode(src,&srclen); e = s + srclen; @@ -613,17 +614,22 @@ checkNFD(src) croak(ErrRetlenIsZero, "checkNFD or -NFKD"); curCC = getCombinClass(uv); - if (preCC > curCC && curCC != 0) /* canonical ordering violated */ - XSRETURN_NO; - if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) - XSRETURN_NO; + if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ + result = FALSE; + break; + } + if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { + result = FALSE; + break; + } preCC = curCC; } - XSRETURN_YES; - + RETVAL = boolSV(result); + OUTPUT: + RETVAL -void +SV* checkNFC(src) SV * src PROTOTYPE: $ @@ -632,27 +638,31 @@ checkNFC(src) PREINIT: STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; - bool isMAYBE; + bool result = TRUE; + bool isMAYBE = FALSE; CODE: s = (U8*)sv_2pvunicode(src,&srclen); e = s + srclen; preCC = 0; - isMAYBE = FALSE; for (p = s; p < e; p += retlen) { UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); if (!retlen) croak(ErrRetlenIsZero, "checkNFC or -NFKC"); curCC = getCombinClass(uv); - if (preCC > curCC && curCC != 0) /* canonical ordering violated */ - XSRETURN_NO; + if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ + result = FALSE; + break; + } /* get NFC/NFKC property */ if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ ; /* YES */ - else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) - XSRETURN_NO; + else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { + result = FALSE; + break; + } else if (isComp2nd(uv)) isMAYBE = TRUE; else if (ix) { @@ -660,20 +670,22 @@ checkNFC(src) /* NFKC_NO when having compatibility mapping. */ canon = (char *) dec_canonical(uv); compat = (char *) dec_compat(uv); - if (compat && !(canon && strEQ(canon, compat))) - XSRETURN_NO; + if (compat && !(canon && strEQ(canon, compat))) { + result = FALSE; + break; + } } /* end of get NFC/NFKC property */ preCC = curCC; } - if (isMAYBE) + if (isMAYBE && result) /* NO precedes MAYBE */ XSRETURN_UNDEF; - else - XSRETURN_YES; - + RETVAL = boolSV(result); + OUTPUT: + RETVAL -void +SV* checkFCD(src) SV * src PROTOTYPE: $ @@ -682,12 +694,12 @@ checkFCD(src) PREINIT: STRLEN srclen, retlen; U8 *s, *e, *p, curCC, preCC; - bool isMAYBE; + bool result = TRUE; + bool isMAYBE = FALSE; CODE: s = (U8*)sv_2pvunicode(src,&srclen); e = s + srclen; preCC = 0; - isMAYBE = FALSE; for (p = s; p < e; p += retlen) { U8 *sCan; UV uvLead; @@ -711,12 +723,16 @@ checkFCD(src) curCC = getCombinClass(uvLead); - if (curCC != 0 && curCC < preCC) /* canonical ordering violated */ - XSRETURN_NO; + if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */ + result = FALSE; + break; + } if (ix) { - if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) - XSRETURN_NO; + if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { + result = FALSE; + break; + } else if (isComp2nd(uv)) isMAYBE = TRUE; } @@ -737,11 +753,11 @@ checkFCD(src) preCC = curCC; } } - if (isMAYBE) + if (isMAYBE && result) /* NO precedes MAYBE */ XSRETURN_UNDEF; - else - XSRETURN_YES; - + RETVAL = boolSV(result); + OUTPUT: + RETVAL U8 @@ -774,43 +790,44 @@ isComp2nd(uv) -void +SV* isNFD_NO(uv) UV uv PROTOTYPE: $ ALIAS: isNFKD_NO = 1 + PREINIT: + bool result = FALSE; CODE: if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) - XSRETURN_YES; /* NFD_NO or NFKD_NO */ - else - XSRETURN_NO; - + result = TRUE; /* NFD_NO or NFKD_NO */ + RETVAL = boolSV(result); + OUTPUT: + RETVAL -void +SV* isComp_Ex(uv) UV uv PROTOTYPE: $ ALIAS: isNFC_NO = 0 isNFKC_NO = 1 + PREINIT: + bool result = FALSE; CODE: if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) - XSRETURN_YES; /* NFC_NO or NFKC_NO */ + result = TRUE; /* NFC_NO or NFKC_NO */ else if (ix) { char *canon, *compat; canon = (char *) dec_canonical(uv); compat = (char *) dec_compat(uv); if (compat && (!canon || strNE(canon, compat))) - XSRETURN_YES; /* NFC_NO or NFKC_NO */ - else - XSRETURN_NO; + result = TRUE; /* NFC_NO or NFKC_NO */ } - else - XSRETURN_NO; - - + RETVAL = boolSV(result); + OUTPUT: + RETVAL SV* getComposite(uv, uv2) diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README index 79a2b5e..d7d1635 100644 --- a/ext/Unicode/Normalize/README +++ b/ext/Unicode/Normalize/README @@ -1,4 +1,4 @@ -Unicode/Normalize version 1.01 +Unicode/Normalize version 1.02 =================================== Unicode::Normalize - Unicode Normalization Forms @@ -64,7 +64,7 @@ which are included in recent perl core distributions. - unicore/Decomposition.pl (or unicode/Decomposition.pl) - unicore/CompositionExclusions.txt (or unicode/CompExcl.txt) -CAVEATS +NOTES (1) After these unicore/*.* files are updated: @@ -76,14 +76,14 @@ In the case of pure perl: Rebuilding is not necessary, as the data will be read on requirement. -(2) Normalize.pmN (pure perl module) may work without any other file - in this distribution (it must be renamed Normalize.pm, though) +(2) Normalize.pmN (a pure perl module) may work without any other file + in this distribution. It must be renamed Normalize.pm, though. COPYRIGHT AND LICENSE SADAHIRO Tomoyuki -Copyright(C) 2001-2006, SADAHIRO Tomoyuki. Japan. All rights reserved. +Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader index 68b58ca..f3ab5e2 100644 --- a/ext/Unicode/Normalize/mkheader +++ b/ext/Unicode/Normalize/mkheader @@ -80,8 +80,7 @@ sub decomposeHangul { VBase + $VIndex, $TIndex ? (TBase + $TIndex) : (), ); - wantarray ? @ret : pack('U*', @ret); - # any element in @ret greater than 0xFF, so no need of u2n conversion. + return @ret; } ########## getting full decomposion ########## @@ -214,9 +213,20 @@ sub _pack_U { return pack('U*', @_); } +sub split_into_char { + use bytes; + my $uni = shift; + my $len = length($uni); + my @ary; + for(my $i = 0; $i < $len; ++$i) { + push @ary, ord(substr($uni,$i,1)); + } + return @ary; +} + sub _U_stringify { sprintf '"%s"', join '', - map sprintf("\\x%02x", $_), unpack 'U0C*', _pack_U(@_); + map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_)); } foreach my $hash (\%Canon, \%Compat) { diff --git a/ext/Unicode/Normalize/t/fcdc.t b/ext/Unicode/Normalize/t/fcdc.t index 5fc78a5..1cc0db1 100644 --- a/ext/Unicode/Normalize/t/fcdc.t +++ b/ext/Unicode/Normalize/t/fcdc.t @@ -19,7 +19,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 68 }; +BEGIN { plan tests => 70 }; use Unicode::Normalize qw(:all); ok(1); # If we made it this far, we're ok. @@ -108,4 +108,6 @@ ok(answer(checkFCC(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla ok(answer(checkFCC(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla ok(answer(checkFCC("\x{AC01}\x{1100}\x{1161}")), 'MAYBE'); # hangul ok(answer(checkFCC("\x{212B}\x{F900}")), 'NO'); # compat +ok(answer(checkFCC("\x{212B}\x{0327}")), 'NO'); # compat +ok(answer(checkFCC("\x{0327}\x{212B}")), 'NO'); # compat diff --git a/ext/Unicode/Normalize/t/func.t b/ext/Unicode/Normalize/t/func.t index 794fff6..76ced03 100644 --- a/ext/Unicode/Normalize/t/func.t +++ b/ext/Unicode/Normalize/t/func.t @@ -19,7 +19,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 202 }; +BEGIN { plan tests => 211 }; use Unicode::Normalize qw(:all); ok(1); # If we made it this far, we're ok. @@ -202,14 +202,23 @@ ok(answer(check("KD","AZaz\t12!#`")), "YES"); ok(answer(check("KC","AZaz\t12!#`")), "YES"); ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES"); -ok(answer(checkNFD(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF))), "NO"); -ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0x1173, 0x11AF))), "MAYBE"); -ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100))), "YES"); -ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300))), "MAYBE"); -ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))), "YES"); -ok(answer(check("NFC", _pack_U(0x20, 0xC1, 0x212B, 0x300))), "NO"); -ok(answer(checkNFKD(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))), "NO"); -ok(answer(checkNFKC(_pack_U(0x20, 0xC1, 0xAE00, 0x2025))), "NO"); +ok(answer(checkNFD(hexU("20 C1 1100 1173 11AF"))), "NO"); +ok(answer(checkNFC(hexU("20 C1 1173 11AF"))), "MAYBE"); +ok(answer(checkNFC(hexU("20 C1 AE00 1100"))), "YES"); +ok(answer(checkNFC(hexU("20 C1 AE00 1100 0300"))), "MAYBE"); +ok(answer(checkNFC(hexU("212B 1100 0300"))), "NO"); +ok(answer(checkNFC(hexU("1100 0300 212B"))), "NO"); +ok(answer(checkNFC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring +ok(answer(checkNFC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla +ok(answer(checkNFC(hexU("20 C1 FF71 2025"))),"YES"); +ok(answer(check("NFC", hexU("20 C1 212B 300"))), "NO"); +ok(answer(checkNFKD(hexU("20 C1 FF71 2025"))), "NO"); +ok(answer(checkNFKC(hexU("20 C1 AE00 2025"))), "NO"); +ok(answer(checkNFKC(hexU("212B 1100 0300"))), "NO"); +ok(answer(checkNFKC(hexU("1100 0300 212B"))), "NO"); +ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring +ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla +ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO"); "012ABC" =~ /(\d+)(\w+)/; ok("012" eq NFC $1 && "ABC" eq NFC $2); diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t index e07f6f0..e48e16f 100644 --- a/ext/Unicode/Normalize/t/test.t +++ b/ext/Unicode/Normalize/t/test.t @@ -39,7 +39,6 @@ ok(NFKD("A"), "A"); ok(NFKC("A"), "A"); # don't modify the source -# don't modify the source my $sNFD = "\x{FA19}"; ok(NFD($sNFD), "\x{795E}"); ok($sNFD, "\x{FA19}");