Update Unicode::Normalize to 0.28

diff --git a/MANIFEST b/MANIFEST

index 854a1a7..352cdaa 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -762,8 +762,11 @@ ext/Unicode/Normalize/README       Unicode::Normalize
 ext/Unicode/Normalize/t/fcdc.t Unicode::Normalize
 ext/Unicode/Normalize/t/form.t Unicode::Normalize
 ext/Unicode/Normalize/t/func.t Unicode::Normalize
+ext/Unicode/Normalize/t/illegal.t      Unicode::Normalize
 ext/Unicode/Normalize/t/norm.t Unicode::Normalize
+ext/Unicode/Normalize/t/null.t Unicode::Normalize
 ext/Unicode/Normalize/t/proto.t        Unicode::Normalize
+ext/Unicode/Normalize/t/short.t        Unicode::Normalize
 ext/Unicode/Normalize/t/split.t        Unicode::Normalize
 ext/Unicode/Normalize/t/test.t Unicode::Normalize
 ext/util/make_ext              Used by Makefile to execute extension Makefiles
diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes

index 74b87e7..e63656c 100644 (file)
--- a/ext/Unicode/Normalize/Changes
+++ b/ext/Unicode/Normalize/Changes
@@ -1,112 +1,128 @@
 Revision history for Perl extension Unicode::Normalize.
 
+0.28  Sat Nov 22 23:46:24 2003
+    - XSUB: even if string contains a malformed, "short" Unicode character,
+      decompose() and reorder() will be safe. Garbage will be no longer added.
+    - added null.t and short.t.
+    - now truely added illegal.t (in 0.27, forgot to change MANIFEST).
+
+0.27  Sun Nov 16 13:16:21 2003
+    - Illegal code points (surrogate and noncharacter) will be allowed
+      (keep your code with <no warnings 'utf8';>);
+      but porting is not successful in the case of ((Pure Perl) and
+      (Perl 5.7.3 or before)).
+    - added illegal.t.
+
+0.26  Sat Nov 15 21:52:30 2003
+    - doc fix: s/FCD(?= is unique)/FCC/;
+
 0.25  Mon Oct  6 22:26:03 2003
-       - added form.t and proto.t.
+    - added form.t and proto.t.
 
 0.24  Sat Oct  4 17:57:10 2003
-       - supports FCD and FCC (UTN #5):
-         FCD(), normalize('FCD'), checkFCD(), check('FCD');
-         FCC(), normalize('FCC'), checkFCC(), check('FCC').
-       - changed INSTALLATION (cf. README).
-         * Initial state of the distribution is changed to XSUB.  To build
-           pure Perl, type <perl disableXS> before <perl Makefile.PL>.
-         * The purePerl-XSUB converter is now provided as two perl
-           script files, named "enableXS" and "disableXS".
-           (no longer <perl Makefile.PL xs> and <perl Makefile.PL noxs>.)
-         * simplified Makefile.PL.
-       - added fcdc.t and split.t.
+    - supports FCD and FCC (UTN #5):
+      FCD(), normalize('FCD'), checkFCD(), check('FCD');
+      FCC(), normalize('FCC'), checkFCC(), check('FCC').
+    - changed INSTALLATION (cf. README).
+      * Initial state of the distribution is changed to XSUB.  To build
+        pure Perl, type <perl disableXS> before <perl Makefile.PL>.
+      * The purePerl-XSUB converter is now provided as two perl
+        script files, named "enableXS" and "disableXS".
+        (no longer <perl Makefile.PL xs> and <perl Makefile.PL noxs>.)
+      * simplified Makefile.PL.
+    - added fcdc.t and split.t.
 
 0.23  Sat Jun 28 20:38:10 2003
-       - bug fix: \0-terminate in compose() in XS.
-       - tweak in pure perl: forced $codepoint to numeric (i.e. "+0065" to 65)
-       - tweak of POD and README.
+    - bug fix: \0-terminate in compose() in XS.
+    - tweak in pure perl: forced $codepoint to numeric (i.e. "+0065" to 65)
+    - tweak of POD and README.
 
 0.22  Mon Jun 09 22:23:10 2003
-       - internal tweak (again): pack_U() and unpack_U().
+    - internal tweak (again): pack_U() and unpack_U().
 
 0.21  Thu Apr 02 23:12:54 2003
-       - internal tweak: for (?un)pack 'U'.
+    - internal tweak: for (?un)pack 'U'.
 
 0.20  Sun Mar 02 13:29:25 2003
-       - decompose Hangul syllables in a decomposition mapping.
+    - decompose Hangul syllables in a decomposition mapping.
 
 0.18  ... unreleased
-       - synchronization with bleadperl.
-       - Change 16262: by me
+    - synchronization with bleadperl.
+    - Change 16262: by me
 
 0.17  Sun Apr 28 23:13:32 2002
-       - now normalize('NFC',$1) should work.
-       - Some croak()'s are added in mkheader.
-       - synchronization with bleadperl.
-       - Change 15596: by me
-       - Change 16136: by pudge
+    - now normalize('NFC',$1) should work.
+    - Some croak()'s are added in mkheader.
+    - synchronization with bleadperl.
+    - Change 15596: by me
+    - Change 16136: by pudge
 
 0.16  Thu Mar 21 13:36:14 2002
-       - synchronization with bleadperl.
-       - Change 15318: by jhi
-       - Change 15319: by jhi
+    - synchronization with bleadperl.
+    - Change 15318: by jhi
+    - Change 15319: by jhi
 
 0.15  Tue Mar 19 22:04:07 2002
-       - Quick check is implemented.
-       - decompose(), reorder(), and compose() are documented.
-       - The Non-XS version is also independent of Lingua::KO::Hangul::Util.
+    - Quick check is implemented.
+    - decompose(), reorder(), and compose() are documented.
+    - The Non-XS version is also independent of Lingua::KO::Hangul::Util.
 
 0.14  Sat Feb 02 20:40:14 2002
-       - synchronization with bleadperl.
-       - Change 14128: by Arthur
-       - Change 14129: by jhi
-       - Change 14156: 
-       - Change 14199: by Nikola Knezevic
-       - Change 14308: by Benjamin Goldberg
-       - Change 14370: by jhi
+    - synchronization with bleadperl.
+    - Change 14128: by Arthur
+    - Change 14129: by jhi
+    - Change 14156: 
+    - Change 14199: by Nikola Knezevic
+    - Change 14308: by Benjamin Goldberg
+    - Change 14370: by jhi
 
 0.13  Sat Dec 01 11:42:43 2001
-       - modify Makefile.PL to enable rebuild.
-         (This problem is pointed out by David Dyck.)
-       - Change 13388: by Jarkko Hietaniemi.
+    - modify Makefile.PL to enable rebuild.
+      (This problem is pointed out by David Dyck.)
+    - Change 13388: by Jarkko Hietaniemi.
 
 0.12  Wed Nov 29 22:49:02 2001
-       - documentation in .pod is appended to .pm and the .pod is removed.
-        (only POD in NON-XS refers to Lingua::KO::Hangul::Util.)
+    - documentation in .pod is appended to .pm and the .pod is removed.
+     (only POD in NON-XS refers to Lingua::KO::Hangul::Util.)
 
 0.11  Sat Nov 24 10:18:38 2001
-       - documentation of some functions for character data.
-       - Change 12909: by Jarkko Hietaniemi.
-       - Change 13228: by Peter Prymmer.
+    - documentation of some functions for character data.
+    - Change 12909: by Jarkko Hietaniemi.
+    - Change 13228: by Peter Prymmer.
 
 0.10  Sat Nov 03 16:30:20 2001
-       - The XS version is now independent of Lingua::KO::Hangul::Util.
-         (though the Non-XS version still requires that.)
+    - The XS version is now independent of Lingua::KO::Hangul::Util.
+      (though the Non-XS version still requires that.)
 
 0.09  Fri Nov 02 22:39:30 2001
-       - remove pTHX_.
+    - remove pTHX_.
 
 0.08  Thu Nov 01 23:20:42 2001
-       - use Lingua::KO::Hangul::Util 0.06 and remove "hangul.h".
+    - use Lingua::KO::Hangul::Util 0.06 and remove "hangul.h".
 
 0.07  Wed Oct 31 22:06:42 2001
-       - modify internal. decompose() - reorder() - compose().
+    - modify internal. decompose() - reorder() - compose().
 
 0.06  Sun Oct 28 14:28:46 2001
-       - an XS version.
-       (but the Non-XS version is also supported.)
+    - an XS version.
+    (but the Non-XS version is also supported.)
 
 0.05  Wed Oct 10 22:02:15 2001 (not released)
-       - %Compos contains unnecessary singletons
-         (though it did not cause any bug, only useless).
-         They will not be stored.
+    - %Compos contains unnecessary singletons
+      (though it did not cause any bug, only useless).
+      They will not be stored.
 
 0.04  Wed Aug 15 19:02:41 2001
-       - fix: NFD("") and NFKD("") must return "", not but undef.
+    - fix: NFD("") and NFKD("") must return "", not but undef.
 
 0.03  Fri Aug 10 22:44:18 2001
-       - rename the module name to Unicode::Normalize.
-       - normalize takes two arguments.
+    - rename the module name to Unicode::Normalize.
+    - normalize takes two arguments.
 
 0.02  Thu Aug  9 22:56:36 2001
-       - add function normalize
+    - add function normalize
 
 0.01  Mon Aug  6 21:45:11 2001
-       - original version; created by h2xs 1.21 with options
-               -A -X -n Text::Unicode::Normalize
+    - original version; created by h2xs 1.21 with options
+          -A -X -n Text::Unicode::Normalize
 
diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm

index 704303d..ef8ec68 100644 (file)
--- a/ext/Unicode/Normalize/Normalize.pm
+++ b/ext/Unicode/Normalize/Normalize.pm
@@ -11,7 +11,9 @@ use strict;
 use warnings;
 use Carp;
 
-our $VERSION = '0.25';
+no warnings 'utf8';
+
+our $VERSION = '0.28';
 our $PACKAGE = __PACKAGE__;
 
 require Exporter;
@@ -180,7 +182,7 @@ each other. C<FCD()> will return one of these equivalent forms.
 
 returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
 
-Note: FCD is unique, as well as four normalization forms (NF*).
+Note: FCC is unique, as well as four normalization forms (NF*).
 
 =item C<$normalized_string = normalize($form_name, $string)>
 
@@ -277,7 +279,7 @@ returns C<YES> (C<1>) or C<NO> (C<empty string>).
 
 returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
 
-If a string is not in C<FCD>, it must not be in <FCC>.
+If a string is not in FCD, it must not be in FCC.
 So C<checkFCC($not_FCD_string)> should return C<NO>.
 
 =item C<$result = check($form_name, $string)>
@@ -393,7 +395,7 @@ SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
 
 =over 4
 
-=item http://www.unicode.org/unicode/reports/tr15/
+=item http://www.unicode.org/reports/tr15/
 
 Unicode Normalization Forms - UAX #15
 
diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs

index 04d0256..2ce7cbc 100644 (file)
--- a/ext/Unicode/Normalize/Normalize.xs
+++ b/ext/Unicode/Normalize/Normalize.xs
@@ -20,7 +20,14 @@
 #define utf8n_to_uvuni  utf8_to_uv
 #endif /* utf8n_to_uvuni */
 
-/* if utf8n_to_uvuni() sets retlen to 0 when flags = 0 */
+/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
+#ifdef UTF8_ALLOW_BOM
+#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
+#else 
+#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
+#endif
+
+/* if utf8n_to_uvuni() sets retlen to 0 (?) */
 #define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"
 
 /* utf8_hop() hops back before start. Maybe broken UTF-8 */
@@ -64,7 +71,7 @@ typedef struct {
     STRLEN pos; /* position */
 } UNF_cc;
 
-int compare_cc (const void *a, const void *b)
+static int compare_cc (const void *a, const void *b)
 {
     int ret_cc;
     ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
@@ -75,7 +82,7 @@ int compare_cc (const void *a, const void *b)
         - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
 }
 
-U8* dec_canonical (UV uv)
+static U8* dec_canonical (UV uv)
 {
     U8 ***plane, **row;
     if (OVER_UTF_MAX(uv))
@@ -87,7 +94,7 @@ U8* dec_canonical (UV uv)
     return row ? row[uv & 0xff] : NULL;
 }
 
-U8* dec_compat (UV uv)
+static U8* dec_compat (UV uv)
 {
     U8 ***plane, **row;
     if (OVER_UTF_MAX(uv))
@@ -99,7 +106,7 @@ U8* dec_compat (UV uv)
     return row ? row[uv & 0xff] : NULL;
 }
 
-UV composite_uv (UV uv, UV uv2)
+static UV composite_uv (UV uv, UV uv2)
 {
     UNF_complist ***plane, **row, *cell, *i;
 
@@ -131,7 +138,7 @@ UV composite_uv (UV uv, UV uv2)
     return 0;
 }
 
-U8 getCombinClass (UV uv)
+static U8 getCombinClass (UV uv)
 {
     U8 **plane, *row;
     if (OVER_UTF_MAX(uv))
@@ -143,7 +150,7 @@ U8 getCombinClass (UV uv)
     return row ? row[uv & 0xff] : 0;
 }
 
-void sv_cat_decompHangul (SV* sv, UV uv)
+static void sv_cat_decompHangul (SV* sv, UV uv)
 {
     UV sindex, lindex, vindex, tindex;
     U8 *t, tmp[3 * UTF8_MAXLEN + 1];
@@ -162,7 +169,17 @@ void sv_cat_decompHangul (SV* sv, UV uv)
     if (tindex)
        t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
     *t = '\0';
-    sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
+    sv_catpvn(sv, (char *)tmp, t - tmp);
+}
+
+static void sv_cat_uvuni (SV* sv, UV uv)
+{
+    U8 *t, tmp[UTF8_MAXLEN + 1];
+
+    t = tmp;
+    t = uvuni_to_utf8(t, uv);
+    *t = '\0';
+    sv_catpvn(sv, (char *)tmp, t - tmp);
 }
 
 MODULE = Unicode::Normalize    PACKAGE = Unicode::Normalize
@@ -194,7 +211,7 @@ decompose(arg, compat = &PL_sv_no)
     s = (U8*)SvPV(src,srclen);
     e = s + srclen;
     for (p = s; p < e; p += retlen) {
-       uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+       uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
 
@@ -205,7 +222,7 @@ decompose(arg, compat = &PL_sv_no)
            if (r)
                sv_catpv(dst, (char *)r);
            else
-               sv_catpvn(dst, (char *)p, retlen);
+               sv_cat_uvuni(dst, uv);
        }
     }
     RETVAL = dst;
@@ -222,8 +239,10 @@ reorder(arg)
     SV *src, *dst;
     STRLEN srclen, dstlen, retlen, stk_cc_max;
     U8 *s, *e, *p, *d, curCC;
-    UV uv;
+    UV uv, uvlast;
     UNF_cc * stk_cc;
+    STRLEN i, cc_pos;
+    bool valid_uvlast;
   CODE:
     if (SvUTF8(arg)) {
        src = arg;
@@ -233,49 +252,46 @@ reorder(arg)
     }
 
     s = (U8*)SvPV(src, srclen);
-
+    e = s + srclen;
     dstlen = srclen + 1;
     dst = newSV(dstlen);
-    sv_setpvn(dst,(const char*)s,srclen);
+    (void)SvPOK_only(dst);
     SvUTF8_on(dst);
+    d = (U8*)SvPVX(dst);
 
     stk_cc_max = 10; /* enough as an initial value? */
     New(0, stk_cc, stk_cc_max, UNF_cc);
 
-    d = (U8*)SvPV(dst,dstlen);
-    e = d + dstlen;
-
-    for (p = d; p < e;) {
-       U8 *cc_in;
-       STRLEN cc_len, cc_iter, cc_pos;
-
-       uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+    for (p = s; p < e;) {
+       uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
        p += retlen;
 
-       
-
        curCC = getCombinClass(uv);
-       if (! (curCC && p < e))
+       if (curCC == 0) {
+           d = uvuni_to_utf8(d, uv);
            continue;
-       else
-           cc_in = p - retlen;
+       }
 
        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;
 
+       valid_uvlast = FALSE;
        while (p < e) {
-           uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+           uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen)
                croak(ErrRetlenIsZero);
            p += retlen;
 
            curCC = getCombinClass(uv);
-           if (!curCC)
+           if (curCC == 0) {
+               uvlast = uv;
+               valid_uvlast = TRUE;
                break;
+           }
 
            cc_pos++;
            if (stk_cc_max <= cc_pos) { /* extend if need */
@@ -287,18 +303,21 @@ reorder(arg)
            stk_cc[cc_pos].pos = cc_pos;
        }
 
-        /* only one c.c. in cc_len from cc_in, no need of reordering */
-       if (!cc_pos)
-           continue;
-
-       qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
+       /* reordered if there are two c.c.'s */
+       if (cc_pos) {
+           qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
+       }
 
-       cc_len = p - cc_in;
-       p = cc_in;
-       for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
-           p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
+       for (i = 0; i <= cc_pos; i++) {
+           d = uvuni_to_utf8(d, stk_cc[i].uv);
+       }
+       if (valid_uvlast)
+       {
+           d = uvuni_to_utf8(d, uvlast);
        }
     }
+    *d = '\0';
+    SvCUR_set(dst, d - (U8*)SvPVX(dst));
     Safefree(stk_cc);
     RETVAL = dst;
   OUTPUT:
@@ -341,7 +360,7 @@ compose(arg)
 
     for (p = s; p < e;) {
        if (beginning) {
-           uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
+           uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen)
                croak(ErrRetlenIsZero);
            p += retlen;
@@ -359,7 +378,7 @@ compose(arg)
 
     /* to the next Starter */
        while (p < e) {
-           uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+           uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen)
                croak(ErrRetlenIsZero);
            p += retlen;
@@ -433,7 +452,7 @@ checkNFD(arg)
 
     preCC = 0;
     for (p = s; p < e; p += retlen) {
-       uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+       uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
 
@@ -474,7 +493,7 @@ checkNFC(arg)
     preCC = 0;
     isMAYBE = FALSE;
     for (p = s; p < e; p += retlen) {
-       uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+       uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
 
@@ -535,7 +554,7 @@ checkFCD(arg)
     preCC = 0;
     isMAYBE = FALSE;
     for (p = s; p < e; p += retlen) {
-       uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+       uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
 
@@ -543,7 +562,7 @@ checkFCD(arg)
 
        if (sCan) {
            canlen = (STRLEN)strlen((char *) sCan);
-           uvLead = utf8n_to_uvuni(sCan, canlen, &canret, 0);
+           uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
        }
        else {
            uvLead = uv;
@@ -566,7 +585,7 @@ checkFCD(arg)
            pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
-           uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, 0);
+           uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
            preCC = getCombinClass(uvTrail);
        }
        else {
@@ -712,7 +731,7 @@ splitOnLastStarter(arg)
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
-       uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+       uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
     }
diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README

index 4fbdfe1..34e24e3 100644 (file)
--- a/ext/Unicode/Normalize/README
+++ b/ext/Unicode/Normalize/README
@@ -1,4 +1,4 @@
-Unicode/Normalize version 0.25
+Unicode/Normalize version 0.28
 ===================================
 
 Unicode::Normalize - Unicode Normalization Forms
@@ -24,7 +24,8 @@ SYNOPSIS
 
 INSTALLATION
 
-Perl 5.6.1 or later (Caution: Perl 5.6.0 is not recommended.)
+Perl 5.6.1 or later.
+(Perl 5.8.0 or later is recommended.)
 
 To install this module (XSUB: needs a C compiler), type the following:
 
@@ -85,9 +86,7 @@ CAVEAT
 
 COPYRIGHT AND LICENCE
 
-  SADAHIRO Tomoyuki
-
-  SADAHIRO@cpan.org
+  SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
 
   http://homepage1.nifty.com/nomenclator/perl/
 
diff --git a/ext/Unicode/Normalize/t/illegal.t b/ext/Unicode/Normalize/t/illegal.t

new file mode 100644 (file)

index 0000000..76cd833
--- /dev/null
+++ b/ext/Unicode/Normalize/t/illegal.t
@@ -0,0 +1,66 @@
+
+BEGIN {
+    unless ("A" eq pack('U', 0x41)) {
+       print "1..0 # Unicode::Normalize " .
+           "cannot stringify a Unicode code point\n";
+       exit 0;
+    }
+}
+
+BEGIN {
+    if ($ENV{PERL_CORE}) {
+        chdir('t') if -d 't';
+        @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+    }
+}
+
+#########################
+
+use 5.006001;
+use Test;
+use strict;
+use warnings;
+
+BEGIN {
+    use Unicode::Normalize qw(:all);
+
+    unless (exists &Unicode::Normalize::bootstrap or 5.008 <= $]) {
+       print "1..0 # skipped: XSUB, or Perl 5.8.0 or later".
+               " needed for this test\n";
+       print $@;
+       exit;
+    }
+}
+
+BEGIN { plan tests => 112 };
+
+#########################
+
+no warnings qw(utf8);
+# To avoid warning in Test.pm, EXPR in ok(EXPR) must be boolean.
+
+for my $u (0xD800, 0xDFFF, 0xFDD0, 0xFDEF, 0xFEFF, 0xFFFE, 0xFFFF,
+          0x1FFFF, 0x10FFFF, 0x110000, 0x7FFFFFFF)
+{
+    my $c = chr $u;
+    ok($c eq NFD($c));  # 1
+    ok($c eq NFC($c));  # 2
+    ok($c eq NFKD($c)); # 3
+    ok($c eq NFKC($c)); # 4
+    ok($c eq FCD($c));  # 5
+    ok($c eq FCC($c));  # 6
+    ok($c eq decompose($c));   # 7
+    ok($c eq decompose($c,1)); # 8
+    ok($c eq reorder($c));     # 9
+    ok($c eq compose($c));     # 10
+}
+
+our $proc;    # before the last starter
+our $unproc;  # the last starter and after
+
+sub _pack_U   { Unicode::Normalize::pack_U(@_) }
+
+($proc, $unproc) = splitOnLastStarter(_pack_U(0x41, 0x300, 0x327, 0xFFFF));
+ok($proc   eq _pack_U(0x41, 0x300, 0x327));
+ok($unproc eq "\x{FFFF}");
+
diff --git a/ext/Unicode/Normalize/t/null.t b/ext/Unicode/Normalize/t/null.t

new file mode 100644 (file)

index 0000000..ae75752
--- /dev/null
+++ b/ext/Unicode/Normalize/t/null.t
@@ -0,0 +1,49 @@
+
+BEGIN {
+    unless ("A" eq pack('U', 0x41)) {
+       print "1..0 # Unicode::Normalize " .
+           "cannot stringify a Unicode code point\n";
+       exit 0;
+    }
+}
+
+BEGIN {
+    if ($ENV{PERL_CORE}) {
+        chdir('t') if -d 't';
+        @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+    }
+}
+
+#########################
+
+use strict;
+use warnings;
+
+use Unicode::Normalize qw(:all);
+print "1..8\n";
+
+print "ok 1\n";
+
+# if $_ is not NULL-terminated, test may fail.
+
+$_ = compose('abc');
+print /c$/ ? "ok" : "not ok", " 2\n";
+
+$_ = decompose('abc');
+print /c$/ ? "ok" : "not ok", " 3\n";
+
+$_ = reorder('abc');
+print /c$/ ? "ok" : "not ok", " 4\n";
+
+$_ = NFD('abc');
+print /c$/ ? "ok" : "not ok", " 5\n";
+
+$_ = NFC('abc');
+print /c$/ ? "ok" : "not ok", " 6\n";
+
+$_ = NFKD('abc');
+print /c$/ ? "ok" : "not ok", " 7\n";
+
+$_ = NFKC('abc');
+print /c$/ ? "ok" : "not ok", " 8\n";
+
diff --git a/ext/Unicode/Normalize/t/short.t b/ext/Unicode/Normalize/t/short.t

new file mode 100644 (file)

index 0000000..1f185ac
--- /dev/null
+++ b/ext/Unicode/Normalize/t/short.t
@@ -0,0 +1,55 @@
+
+BEGIN {
+    unless ("A" eq pack('U', 0x41)) {
+       print "1..0 # Unicode::Normalize " .
+           "cannot stringify a Unicode code point\n";
+       exit 0;
+    }
+}
+
+BEGIN {
+    if ($ENV{PERL_CORE}) {
+        chdir('t') if -d 't';
+        @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+    }
+}
+
+#########################
+
+use strict;
+use Unicode::Normalize qw(:all);
+
+print "1..8\n";
+print "ok 1\n";
+
+#########################
+
+no warnings qw(utf8);
+
+our $a = "\x{3042}"; # 3-byte length (in UTF-8/UTF-EBCDIC)
+{
+    use bytes;
+    substr($a,1,length($a), ''); # remove trailing octets
+}
+
+print NFD($a) eq "\0"
+   ? "ok" : "not ok", " 2\n";
+
+print NFKD($a) eq "\0"
+   ? "ok" : "not ok", " 3\n";
+
+print NFC($a) eq "\0"
+   ? "ok" : "not ok", " 4\n";
+
+print NFKC($a) eq "\0"
+   ? "ok" : "not ok", " 5\n";
+
+print decompose($a) eq "\0"
+   ? "ok" : "not ok", " 6\n";
+
+print reorder($a) eq "\0"
+   ? "ok" : "not ok", " 7\n";
+
+print compose($a) eq "\0"
+   ? "ok" : "not ok", " 8\n";
+
MANIFEST		patch \| blob \| blame \| history
ext/Unicode/Normalize/Changes		patch \| blob \| blame \| history
ext/Unicode/Normalize/Normalize.pm		patch \| blob \| blame \| history
ext/Unicode/Normalize/Normalize.xs		patch \| blob \| blame \| history
ext/Unicode/Normalize/README		patch \| blob \| blame \| history
ext/Unicode/Normalize/t/illegal.t	[new file with mode: 0644]	patch \| blob
ext/Unicode/Normalize/t/null.t	[new file with mode: 0644]	patch \| blob
ext/Unicode/Normalize/t/short.t	[new file with mode: 0644]	patch \| blob