ext/Unicode/Normalize/t/fcdc.t Unicode::Normalize
ext/Unicode/Normalize/t/form.t Unicode::Normalize
ext/Unicode/Normalize/t/func.t Unicode::Normalize
+ext/Unicode/Normalize/t/illegal.t Unicode::Normalize
ext/Unicode/Normalize/t/norm.t Unicode::Normalize
+ext/Unicode/Normalize/t/null.t Unicode::Normalize
ext/Unicode/Normalize/t/proto.t Unicode::Normalize
+ext/Unicode/Normalize/t/short.t Unicode::Normalize
ext/Unicode/Normalize/t/split.t Unicode::Normalize
ext/Unicode/Normalize/t/test.t Unicode::Normalize
ext/util/make_ext Used by Makefile to execute extension Makefiles
Revision history for Perl extension Unicode::Normalize.
+0.28 Sat Nov 22 23:46:24 2003
+ - XSUB: even if string contains a malformed, "short" Unicode character,
+ decompose() and reorder() will be safe. Garbage will be no longer added.
+ - added null.t and short.t.
+ - now truely added illegal.t (in 0.27, forgot to change MANIFEST).
+
+0.27 Sun Nov 16 13:16:21 2003
+ - Illegal code points (surrogate and noncharacter) will be allowed
+ (keep your code with <no warnings 'utf8';>);
+ but porting is not successful in the case of ((Pure Perl) and
+ (Perl 5.7.3 or before)).
+ - added illegal.t.
+
+0.26 Sat Nov 15 21:52:30 2003
+ - doc fix: s/FCD(?= is unique)/FCC/;
+
0.25 Mon Oct 6 22:26:03 2003
- - added form.t and proto.t.
+ - added form.t and proto.t.
0.24 Sat Oct 4 17:57:10 2003
- - supports FCD and FCC (UTN #5):
- FCD(), normalize('FCD'), checkFCD(), check('FCD');
- FCC(), normalize('FCC'), checkFCC(), check('FCC').
- - changed INSTALLATION (cf. README).
- * Initial state of the distribution is changed to XSUB. To build
- pure Perl, type <perl disableXS> before <perl Makefile.PL>.
- * The purePerl-XSUB converter is now provided as two perl
- script files, named "enableXS" and "disableXS".
- (no longer <perl Makefile.PL xs> and <perl Makefile.PL noxs>.)
- * simplified Makefile.PL.
- - added fcdc.t and split.t.
+ - supports FCD and FCC (UTN #5):
+ FCD(), normalize('FCD'), checkFCD(), check('FCD');
+ FCC(), normalize('FCC'), checkFCC(), check('FCC').
+ - changed INSTALLATION (cf. README).
+ * Initial state of the distribution is changed to XSUB. To build
+ pure Perl, type <perl disableXS> before <perl Makefile.PL>.
+ * The purePerl-XSUB converter is now provided as two perl
+ script files, named "enableXS" and "disableXS".
+ (no longer <perl Makefile.PL xs> and <perl Makefile.PL noxs>.)
+ * simplified Makefile.PL.
+ - added fcdc.t and split.t.
0.23 Sat Jun 28 20:38:10 2003
- - bug fix: \0-terminate in compose() in XS.
- - tweak in pure perl: forced $codepoint to numeric (i.e. "+0065" to 65)
- - tweak of POD and README.
+ - bug fix: \0-terminate in compose() in XS.
+ - tweak in pure perl: forced $codepoint to numeric (i.e. "+0065" to 65)
+ - tweak of POD and README.
0.22 Mon Jun 09 22:23:10 2003
- - internal tweak (again): pack_U() and unpack_U().
+ - internal tweak (again): pack_U() and unpack_U().
0.21 Thu Apr 02 23:12:54 2003
- - internal tweak: for (?un)pack 'U'.
+ - internal tweak: for (?un)pack 'U'.
0.20 Sun Mar 02 13:29:25 2003
- - decompose Hangul syllables in a decomposition mapping.
+ - decompose Hangul syllables in a decomposition mapping.
0.18 ... unreleased
- - synchronization with bleadperl.
- - Change 16262: by me
+ - synchronization with bleadperl.
+ - Change 16262: by me
0.17 Sun Apr 28 23:13:32 2002
- - now normalize('NFC',$1) should work.
- - Some croak()'s are added in mkheader.
- - synchronization with bleadperl.
- - Change 15596: by me
- - Change 16136: by pudge
+ - now normalize('NFC',$1) should work.
+ - Some croak()'s are added in mkheader.
+ - synchronization with bleadperl.
+ - Change 15596: by me
+ - Change 16136: by pudge
0.16 Thu Mar 21 13:36:14 2002
- - synchronization with bleadperl.
- - Change 15318: by jhi
- - Change 15319: by jhi
+ - synchronization with bleadperl.
+ - Change 15318: by jhi
+ - Change 15319: by jhi
0.15 Tue Mar 19 22:04:07 2002
- - Quick check is implemented.
- - decompose(), reorder(), and compose() are documented.
- - The Non-XS version is also independent of Lingua::KO::Hangul::Util.
+ - Quick check is implemented.
+ - decompose(), reorder(), and compose() are documented.
+ - The Non-XS version is also independent of Lingua::KO::Hangul::Util.
0.14 Sat Feb 02 20:40:14 2002
- - synchronization with bleadperl.
- - Change 14128: by Arthur
- - Change 14129: by jhi
- - Change 14156:
- - Change 14199: by Nikola Knezevic
- - Change 14308: by Benjamin Goldberg
- - Change 14370: by jhi
+ - synchronization with bleadperl.
+ - Change 14128: by Arthur
+ - Change 14129: by jhi
+ - Change 14156:
+ - Change 14199: by Nikola Knezevic
+ - Change 14308: by Benjamin Goldberg
+ - Change 14370: by jhi
0.13 Sat Dec 01 11:42:43 2001
- - modify Makefile.PL to enable rebuild.
- (This problem is pointed out by David Dyck.)
- - Change 13388: by Jarkko Hietaniemi.
+ - modify Makefile.PL to enable rebuild.
+ (This problem is pointed out by David Dyck.)
+ - Change 13388: by Jarkko Hietaniemi.
0.12 Wed Nov 29 22:49:02 2001
- - documentation in .pod is appended to .pm and the .pod is removed.
- (only POD in NON-XS refers to Lingua::KO::Hangul::Util.)
+ - documentation in .pod is appended to .pm and the .pod is removed.
+ (only POD in NON-XS refers to Lingua::KO::Hangul::Util.)
0.11 Sat Nov 24 10:18:38 2001
- - documentation of some functions for character data.
- - Change 12909: by Jarkko Hietaniemi.
- - Change 13228: by Peter Prymmer.
+ - documentation of some functions for character data.
+ - Change 12909: by Jarkko Hietaniemi.
+ - Change 13228: by Peter Prymmer.
0.10 Sat Nov 03 16:30:20 2001
- - The XS version is now independent of Lingua::KO::Hangul::Util.
- (though the Non-XS version still requires that.)
+ - The XS version is now independent of Lingua::KO::Hangul::Util.
+ (though the Non-XS version still requires that.)
0.09 Fri Nov 02 22:39:30 2001
- - remove pTHX_.
+ - remove pTHX_.
0.08 Thu Nov 01 23:20:42 2001
- - use Lingua::KO::Hangul::Util 0.06 and remove "hangul.h".
+ - use Lingua::KO::Hangul::Util 0.06 and remove "hangul.h".
0.07 Wed Oct 31 22:06:42 2001
- - modify internal. decompose() - reorder() - compose().
+ - modify internal. decompose() - reorder() - compose().
0.06 Sun Oct 28 14:28:46 2001
- - an XS version.
- (but the Non-XS version is also supported.)
+ - an XS version.
+ (but the Non-XS version is also supported.)
0.05 Wed Oct 10 22:02:15 2001 (not released)
- - %Compos contains unnecessary singletons
- (though it did not cause any bug, only useless).
- They will not be stored.
+ - %Compos contains unnecessary singletons
+ (though it did not cause any bug, only useless).
+ They will not be stored.
0.04 Wed Aug 15 19:02:41 2001
- - fix: NFD("") and NFKD("") must return "", not but undef.
+ - fix: NFD("") and NFKD("") must return "", not but undef.
0.03 Fri Aug 10 22:44:18 2001
- - rename the module name to Unicode::Normalize.
- - normalize takes two arguments.
+ - rename the module name to Unicode::Normalize.
+ - normalize takes two arguments.
0.02 Thu Aug 9 22:56:36 2001
- - add function normalize
+ - add function normalize
0.01 Mon Aug 6 21:45:11 2001
- - original version; created by h2xs 1.21 with options
- -A -X -n Text::Unicode::Normalize
+ - original version; created by h2xs 1.21 with options
+ -A -X -n Text::Unicode::Normalize
use warnings;
use Carp;
-our $VERSION = '0.25';
+no warnings 'utf8';
+
+our $VERSION = '0.28';
our $PACKAGE = __PACKAGE__;
require Exporter;
returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
-Note: FCD is unique, as well as four normalization forms (NF*).
+Note: FCC is unique, as well as four normalization forms (NF*).
=item C<$normalized_string = normalize($form_name, $string)>
returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
-If a string is not in C<FCD>, it must not be in <FCC>.
+If a string is not in FCD, it must not be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.
=item C<$result = check($form_name, $string)>
=over 4
-=item http://www.unicode.org/unicode/reports/tr15/
+=item http://www.unicode.org/reports/tr15/
Unicode Normalization Forms - UAX #15
#define utf8n_to_uvuni utf8_to_uv
#endif /* utf8n_to_uvuni */
-/* if utf8n_to_uvuni() sets retlen to 0 when flags = 0 */
+/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
+#ifdef UTF8_ALLOW_BOM
+#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
+#else
+#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
+#endif
+
+/* if utf8n_to_uvuni() sets retlen to 0 (?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"
/* utf8_hop() hops back before start. Maybe broken UTF-8 */
STRLEN pos; /* position */
} UNF_cc;
-int compare_cc (const void *a, const void *b)
+static int compare_cc (const void *a, const void *b)
{
int ret_cc;
ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
- ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
}
-U8* dec_canonical (UV uv)
+static U8* dec_canonical (UV uv)
{
U8 ***plane, **row;
if (OVER_UTF_MAX(uv))
return row ? row[uv & 0xff] : NULL;
}
-U8* dec_compat (UV uv)
+static U8* dec_compat (UV uv)
{
U8 ***plane, **row;
if (OVER_UTF_MAX(uv))
return row ? row[uv & 0xff] : NULL;
}
-UV composite_uv (UV uv, UV uv2)
+static UV composite_uv (UV uv, UV uv2)
{
UNF_complist ***plane, **row, *cell, *i;
return 0;
}
-U8 getCombinClass (UV uv)
+static U8 getCombinClass (UV uv)
{
U8 **plane, *row;
if (OVER_UTF_MAX(uv))
return row ? row[uv & 0xff] : 0;
}
-void sv_cat_decompHangul (SV* sv, UV uv)
+static void sv_cat_decompHangul (SV* sv, UV uv)
{
UV sindex, lindex, vindex, tindex;
U8 *t, tmp[3 * UTF8_MAXLEN + 1];
if (tindex)
t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
*t = '\0';
- sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
+ sv_catpvn(sv, (char *)tmp, t - tmp);
+}
+
+static void sv_cat_uvuni (SV* sv, UV uv)
+{
+ U8 *t, tmp[UTF8_MAXLEN + 1];
+
+ t = tmp;
+ t = uvuni_to_utf8(t, uv);
+ *t = '\0';
+ sv_catpvn(sv, (char *)tmp, t - tmp);
}
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
s = (U8*)SvPV(src,srclen);
e = s + srclen;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero);
if (r)
sv_catpv(dst, (char *)r);
else
- sv_catpvn(dst, (char *)p, retlen);
+ sv_cat_uvuni(dst, uv);
}
}
RETVAL = dst;
SV *src, *dst;
STRLEN srclen, dstlen, retlen, stk_cc_max;
U8 *s, *e, *p, *d, curCC;
- UV uv;
+ UV uv, uvlast;
UNF_cc * stk_cc;
+ STRLEN i, cc_pos;
+ bool valid_uvlast;
CODE:
if (SvUTF8(arg)) {
src = arg;
}
s = (U8*)SvPV(src, srclen);
-
+ e = s + srclen;
dstlen = srclen + 1;
dst = newSV(dstlen);
- sv_setpvn(dst,(const char*)s,srclen);
+ (void)SvPOK_only(dst);
SvUTF8_on(dst);
+ d = (U8*)SvPVX(dst);
stk_cc_max = 10; /* enough as an initial value? */
New(0, stk_cc, stk_cc_max, UNF_cc);
- d = (U8*)SvPV(dst,dstlen);
- e = d + dstlen;
-
- for (p = d; p < e;) {
- U8 *cc_in;
- STRLEN cc_len, cc_iter, cc_pos;
-
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ for (p = s; p < e;) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero);
p += retlen;
-
-
curCC = getCombinClass(uv);
- if (! (curCC && p < e))
+ if (curCC == 0) {
+ d = uvuni_to_utf8(d, uv);
continue;
- else
- cc_in = p - retlen;
+ }
cc_pos = 0;
stk_cc[cc_pos].cc = curCC;
stk_cc[cc_pos].uv = uv;
stk_cc[cc_pos].pos = cc_pos;
+ valid_uvlast = FALSE;
while (p < e) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero);
p += retlen;
curCC = getCombinClass(uv);
- if (!curCC)
+ if (curCC == 0) {
+ uvlast = uv;
+ valid_uvlast = TRUE;
break;
+ }
cc_pos++;
if (stk_cc_max <= cc_pos) { /* extend if need */
stk_cc[cc_pos].pos = cc_pos;
}
- /* only one c.c. in cc_len from cc_in, no need of reordering */
- if (!cc_pos)
- continue;
-
- qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
+ /* reordered if there are two c.c.'s */
+ if (cc_pos) {
+ qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
+ }
- cc_len = p - cc_in;
- p = cc_in;
- for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
- p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
+ for (i = 0; i <= cc_pos; i++) {
+ d = uvuni_to_utf8(d, stk_cc[i].uv);
+ }
+ if (valid_uvlast)
+ {
+ d = uvuni_to_utf8(d, uvlast);
}
}
+ *d = '\0';
+ SvCUR_set(dst, d - (U8*)SvPVX(dst));
Safefree(stk_cc);
RETVAL = dst;
OUTPUT:
for (p = s; p < e;) {
if (beginning) {
- uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero);
p += retlen;
/* to the next Starter */
while (p < e) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero);
p += retlen;
preCC = 0;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero);
preCC = 0;
isMAYBE = FALSE;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero);
preCC = 0;
isMAYBE = FALSE;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
croak(ErrRetlenIsZero);
if (sCan) {
canlen = (STRLEN)strlen((char *) sCan);
- uvLead = utf8n_to_uvuni(sCan, canlen, &canret, 0);
+ uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
}
else {
uvLead = uv;
pCan = utf8_hop(eCan, -1);
if (pCan < sCan)
croak(ErrHopBeforeStart);
- uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, 0);
+ uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
preCC = getCombinClass(uvTrail);
}
else {
p = utf8_hop(p, -1);
if (p < s)
croak(ErrHopBeforeStart);
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (getCombinClass(uv) == 0) /* Last Starter found */
break;
}
-Unicode/Normalize version 0.25
+Unicode/Normalize version 0.28
===================================
Unicode::Normalize - Unicode Normalization Forms
INSTALLATION
-Perl 5.6.1 or later (Caution: Perl 5.6.0 is not recommended.)
+Perl 5.6.1 or later.
+(Perl 5.8.0 or later is recommended.)
To install this module (XSUB: needs a C compiler), type the following:
COPYRIGHT AND LICENCE
- SADAHIRO Tomoyuki
-
- SADAHIRO@cpan.org
+ SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
http://homepage1.nifty.com/nomenclator/perl/
--- /dev/null
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Normalize " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+#########################
+
+use 5.006001;
+use Test;
+use strict;
+use warnings;
+
+BEGIN {
+ use Unicode::Normalize qw(:all);
+
+ unless (exists &Unicode::Normalize::bootstrap or 5.008 <= $]) {
+ print "1..0 # skipped: XSUB, or Perl 5.8.0 or later".
+ " needed for this test\n";
+ print $@;
+ exit;
+ }
+}
+
+BEGIN { plan tests => 112 };
+
+#########################
+
+no warnings qw(utf8);
+# To avoid warning in Test.pm, EXPR in ok(EXPR) must be boolean.
+
+for my $u (0xD800, 0xDFFF, 0xFDD0, 0xFDEF, 0xFEFF, 0xFFFE, 0xFFFF,
+ 0x1FFFF, 0x10FFFF, 0x110000, 0x7FFFFFFF)
+{
+ my $c = chr $u;
+ ok($c eq NFD($c)); # 1
+ ok($c eq NFC($c)); # 2
+ ok($c eq NFKD($c)); # 3
+ ok($c eq NFKC($c)); # 4
+ ok($c eq FCD($c)); # 5
+ ok($c eq FCC($c)); # 6
+ ok($c eq decompose($c)); # 7
+ ok($c eq decompose($c,1)); # 8
+ ok($c eq reorder($c)); # 9
+ ok($c eq compose($c)); # 10
+}
+
+our $proc; # before the last starter
+our $unproc; # the last starter and after
+
+sub _pack_U { Unicode::Normalize::pack_U(@_) }
+
+($proc, $unproc) = splitOnLastStarter(_pack_U(0x41, 0x300, 0x327, 0xFFFF));
+ok($proc eq _pack_U(0x41, 0x300, 0x327));
+ok($unproc eq "\x{FFFF}");
+
--- /dev/null
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Normalize " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+#########################
+
+use strict;
+use warnings;
+
+use Unicode::Normalize qw(:all);
+print "1..8\n";
+
+print "ok 1\n";
+
+# if $_ is not NULL-terminated, test may fail.
+
+$_ = compose('abc');
+print /c$/ ? "ok" : "not ok", " 2\n";
+
+$_ = decompose('abc');
+print /c$/ ? "ok" : "not ok", " 3\n";
+
+$_ = reorder('abc');
+print /c$/ ? "ok" : "not ok", " 4\n";
+
+$_ = NFD('abc');
+print /c$/ ? "ok" : "not ok", " 5\n";
+
+$_ = NFC('abc');
+print /c$/ ? "ok" : "not ok", " 6\n";
+
+$_ = NFKD('abc');
+print /c$/ ? "ok" : "not ok", " 7\n";
+
+$_ = NFKC('abc');
+print /c$/ ? "ok" : "not ok", " 8\n";
+
--- /dev/null
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Normalize " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+#########################
+
+use strict;
+use Unicode::Normalize qw(:all);
+
+print "1..8\n";
+print "ok 1\n";
+
+#########################
+
+no warnings qw(utf8);
+
+our $a = "\x{3042}"; # 3-byte length (in UTF-8/UTF-EBCDIC)
+{
+ use bytes;
+ substr($a,1,length($a), ''); # remove trailing octets
+}
+
+print NFD($a) eq "\0"
+ ? "ok" : "not ok", " 2\n";
+
+print NFKD($a) eq "\0"
+ ? "ok" : "not ok", " 3\n";
+
+print NFC($a) eq "\0"
+ ? "ok" : "not ok", " 4\n";
+
+print NFKC($a) eq "\0"
+ ? "ok" : "not ok", " 5\n";
+
+print decompose($a) eq "\0"
+ ? "ok" : "not ok", " 6\n";
+
+print reorder($a) eq "\0"
+ ? "ok" : "not ok", " 7\n";
+
+print compose($a) eq "\0"
+ ? "ok" : "not ok", " 8\n";
+