/* Perl 5.6.1 ? */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8 uv_to_utf8
-#endif /* uvuni_to_utf8 */
+#endif /* uvuni_to_utf8 */
/* Perl 5.6.1 ? */
#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni utf8_to_uv
-#endif /* utf8n_to_uvuni */
+#endif /* utf8n_to_uvuni */
+
+/* UTF8_ALLOW_BOM is only used with Perl before 5.8.0; include it in AllowAnyUTF when it is defined */
+#ifdef UTF8_ALLOW_BOM
+#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
+#else
+#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
+#endif
+
+/* croak message for when utf8n_to_uvuni() sets retlen to 0 (should not happen?) */
+#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"
+
+/* croak message for when utf8_hop() hops back to before the start of the string; the UTF-8 may be broken */
+#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX (0x10ffff)
STRLEN pos; /* position */
} UNF_cc;
-int compare_cc(const void *a, const void *b)
+static int compare_cc (const void *a, const void *b) /* qsort() comparator for UNF_cc entries: by cc, then by pos */
{
int ret_cc;
- ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
+ ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
if (ret_cc)
return ret_cc;
- return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
+ /* Same combining class: fall back to the recorded position, so entries
+  * with equal cc keep their original relative order after qsort().
+  * pos is a STRLEN (presumably unsigned - confirm), so the sign is built
+  * from two comparisons instead of a subtraction squeezed into an int. */
+ return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
+ - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
}
-U8* dec_canonical (UV uv)
+static U8* dec_canonical (UV uv)
{
U8 ***plane, **row;
if (OVER_UTF_MAX(uv))
return row ? row[uv & 0xff] : NULL;
}
-U8* dec_compat (UV uv)
+static U8* dec_compat (UV uv)
{
U8 ***plane, **row;
if (OVER_UTF_MAX(uv))
return row ? row[uv & 0xff] : NULL;
}
-UV composite_uv (UV uv, UV uv2)
+static UV composite_uv (UV uv, UV uv2)
{
UNF_complist ***plane, **row, *cell, *i;
return 0;
}
-U8 getCombinClass (UV uv)
+static U8 getCombinClass (UV uv)
{
U8 **plane, *row;
if (OVER_UTF_MAX(uv))
return row ? row[uv & 0xff] : 0;
}
-void sv_cat_decompHangul (SV* sv, UV uv)
+static void sv_cat_decompHangul (SV* sv, UV uv)
{
UV sindex, lindex, vindex, tindex;
U8 *t, tmp[3 * UTF8_MAXLEN + 1];
if (tindex)
t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
*t = '\0';
- sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
+ sv_catpvn(sv, (char *)tmp, t - tmp);
+}
+
+/* Append the UTF-8 encoding of the single code point uv to sv. */
+static void sv_cat_uvuni (SV* sv, UV uv)
+{
+    /* room for one encoded character plus the terminating NUL */
+    U8 buf[UTF8_MAXLEN + 1];
+    /* uvuni_to_utf8() hands back the position just past the bytes it wrote */
+    U8 *end = uvuni_to_utf8(buf, uv);
+
+    *end = '\0';
+    sv_catpvn(sv, (char *)buf, end - buf);
}
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
s = (U8*)SvPV(src,srclen);
e = s + srclen;
- for (p = s; p < e;) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
- p += retlen;
+ for (p = s; p < e; p += retlen) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero);
+
if (Hangul_IsS(uv))
sv_cat_decompHangul(dst, uv);
else {
if (r)
sv_catpv(dst, (char *)r);
else
- sv_catpvn(dst, (char *)p - retlen, retlen);
+ sv_cat_uvuni(dst, uv);
}
}
RETVAL = dst;
SV *src, *dst;
STRLEN srclen, dstlen, retlen, stk_cc_max;
U8 *s, *e, *p, *d, curCC;
- UV uv;
+ UV uv, uvlast;
UNF_cc * stk_cc;
+ STRLEN i, cc_pos;
+ bool valid_uvlast;
CODE:
if (SvUTF8(arg)) {
src = arg;
}
s = (U8*)SvPV(src, srclen);
-
+ e = s + srclen;
dstlen = srclen + 1;
dst = newSV(dstlen);
- sv_setpvn(dst,(const char*)s,srclen);
+ (void)SvPOK_only(dst);
SvUTF8_on(dst);
+ d = (U8*)SvPVX(dst);
stk_cc_max = 10; /* enough as an initial value? */
New(0, stk_cc, stk_cc_max, UNF_cc);
- d = (U8*)SvPV(dst,dstlen);
- e = d + dstlen;
-
- for (p = d; p < e;) {
- U8 *cc_in;
- STRLEN cc_len, cc_iter, cc_pos;
-
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
- curCC = getCombinClass(uv);
+ for (p = s; p < e;) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero);
p += retlen;
- if (! (curCC && p < e))
+ curCC = getCombinClass(uv);
+ if (curCC == 0) {
+ d = uvuni_to_utf8(d, uv);
continue;
- else
- cc_in = p - retlen;
+ }
cc_pos = 0;
stk_cc[cc_pos].cc = curCC;
stk_cc[cc_pos].uv = uv;
stk_cc[cc_pos].pos = cc_pos;
+ valid_uvlast = FALSE;
while (p < e) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero);
+ p += retlen;
+
curCC = getCombinClass(uv);
- if (!curCC)
+ if (curCC == 0) {
+ uvlast = uv;
+ valid_uvlast = TRUE;
break;
- p += retlen;
+ }
+
cc_pos++;
if (stk_cc_max <= cc_pos) { /* extend if need */
stk_cc_max = cc_pos + 1;
stk_cc[cc_pos].pos = cc_pos;
}
- /* only one c.c. in cc_len from cc_in, no need of reordering */
- if (!cc_pos)
- continue;
-
- qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
+ /* reordered if there are two c.c.'s */
+ if (cc_pos) {
+ qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
+ }
- cc_len = p - cc_in;
- p = cc_in;
- for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
- p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
+ for (i = 0; i <= cc_pos; i++) {
+ d = uvuni_to_utf8(d, stk_cc[i].uv);
+ }
+ if (valid_uvlast)
+ {
+ d = uvuni_to_utf8(d, uvlast);
}
}
+ *d = '\0';
+ SvCUR_set(dst, d - (U8*)SvPVX(dst));
Safefree(stk_cc);
RETVAL = dst;
OUTPUT:
compose(arg)
SV * arg
PROTOTYPE: $
+ ALIAS:
+ composeContiguous = 1
PREINIT:
SV *src, *dst, *tmp;
U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
for (p = s; p < e;) {
if (beginning) {
- uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero);
p += retlen;
if (getCombinClass(uvS)) { /* no Starter found yet */
/* to the next Starter */
while (p < e) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero);
p += retlen;
+
curCC = getCombinClass(uv);
if (preCC && preCC == curCC) {
} else {
uvComp = composite_uv(uvS, uv);
- if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
+ if (uvComp && ! isExclusion(uvComp) &&
+ (ix ? (t == tmp_start) : (preCC <= curCC))) {
STRLEN leftcur, rightcur, dstcur;
leftcur = UNISKIP(uvComp);
rightcur = UNISKIP(uvS) + UNISKIP(uv);
}
/* preCC not changed to curCC */
uvS = uvComp;
- } else if (! curCC && p < e) { /* blocked */
+ } else if (! curCC && p < e) { /* blocked */
break;
} else {
preCC = curCC;
}
uvS = uv;
} /* for */
+ *d = '\0';
SvCUR_set(dst, d - (U8*)SvPVX(dst));
RETVAL = dst;
OUTPUT:
RETVAL
-
void
checkNFD(arg)
SV * arg
SV *src;
STRLEN srclen, retlen;
U8 *s, *e, *p, curCC, preCC;
- PPCODE:
+ CODE:
if (SvUTF8(arg)) {
src = arg;
} else {
preCC = 0;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero);
+
curCC = getCombinClass(uv);
if (preCC > curCC && curCC != 0) /* canonical ordering violated */
XSRETURN_NO;
STRLEN srclen, retlen;
U8 *s, *e, *p, curCC, preCC;
bool isMAYBE;
- PPCODE:
+ CODE:
if (SvUTF8(arg)) {
src = arg;
} else {
preCC = 0;
isMAYBE = FALSE;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero);
+
curCC = getCombinClass(uv);
if (preCC > curCC && curCC != 0) /* canonical ordering violated */
isMAYBE = TRUE;
else if (ix) {
char *canon, *compat;
- /*
- * NFKC_NO when having compatibility mapping;
- * i.e. dec_compat(uv) defined & different with dec_canonical(uv).
- */
+ /* NFKC_NO when having compatibility mapping. */
canon = (char *) dec_canonical(uv);
compat = (char *) dec_compat(uv);
- if (compat && (!canon || strNE(canon, compat)))
+ if (compat && !(canon && strEQ(canon, compat)))
XSRETURN_NO;
} /* end of get NFC/NFKC property */
+void
+checkFCD(arg)
+    SV * arg
+  PROTOTYPE: $
+  ALIAS:
+    checkFCC = 1
+  PREINIT:
+    UV uv, uvLead, uvTrail;
+    SV *src;
+    STRLEN srclen, retlen, canlen, canret;
+    U8 *s, *e, *p, curCC, preCC;
+    U8 *sCan, *pCan, *eCan;
+    bool isMAYBE;
+  CODE:
+    /*
+     * Quick check of arg, shared by checkFCD (ix == 0) and checkFCC (ix != 0).
+     *
+     * Returns NO as soon as a violation is found, undef when the answer is
+     * only "maybe" (reachable through checkFCC alone), and YES otherwise.
+     *
+     * For each character, the combining class of the first character of its
+     * canonical decomposition (curCC) is compared with the combining class
+     * of the last character of the previous character's canonical
+     * decomposition (preCC).  A non-zero curCC lower than preCC is a
+     * violation.  Characters without a canonical decomposition stand for
+     * themselves.
+     *
+     * Croaks if any decoding step yields a zero-length character or if
+     * hopping back inside a decomposition string goes before its start.
+     */
+    if (SvUTF8(arg)) {
+        src = arg;
+    } else {
+        /* work on an upgraded mortal copy; the caller's scalar is untouched */
+        src = sv_mortalcopy(arg);
+        sv_utf8_upgrade(src);
+    }
+
+    s = (U8*)SvPV(src,srclen);
+    e = s + srclen;
+
+    preCC = 0;
+    isMAYBE = FALSE;
+    for (p = s; p < e; p += retlen) {
+        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+        if (!retlen) {
+            croak(ErrRetlenIsZero);
+        }
+
+        sCan = (U8*) dec_canonical(uv);
+
+        if (sCan) {
+            canlen = (STRLEN)strlen((char *) sCan);
+            uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
+            /* same sanity check as for the input string itself */
+            if (!canret) {
+                croak(ErrRetlenIsZero);
+            }
+        }
+        else {
+            uvLead = uv;
+        }
+
+        curCC = getCombinClass(uvLead);
+
+        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
+            XSRETURN_NO;
+        }
+
+        /* extra tests applied only under the checkFCC alias */
+        if (ix) {
+            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
+                XSRETURN_NO;
+            }
+            else if (isComp2nd(uv)) {
+                isMAYBE = TRUE;
+            }
+        }
+
+        if (sCan) {
+            /* locate and decode the last character of the decomposition */
+            eCan = sCan + canlen;
+            pCan = utf8_hop(eCan, -1);
+            if (pCan < sCan) {
+                croak(ErrHopBeforeStart);
+            }
+            uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
+            if (!canret) {
+                croak(ErrRetlenIsZero);
+            }
+            preCC = getCombinClass(uvTrail);
+        }
+        else {
+            preCC = curCC;
+        }
+    }
+    if (isMAYBE) {
+        XSRETURN_UNDEF;
+    }
+    else {
+        XSRETURN_YES;
+    }
+
+
+
U8
getCombinClass(uv)
UV uv
PROTOTYPE: $
ALIAS:
isNFKD_NO = 1
- PPCODE:
+ CODE:
if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
XSRETURN_YES; /* NFD_NO or NFKD_NO */
else
ALIAS:
isNFC_NO = 0
isNFKC_NO = 1
- PPCODE:
+ CODE:
if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
XSRETURN_YES; /* NFC_NO or NFKC_NO */
else if (ix) {
UV uv2
PROTOTYPE: $$
PREINIT:
- UV comp;
+ UV composite;
CODE:
- comp = composite_uv(uv, uv2);
- RETVAL = comp ? newSVuv(comp) : &PL_sv_undef;
+ composite = composite_uv(uv, uv2);
+ RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
OUTPUT:
RETVAL
OUTPUT:
RETVAL
+
+void
+splitOnLastStarter(arg)
+    SV * arg
+  PREINIT:
+    UV uv;
+    SV *src, *svp;
+    STRLEN srclen, retlen;
+    U8 *s, *e, *p;
+  PPCODE:
+    /*
+     * Pushes two mortal UTF-8 strings: the part of arg before the last
+     * character whose combining class is 0, and the part from that
+     * character to the end.  If no such character exists, the first
+     * string is empty and the second is the whole input.
+     */
+    if (!SvUTF8(arg)) {
+        /* work on an upgraded mortal copy; the caller's scalar is untouched */
+        src = sv_mortalcopy(arg);
+        sv_utf8_upgrade(src);
+    } else {
+        src = arg;
+    }
+
+    s = (U8*)SvPV(src,srclen);
+    e = s + srclen;
+
+    /* walk backwards one character at a time, starting from the end */
+    p = e;
+    while (s < p) {
+        p = utf8_hop(p, -1);
+        if (p < s)
+            croak(ErrHopBeforeStart);
+        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+        if (getCombinClass(uv) == 0) /* Last Starter found */
+            break;
+    }
+
+    /* everything before the split point */
+    svp = sv_2mortal(newSVpvn((char*)s, p - s));
+    SvUTF8_on(svp);
+    XPUSHs(svp);
+
+    /* the split point through the end of the string */
+    svp = sv_2mortal(newSVpvn((char*)p, e - p));
+    SvUTF8_on(svp);
+    XPUSHs(svp);
+