6 /* These 5 files are prepared by mkheader */
/* Compatibility alias: uvuni_to_utf8 is mapped onto uv_to_utf8.  The
 * conditional that opens this guard is not visible in this listing. */
15 #define uvuni_to_utf8 uv_to_utf8
16 #endif /* uvuni_to_utf8 */
/* Likewise, fall back to utf8_to_uv when utf8n_to_uvuni is not defined. */
19 #ifndef utf8n_to_uvuni
20 #define utf8n_to_uvuni utf8_to_uv
21 #endif /* utf8n_to_uvuni */
23 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
/* Two alternative definitions of the decoder flag set passed to
 * utf8n_to_uvuni(); they differ only in whether UTF8_ALLOW_BOM is included.
 * The conditional that selects one of them is not visible in this listing. */
25 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
27 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
30 /* Croak message for the case where utf8n_to_uvuni() sets retlen to 0 (unclear whether this can actually happen). */
31 #define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"
33 /* Croak message for utf8_hop() stepping back before the start of the buffer; this may indicate broken UTF-8. */
34 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
36 /* Largest valid code point.  Per the original note, code points above 0x10ffff are apparently left unaffected without complaint (unconfirmed). */
37 #define VALID_UTF_MAX (0x10ffff)
/* True when uv lies above VALID_UTF_MAX. */
38 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
/*
 * Hangul syllable (S) and conjoining jamo (L, V, T) ranges.
 * For each range, Final - Base + 1 equals the matching Count:
 *   S: 0xD7A3 - 0xAC00 + 1 = 11172    L: 19    V: 21    T: 28
 * Hangul_NCount (588) equals Hangul_VCount * Hangul_TCount, and
 * Hangul_SCount (11172) equals Hangul_LCount * Hangul_NCount.
 */
41 #define Hangul_SBase 0xAC00
42 #define Hangul_SFinal 0xD7A3
43 #define Hangul_SCount 11172
45 #define Hangul_NCount 588
47 #define Hangul_LBase 0x1100
48 #define Hangul_LFinal 0x1112
49 #define Hangul_LCount 19
51 #define Hangul_VBase 0x1161
52 #define Hangul_VFinal 0x1175
53 #define Hangul_VCount 21
55 #define Hangul_TBase 0x11A7
56 #define Hangul_TFinal 0x11C2
57 #define Hangul_TCount 28
/*
 * Range predicates.  Each macro evaluates its argument more than once, so the
 * argument must be free of side effects.
 */
59 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* Offset from Hangul_SBase is a multiple of Hangul_TCount.  No range check is
 * done here; Hangul_IsLV combines it with Hangul_IsS. */
60 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
61 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
62 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
63 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* Note the strict '<': Hangul_TBase itself (T index 0) is excluded --
 * presumably index 0 stands for "no trailing consonant"; confirm. */
64 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
67 /* This record is used for canonical ordering of combining characters (c.c.). */
/* Fields of the UNF_cc record; the lines that open and close the type
 * declaration are not visible in this listing. */
69 U8 cc; /* combining class */
70 UV uv; /* codepoint */
71 STRLEN pos; /* position: index at which the record was stored; compare_cc orders by it */
/*
 * Comparison callback over UNF_cc records, passed to qsort() elsewhere in this
 * file.  The difference of the two combining classes is computed into ret_cc;
 * the lines that act on ret_cc are not visible in this listing (presumably a
 * non-zero difference is returned as-is -- confirm).  The visible return
 * orders the records by ascending pos and yields -1, 0 or 1.
 */
74 static int compare_cc (const void *a, const void *b)
77 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
/* (x > y) - (x < y): three-way comparison that does not subtract the positions */
81 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
82 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
/*
 * Look up uv in the UNF_canon table.  The table is indexed in three steps:
 * uv >> 16 selects the plane, bits 8-15 the row, and the low 8 bits the entry.
 * Returns the entry, or NULL when the row pointer is null.  Any handling of a
 * missing plane is on lines not visible in this listing.
 * Callers in this file treat a non-NULL result as a NUL-terminated string
 * (it is passed to strlen, strEQ and sv_catpv).
 */
85 static U8* dec_canonical (UV uv)
90 plane = (U8***)UNF_canon[uv >> 16];
93 row = plane[(uv >> 8) & 0xff];
94 return row ? row[uv & 0xff] : NULL;
/*
 * Look up uv in the UNF_compat table, using the same plane / row / entry
 * indexing as dec_canonical (uv >> 16, bits 8-15, low 8 bits).
 * Returns the entry, or NULL when the row pointer is null.
 * The statement controlled by the OVER_UTF_MAX test and any handling of a
 * missing plane are on lines not visible in this listing.
 */
97 static U8* dec_compat (UV uv)
100 if (OVER_UTF_MAX(uv))
102 plane = (U8***)UNF_compat[uv >> 16];
105 row = plane[(uv >> 8) & 0xff];
106 return row ? row[uv & 0xff] : NULL;
/*
 * Compute the composition of uv followed by uv2.
 * getComposite() in this file treats a zero result as "no composite".
 * Several statements (the action taken by the first test, the return for the
 * LV + T case, the null checks on plane/row/cell and the final returns) are
 * on lines not visible in this listing.
 */
109 static UV composite_uv (UV uv, UV uv2)
111 UNF_complist ***plane, **row, *cell, *i;
/* rejects a zero second code point and anything above VALID_UTF_MAX */
113 if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
/* Hangul L + V: computed arithmetically from the jamo indices */
116 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
117 uv -= Hangul_LBase; /* lindex */
118 uv2 -= Hangul_VBase; /* vindex */
119 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
/* Hangul LV + T: presumably returns uv plus tindex -- the return line is not visible */
121 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
122 uv2 -= Hangul_TBase; /* tindex */
/* general case: UNF_compos is indexed by plane, row and entry of the first code point */
125 plane = UNF_compos[uv >> 16];
128 row = plane[(uv >> 8) & 0xff];
131 cell = row[uv & 0xff];
/* the candidate list ends at an entry whose nextchar is 0 */
134 for (i = cell; i->nextchar; i++) {
135 if (uv2 == i->nextchar)
/*
 * Return the combining class of uv from the UNF_combin table (plane selected
 * by uv >> 16, row by bits 8-15, entry by the low 8 bits).  Returns 0 when the
 * row pointer is null.  The statement controlled by the OVER_UTF_MAX test and
 * any handling of a missing plane are on lines not visible in this listing.
 */
141 static U8 getCombinClass (UV uv)
144 if (OVER_UTF_MAX(uv))
146 plane = (U8**)UNF_combin[uv >> 16];
149 row = plane[(uv >> 8) & 0xff];
150 return row ? row[uv & 0xff] : 0;
/*
 * Append the jamo decomposition of the Hangul syllable uv to sv.
 * The syllable index is split into L, V and T indices; each jamo is encoded
 * into a local buffer, which is then appended with sv_catpvn().
 * The statement controlled by the Hangul_IsS test, the initialisation of t and
 * any guard around the T jamo are on lines not visible in this listing.
 */
153 static void sv_cat_decompHangul (SV* sv, UV uv)
155 UV sindex, lindex, vindex, tindex;
/* room for three encoded characters plus one spare byte */
156 U8 *t, tmp[3 * UTF8_MAXLEN + 1];
158 if (! Hangul_IsS(uv))
161 sindex = uv - Hangul_SBase;
162 lindex = sindex / Hangul_NCount;
163 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
164 tindex = sindex % Hangul_TCount;
167 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
168 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
/* NOTE(review): presumably emitted only when tindex is non-zero -- the guard is not visible */
170 t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
/* t - tmp is the number of bytes written */
172 sv_catpvn(sv, (char *)tmp, t - tmp);
/*
 * Append the encoded form of the single code point uv to sv: the code point is
 * written into a local buffer with uvuni_to_utf8() and the bytes written
 * (t - tmp) are appended with sv_catpvn().  The initialisation of t is on a
 * line not visible in this listing.
 */
175 static void sv_cat_uvuni (SV* sv, UV uv)
177 U8 *t, tmp[UTF8_MAXLEN + 1];
180 t = uvuni_to_utf8(t, uv);
182 sv_catpvn(sv, (char *)tmp, t - tmp);
/*
 * Return the string buffer of sv; lp is presumably where the length is
 * reported (that store is not visible in this listing).
 * The buffer is first taken directly from sv.  On a path whose condition is
 * not visible here (presumably when sv is not already UTF-8 -- confirm), a
 * mortal copy is made, forced to a string and upgraded to UTF-8, and the
 * buffer of that copy is used instead, so sv itself is not upgraded.
 */
185 static char * sv_2pvunicode(SV *sv, STRLEN *lp)
189 s = (char*)SvPV(sv,len);
191 SV* tmpsv = sv_mortalcopy(sv);
193 (void)sv_pvn_force(tmpsv,&len);
194 sv_utf8_upgrade(tmpsv);
195 s = (char*)SvPV(tmpsv,len);
201 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
204 decompose(src, compat = &PL_sv_no)
210 STRLEN srclen, retlen;
/*
 * Decompose src one character at a time, appending to dst.  A true compat
 * selects the compatibility mapping (dec_compat); otherwise the canonical
 * mapping (dec_canonical) is used.  compat defaults to false.
 * The conditions guarding the croak and the three append calls are on lines
 * not visible in this listing.
 */
215 iscompat = SvTRUE(compat);
216 s = (U8*)sv_2pvunicode(src,&srclen);
220 (void)SvPOK_only(dst);
/* retlen is set by the decoder and advances p to the next character */
223 for (p = s; p < e; p += retlen) {
224 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
226 croak(ErrRetlenIsZero);
/* presumably taken for Hangul syllables only -- guard not visible */
229 sv_cat_decompHangul(dst, uv);
231 r = iscompat ? dec_compat(uv) : dec_canonical(uv);
/* presumably: append the mapping when one exists, else the character itself */
233 sv_catpv(dst, (char *)r);
235 sv_cat_uvuni(dst, uv);
250 STRLEN srclen, dstlen, retlen, stk_cc_max;
251 U8 *s, *e, *p, *d, curCC;
/*
 * Canonical reordering pass (the XS signature is not visible in this listing).
 * Characters are decoded from src and written to dst through the cursor d.
 * Runs of combining characters are collected as UNF_cc records in stk_cc,
 * sorted with compare_cc, and written out in sorted order.
 * Many control-flow lines (guards, the inner loop header, advancing p) are not
 * visible here.
 */
257 s = (U8*)sv_2pvunicode(src,&srclen);
262 (void)SvPOK_only(dst);
266 stk_cc_max = 10; /* enough as an initial value? */
267 New(0, stk_cc, stk_cc_max, UNF_cc);
269 for (p = s; p < e;) {
270 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
272 croak(ErrRetlenIsZero);
275 curCC = getCombinClass(uv);
277 d = uvuni_to_utf8(d, uv);
/* first record of a run; pos stores the slot index */
282 stk_cc[cc_pos].cc = curCC;
283 stk_cc[cc_pos].uv = uv;
284 stk_cc[cc_pos].pos = cc_pos;
286 valid_uvlast = FALSE;
288 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
290 croak(ErrRetlenIsZero);
293 curCC = getCombinClass(uv);
/* NOTE(review): capacity grows to exactly cc_pos + 1, so once the initial
 * capacity is used up each further record triggers another Renew */
301 if (stk_cc_max <= cc_pos) { /* extend if needed */
302 stk_cc_max = cc_pos + 1;
303 Renew(stk_cc, stk_cc_max, UNF_cc);
305 stk_cc[cc_pos].cc = curCC;
306 stk_cc[cc_pos].uv = uv;
307 stk_cc[cc_pos].pos = cc_pos;
310 /* reordered if there are two c.c.'s */
/* cc_pos is the index of the last record, hence cc_pos + 1 elements */
312 qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
315 for (i = 0; i <= cc_pos; i++) {
316 d = uvuni_to_utf8(d, stk_cc[i].uv);
/* presumably written only when valid_uvlast is set -- guard not visible */
320 d = uvuni_to_utf8(d, uvlast);
/* final length of dst is the number of bytes written through d */
324 SvCUR_set(dst, d - (U8*)SvPVX(dst));
337 composeContiguous = 1
340 U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
342 STRLEN srclen, dstlen, tmplen, retlen;
343 bool beginning = TRUE;
/*
 * Composition pass (the XS signature is not visible in this listing;
 * composeContiguous is declared above with value 1, presumably as an alias
 * selected through ix -- confirm).
 * uvS holds the current starter.  Each following character is either combined
 * into uvS through composite_uv() or kept: the scratch SV tmp collects the
 * characters that were not composed, and the starter is written to dst through
 * the cursor d.  Many control-flow lines are not visible here.
 */
345 s = (U8*)sv_2pvunicode(src,&srclen);
350 (void)SvPOK_only(dst);
354 /* for uncomposed combining char */
355 tmp = sv_2mortal(newSV(dstlen));
356 (void)SvPOK_only(tmp);
359 for (p = s; p < e;) {
361 uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
363 croak(ErrRetlenIsZero);
/* a character with a non-zero combining class is copied straight to dst */
366 if (getCombinClass(uvS)) { /* no Starter found yet */
367 d = uvuni_to_utf8(d, uvS);
/* reset the scratch cursor; t == tmp_start means nothing is pending */
374 t = tmp_start = (U8*)SvPVX(tmp);
377 /* to the next Starter */
379 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
381 croak(ErrRetlenIsZero);
384 curCC = getCombinClass(uv);
/* same non-zero combining class as the previous character: kept uncomposed */
386 if (preCC && preCC == curCC) {
388 t = uvuni_to_utf8(t, uv);
390 uvComp = composite_uv(uvS, uv);
/* accepted only if a non-excluded composite exists and, when ix is set,
 * nothing is pending in tmp; otherwise when preCC <= curCC */
392 if (uvComp && ! isExclusion(uvComp) &&
393 (ix ? (t == tmp_start) : (preCC <= curCC))) {
394 STRLEN leftcur, rightcur, dstcur;
395 leftcur = UNISKIP(uvComp);
396 rightcur = UNISKIP(uvS) + UNISKIP(uv);
/* the composite encodes longer than the pair it replaces: grow dst and
 * recompute d from its saved offset against the pointer SvGROW returns */
398 if (leftcur > rightcur) {
399 dstcur = d - (U8*)SvPVX(dst);
400 dstlen += leftcur - rightcur;
401 d = (U8*)SvGROW(dst,dstlen) + dstcur;
403 /* preCC not changed to curCC */
405 } else if (! curCC && p < e) { /* blocked */
409 t = uvuni_to_utf8(t, uv);
413 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
414 tmplen = t - tmp_start;
415 if (tmplen) { /* uncomposed combining char */
/* final length of dst is the number of bytes written through d */
423 SvCUR_set(dst, d - (U8*)SvPVX(dst));
436 STRLEN srclen, retlen;
437 U8 *s, *e, *p, curCC, preCC;
/*
 * Quick check against a decomposed form (the XS signature is not visible in
 * this listing).  Each character of src is tested for two things: whether its
 * combining class breaks canonical ordering relative to the previous one, and
 * whether it is a Hangul syllable or has a decomposition.  ix selects the
 * compatibility mapping when non-zero, the canonical mapping otherwise.
 * The statements taken when either test fires, and the update of preCC, are
 * on lines not visible here.
 */
440 s = (U8*)sv_2pvunicode(src,&srclen);
444 for (p = s; p < e; p += retlen) {
445 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
447 croak(ErrRetlenIsZero);
449 curCC = getCombinClass(uv);
450 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
452 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
467 STRLEN srclen, retlen;
468 U8 *s, *e, *p, curCC, preCC;
/*
 * Quick check against a composed form (the XS signature is not visible in
 * this listing).  For each character of src the canonical-ordering test is
 * applied, then the character is classified by the chain of property tests
 * below.  The statements taken in each branch are on lines not visible here.
 */
472 s = (U8*)sv_2pvunicode(src,&srclen);
477 for (p = s; p < e; p += retlen) {
478 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
480 croak(ErrRetlenIsZero);
482 curCC = getCombinClass(uv);
484 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
487 /* get NFC/NFKC property */
488 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
490 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
492 else if (isComp2nd(uv))
495 char *canon, *compat;
496 /* NFKC_NO when having compatibility mapping. */
497 canon = (char *) dec_canonical(uv);
498 compat = (char *) dec_compat(uv);
/* true when a compatibility mapping exists and is not identical to the
 * canonical one (including when there is no canonical mapping) */
499 if (compat && !(canon && strEQ(canon, compat)))
501 } /* end of get NFC/NFKC property */
519 STRLEN srclen, retlen, canlen, canret;
520 U8 *s, *e, *p, curCC, preCC;
521 UV uv, uvLead, uvTrail;
522 U8 *sCan, *pCan, *eCan;
/*
 * Ordering check based on canonical decompositions (the XS signature is not
 * visible in this listing).  For each character of src, the combining class of
 * the leading code point of its canonical decomposition is compared with
 * preCC, and preCC is then set from the trailing code point of that
 * decomposition.  The handling of characters without a decomposition and the
 * statements taken in each branch are on lines not visible here.
 */
525 s = (U8*)sv_2pvunicode(src,&srclen);
530 for (p = s; p < e; p += retlen) {
531 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
533 croak(ErrRetlenIsZero);
535 sCan = (U8*) dec_canonical(uv);
/* the decomposition is treated as a NUL-terminated string */
538 canlen = (STRLEN)strlen((char *) sCan);
539 uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
545 curCC = getCombinClass(uvLead);
547 if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
551 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
553 else if (isComp2nd(uv))
/* step back one character from the end of the decomposition to find its last code point */
558 eCan = sCan + canlen;
559 pCan = utf8_hop(eCan, -1);
561 croak(ErrHopBeforeStart);
562 uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
563 preCC = getCombinClass(uvTrail);
/* Returns true when uv is a Hangul syllable or has a decomposition; ix selects
 * the compatibility mapping when non-zero, the canonical mapping otherwise.
 * The XS signature and the fall-through return are not visible in this listing. */
613 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
614 XSRETURN_YES; /* NFD_NO or NFKD_NO */
/* Returns true for exclusions, singletons and non-starter decompositions.
 * The XS signature, the condition guarding the second test and the
 * fall-through return are not visible in this listing. */
628 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
629 XSRETURN_YES; /* NFC_NO or NFKC_NO */
631 char *canon, *compat;
632 canon = (char *) dec_canonical(uv);
633 compat = (char *) dec_compat(uv);
/* true when a compatibility mapping exists and either there is no canonical
 * mapping or the two strings differ */
634 if (compat && (!canon || strNE(canon, compat)))
635 XSRETURN_YES; /* NFC_NO or NFKC_NO */
645 getComposite(uv, uv2)
/* Wraps composite_uv(): a non-zero composite is returned as a new unsigned
 * integer SV; a zero result is returned as undef. */
652 composite = composite_uv(uv, uv2);
653 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
/* Returns the decomposition of uv as a string (the XS signature is not visible
 * in this listing).  Hangul syllables are decomposed arithmetically into dst;
 * other characters are looked up, with ix selecting the compatibility mapping
 * when non-zero and the canonical mapping otherwise.  The handling of a NULL
 * lookup result is on lines not visible here. */
668 if (Hangul_IsS(uv)) {
671 (void)SvPOK_only(dst);
672 sv_cat_decompHangul(dst, uv);
675 rstr = ix ? dec_compat(uv) : dec_canonical(uv);
/* the mapping is treated as a NUL-terminated string */
678 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
686 splitOnLastStarter(src)
690 STRLEN srclen, retlen;
/*
 * Scan src backwards from its end for the last character whose combining
 * class is 0, then build two mortal strings: the bytes before that position
 * (s .. p) and the bytes from it to the end (p .. e).
 * The step that moves p backwards, the guard on the croak, the loop exit and
 * the code that returns the two strings are on lines not visible in this
 * listing.
 */
694 s = (U8*)sv_2pvunicode(src,&srclen);
697 for (p = e; s < p; ) {
700 croak(ErrHopBeforeStart);
701 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
702 if (getCombinClass(uv) == 0) /* Last Starter found */
706 svp = sv_2mortal(newSVpvn((char*)s, p - s));
710 svp = sv_2mortal(newSVpvn((char*)p, e - p));