6 /* These 5 files are prepared by mkheader */
15 #define uvuni_to_utf8 uv_to_utf8 /* alias: the encoder name used in this file maps onto uv_to_utf8 (the opening guard of this conditional is not among the visible lines) */
16 #endif /* uvuni_to_utf8 */
19 #ifndef utf8n_to_uvuni
20 #define utf8n_to_uvuni utf8_to_uv /* fallback alias, defined only when utf8n_to_uvuni is not already a macro */
21 #endif /* utf8n_to_uvuni */
23 /* AllowAnyUTF is the flags argument given to the utf8n_to_uvuni() calls in this file. UTF8_ALLOW_BOM is used before Perl 5.8.0; the conditional that selects between the two definitions below is not among the visible lines. */
25 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF) /* variant that includes UTF8_ALLOW_BOM */
27 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) /* variant without UTF8_ALLOW_BOM */
30 /* croak() message for the case where utf8n_to_uvuni() sets retlen to 0 (?) */
31 #define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"
33 /* croak() message for when utf8_hop() hops back before start; the input may be broken UTF-8 */
34 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
36 /* At present, code points above 0x10ffff are left unaffected without complaint, right? */
37 #define VALID_UTF_MAX (0x10ffff) /* upper bound tested by OVER_UTF_MAX() */
38 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) /* true when uv is greater than VALID_UTF_MAX */
41 #define Hangul_SBase 0xAC00 /* first code point accepted by Hangul_IsS() */
42 #define Hangul_SFinal 0xD7A3 /* last code point accepted by Hangul_IsS() */
43 #define Hangul_SCount 11172 /* = SFinal - SBase + 1 */
45 #define Hangul_NCount 588 /* = VCount * TCount; sindex / NCount gives lindex */
47 #define Hangul_LBase 0x1100 /* first L (leading) jamo */
48 #define Hangul_LFinal 0x1112 /* last L jamo */
49 #define Hangul_LCount 19 /* = LFinal - LBase + 1 */
51 #define Hangul_VBase 0x1161 /* first V (vowel) jamo */
52 #define Hangul_VFinal 0x1175 /* last V jamo */
53 #define Hangul_VCount 21 /* = VFinal - VBase + 1 */
55 #define Hangul_TBase 0x11A7 /* base added to tindex; not itself accepted by Hangul_IsT() */
56 #define Hangul_TFinal 0x11C2 /* last T (trailing) jamo */
57 #define Hangul_TCount 28 /* = TFinal - TBase + 1, so it counts TBase as index 0 */
59 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) /* u lies in SBase..SFinal */
60 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) /* offset from SBase is a multiple of TCount, i.e. tindex == 0; does no range check itself */
61 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) /* syllable whose tindex is 0 */
62 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) /* u lies in LBase..LFinal */
63 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) /* u lies in VBase..VFinal */
64 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) /* strict lower bound: TBase itself is excluded; presumably index 0 stands for "no T part" -- TODO confirm */
67 /* one entry per stacked character; this is used for canonical ordering of combining characters (c.c.). */
69 U8 cc; /* combining class */
70 UV uv; /* codepoint */
71 STRLEN pos; /* position: the entry's index at the time it is stored; also compared by compare_cc() */
/* qsort() comparison callback for UNF_cc entries: it takes the difference of
 * the two combining classes and ends by ordering on pos. The statements
 * between the visible lines are not shown here; presumably the pos comparison
 * is only the tie-breaker for equal classes -- TODO confirm. */
74 static int compare_cc (const void *a, const void *b)
77 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; /* difference of the combining classes */
81 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) /* evaluates to -1, 0 or 1 according to pos */
82 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
/* Look up uv in the UNF_canon table, which is indexed in three steps.
 * Returns the stored pointer, or NULL when the row pointer is NULL. Callers
 * in this file treat a non-NULL result as a NUL-terminated UTF-8 string.
 * NOTE(review): unlike dec_compat(), no OVER_UTF_MAX() test is among the
 * visible lines -- confirm uv is range-checked before UNF_canon is indexed. */
85 static U8* dec_canonical (UV uv)
90 plane = (U8***)UNF_canon[uv >> 16]; /* first index: bits 16 and above */
93 row = plane[(uv >> 8) & 0xff]; /* second index: bits 8..15 */
94 return row ? row[uv & 0xff] : NULL; /* third index: low 8 bits */
/* Look up uv in the UNF_compat table, which is indexed in three steps.
 * Returns the stored pointer, or NULL when the row pointer is NULL. Callers
 * in this file treat a non-NULL result as a NUL-terminated string. */
97 static U8* dec_compat (UV uv)
100 if (OVER_UTF_MAX(uv)) /* uv above VALID_UTF_MAX; the guarded statement is not among the visible lines */
102 plane = (U8***)UNF_compat[uv >> 16]; /* first index: bits 16 and above */
105 row = plane[(uv >> 8) & 0xff]; /* second index: bits 8..15 */
106 return row ? row[uv & 0xff] : NULL; /* third index: low 8 bits */
/* Try to compose the pair (uv, uv2) into one code point. Hangul L+V and LV+T
 * pairs are handled arithmetically; other pairs go through the UNF_compos
 * table. Callers in this file test the result for nonzero. Several return
 * statements are not among the visible lines. */
109 static UV composite_uv (UV uv, UV uv2)
111 UNF_complist ***plane, **row, *cell, *i;
113 if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) /* second char is 0, or either char is above VALID_UTF_MAX */
116 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { /* Hangul L + V */
117 uv -= Hangul_LBase; /* lindex */
118 uv2 -= Hangul_VBase; /* vindex */
119 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount); /* the syllable for (lindex, vindex) with tindex 0 */
121 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { /* Hangul LV + T */
122 uv2 -= Hangul_TBase; /* tindex */
125 plane = UNF_compos[uv >> 16]; /* table lookup keyed on the first char: bits 16 and above */
128 row = plane[(uv >> 8) & 0xff]; /* bits 8..15 */
131 cell = row[uv & 0xff]; /* low 8 bits; cell is the start of an array of UNF_complist entries */
134 for (i = cell; i->nextchar; i++) { /* scan until an entry whose nextchar is 0 */
135 if (uv2 == i->nextchar) /* entry for the second char found */
/* Return the combining class stored for uv in the UNF_combin table, or 0 when
 * the table has no row for it. */
141 static U8 getCombinClass (UV uv)
144 if (OVER_UTF_MAX(uv)) /* uv above VALID_UTF_MAX; the guarded statement is not among the visible lines */
146 plane = (U8**)UNF_combin[uv >> 16]; /* first index: bits 16 and above */
149 row = plane[(uv >> 8) & 0xff]; /* second index: bits 8..15 */
150 return row ? row[uv & 0xff] : 0; /* third index: low 8 bits */
/* Append to sv the UTF-8 encoding of the jamo sequence derived arithmetically
 * from the Hangul syllable uv. */
153 static void sv_cat_decompHangul (SV* sv, UV uv)
155 UV sindex, lindex, vindex, tindex;
156 U8 *t, tmp[3 * UTF8_MAXLEN + 1]; /* room for up to three encoded characters */
158 if (! Hangul_IsS(uv)) /* uv is not a Hangul syllable; the guarded statement is not among the visible lines */
161 sindex = uv - Hangul_SBase; /* offset of uv from SBase */
162 lindex = sindex / Hangul_NCount;
163 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
164 tindex = sindex % Hangul_TCount;
167 t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); /* L jamo */
168 t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); /* V jamo */
170 t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); /* T jamo; NOTE(review): the preceding line is not visible -- presumably this is guarded by tindex != 0, confirm */
172 sv_catpvn(sv, (char *)tmp, t - tmp); /* append the bytes written into tmp */
/* Append the UTF-8 encoding of the single code point uv to sv. */
175 static void sv_cat_uvuni (SV* sv, UV uv)
177 U8 *t, tmp[UTF8_MAXLEN + 1]; /* scratch buffer for one encoded character */
180 t = uvuni_to_utf8(t, uv); /* t ends up just past the bytes written */
182 sv_catpvn(sv, (char *)tmp, t - tmp); /* append those bytes */
185 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
188 decompose(arg, compat = &PL_sv_no)
195 STRLEN srclen, retlen; /* srclen: byte length of src; retlen: byte length of the character just decoded */
202 src = sv_mortalcopy(arg); /* the UTF-8 upgrade below is applied to this copy, not to arg */
203 sv_utf8_upgrade(src);
205 iscompat = SvTRUE(compat); /* true selects dec_compat() over dec_canonical() below */
208 (void)SvPOK_only(dst);
211 s = (U8*)SvPV(src,srclen);
213 for (p = s; p < e; p += retlen) { /* one iteration per decoded character */
214 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
216 croak(ErrRetlenIsZero);
219 sv_cat_decompHangul(dst, uv); /* appends the jamo of a Hangul syllable */
221 r = iscompat ? dec_compat(uv) : dec_canonical(uv); /* table decomposition; may be NULL */
223 sv_catpv(dst, (char *)r); /* r is appended as a NUL-terminated string */
225 sv_cat_uvuni(dst, uv); /* appends uv itself, re-encoded */
240 STRLEN srclen, dstlen, retlen, stk_cc_max; /* stk_cc_max: number of entries allocated for stk_cc */
241 U8 *s, *e, *p, *d, curCC; /* d: write position inside dst's buffer */
250 src = sv_mortalcopy(arg); /* the UTF-8 upgrade below is applied to this copy, not to arg */
251 sv_utf8_upgrade(src);
254 s = (U8*)SvPV(src, srclen);
258 (void)SvPOK_only(dst);
262 stk_cc_max = 10; /* enough as an initial value? */
263 New(0, stk_cc, stk_cc_max, UNF_cc); /* NOTE(review): no matching Safefree is among the visible lines; if one exists further down, the croak() calls below would bypass it -- confirm */
265 for (p = s; p < e;) {
266 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
268 croak(ErrRetlenIsZero);
271 curCC = getCombinClass(uv);
273 d = uvuni_to_utf8(d, uv); /* write uv straight to the output */
278 stk_cc[cc_pos].cc = curCC; /* store the character in the stk_cc entry at cc_pos */
279 stk_cc[cc_pos].uv = uv;
280 stk_cc[cc_pos].pos = cc_pos; /* storage order, compared by compare_cc() */
282 valid_uvlast = FALSE;
284 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
286 croak(ErrRetlenIsZero);
289 curCC = getCombinClass(uv);
297 if (stk_cc_max <= cc_pos) { /* extend if needed */
298 stk_cc_max = cc_pos + 1; /* NOTE(review): grows by a single entry, so once the initial capacity is exceeded presumably every further push reallocates -- confirm */
299 Renew(stk_cc, stk_cc_max, UNF_cc);
301 stk_cc[cc_pos].cc = curCC;
302 stk_cc[cc_pos].uv = uv;
303 stk_cc[cc_pos].pos = cc_pos;
306 /* reordered if there are two c.c.'s */
308 qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc); /* sorts entries 0..cc_pos */
311 for (i = 0; i <= cc_pos; i++) { /* write the sorted entries out */
312 d = uvuni_to_utf8(d, stk_cc[i].uv);
316 d = uvuni_to_utf8(d, uvlast);
320 SvCUR_set(dst, d - (U8*)SvPVX(dst)); /* string length = number of bytes written through d */
333 composeContiguous = 1
336 U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; /* d writes into dst's buffer, t writes into tmp's buffer */
338 STRLEN srclen, dstlen, tmplen, retlen;
339 bool beginning = TRUE;
344 src = sv_mortalcopy(arg); /* the UTF-8 upgrade below is applied to this copy, not to arg */
345 sv_utf8_upgrade(src);
348 s = (U8*)SvPV(src, srclen);
352 (void)SvPOK_only(dst);
356 /* for uncomposed combining char */
357 tmp = sv_2mortal(newSV(dstlen)); /* mortal scratch SV requested with the same size as dstlen */
358 (void)SvPOK_only(tmp);
361 for (p = s; p < e;) {
363 uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
365 croak(ErrRetlenIsZero);
368 if (getCombinClass(uvS)) { /* no Starter found yet */
369 d = uvuni_to_utf8(d, uvS); /* nonzero combining class: written to the output as is */
376 t = tmp_start = (U8*)SvPVX(tmp); /* rewind the write position of tmp */
379 /* to the next Starter */
381 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
383 croak(ErrRetlenIsZero);
386 curCC = getCombinClass(uv);
388 if (preCC && preCC == curCC) { /* same nonzero class as preCC */
390 t = uvuni_to_utf8(t, uv); /* kept uncomposed in tmp */
392 uvComp = composite_uv(uvS, uv); /* tested for nonzero below */
394 if (uvComp && ! isExclusion(uvComp) &&
395 (ix ? (t == tmp_start) : (preCC <= curCC))) { /* ix nonzero (composeContiguous = 1): only while tmp is empty; otherwise: only when preCC <= curCC */
396 STRLEN leftcur, rightcur, dstcur;
397 leftcur = UNISKIP(uvComp); /* UNISKIP of the composite */
398 rightcur = UNISKIP(uvS) + UNISKIP(uv); /* UNISKIP of the two source characters */
400 if (leftcur > rightcur) { /* the composite is longer than the pair it replaces */
401 dstcur = d - (U8*)SvPVX(dst); /* remember d as an offset */
402 dstlen += leftcur - rightcur;
403 d = (U8*)SvGROW(dst,dstlen) + dstcur; /* re-derive d from the pointer SvGROW returns */
405 /* preCC not changed to curCC */
407 } else if (! curCC && p < e) { /* blocked */
411 t = uvuni_to_utf8(t, uv);
415 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
416 tmplen = t - tmp_start; /* bytes accumulated in tmp */
417 if (tmplen) { /* uncomposed combining char */
425 SvCUR_set(dst, d - (U8*)SvPVX(dst)); /* string length = number of bytes written through d */
440 STRLEN srclen, retlen; /* srclen: byte length of src; retlen: byte length of the character just decoded */
441 U8 *s, *e, *p, curCC, preCC;
446 src = sv_mortalcopy(arg); /* the UTF-8 upgrade below is applied to this copy, not to arg */
447 sv_utf8_upgrade(src);
450 s = (U8*)SvPV(src,srclen);
454 for (p = s; p < e; p += retlen) { /* one iteration per decoded character */
455 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
457 croak(ErrRetlenIsZero);
459 curCC = getCombinClass(uv); /* combining class of the current character */
460 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
462 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) /* uv is a Hangul syllable or has a table decomposition (dec_compat when ix is nonzero) */
479 STRLEN srclen, retlen; /* srclen: byte length of src; retlen: byte length of the character just decoded */
480 U8 *s, *e, *p, curCC, preCC;
486 src = sv_mortalcopy(arg); /* the UTF-8 upgrade below is applied to this copy, not to arg */
487 sv_utf8_upgrade(src);
490 s = (U8*)SvPV(src,srclen);
495 for (p = s; p < e; p += retlen) { /* one iteration per decoded character */
496 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
498 croak(ErrRetlenIsZero);
500 curCC = getCombinClass(uv); /* combining class of the current character */
502 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
505 /* get NFC/NFKC property */
506 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
508 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
510 else if (isComp2nd(uv))
513 char *canon, *compat;
514 /* NFKC_NO when having compatibility mapping. */
515 canon = (char *) dec_canonical(uv);
516 compat = (char *) dec_compat(uv);
517 if (compat && !(canon && strEQ(canon, compat))) /* a compat decomposition exists and is not the same string as the canonical one */
519 } /* end of get NFC/NFKC property */
537 UV uv, uvLead, uvTrail;
539 STRLEN srclen, retlen, canlen, canret; /* canlen: byte length of the string at sCan */
540 U8 *s, *e, *p, curCC, preCC;
541 U8 *sCan, *pCan, *eCan; /* sCan: result of dec_canonical(); eCan: its end; pCan: cursor within it */
547 src = sv_mortalcopy(arg); /* the UTF-8 upgrade below is applied to this copy, not to arg */
548 sv_utf8_upgrade(src);
551 s = (U8*)SvPV(src,srclen);
556 for (p = s; p < e; p += retlen) { /* one iteration per decoded character */
557 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
559 croak(ErrRetlenIsZero);
561 sCan = (U8*) dec_canonical(uv);
564 canlen = (STRLEN)strlen((char *) sCan); /* the table string is treated as NUL-terminated */
565 uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF); /* first character of the string at sCan */
571 curCC = getCombinClass(uvLead);
573 if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
577 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
579 else if (isComp2nd(uv))
584 eCan = sCan + canlen;
585 pCan = utf8_hop(eCan, -1); /* step back one character from the end */
587 croak(ErrHopBeforeStart);
588 uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF); /* last character of the string at sCan */
589 preCC = getCombinClass(uvTrail); /* compared against curCC on a later iteration */
639 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) /* Hangul syllable, or a table decomposition exists (dec_compat when ix is nonzero) */
640 XSRETURN_YES; /* NFD_NO or NFKD_NO */
654 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
655 XSRETURN_YES; /* NFC_NO or NFKC_NO */
657 char *canon, *compat;
658 canon = (char *) dec_canonical(uv);
659 compat = (char *) dec_compat(uv);
660 if (compat && (!canon || strNE(canon, compat))) /* a compat decomposition exists and there is no canonical one or it is a different string */
661 XSRETURN_YES; /* NFC_NO or NFKC_NO */
671 getComposite(uv, uv2)
678 composite = composite_uv(uv, uv2); /* nonzero when the pair composes */
679 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; /* undef when composite_uv() gives 0 */
694 if (Hangul_IsS(uv)) { /* Hangul syllable: the result is built with sv_cat_decompHangul() */
697 (void)SvPOK_only(dst);
698 sv_cat_decompHangul(dst, uv);
701 rstr = ix ? dec_compat(uv) : dec_canonical(uv); /* ix nonzero selects the compat table */
704 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); /* copy of the NUL-terminated table string */
712 splitOnLastStarter(arg)
717 STRLEN srclen, retlen; /* srclen: byte length of src; retlen: byte length of the character just decoded */
723 src = sv_mortalcopy(arg); /* the UTF-8 upgrade below is applied to this copy, not to arg */
724 sv_utf8_upgrade(src);
727 s = (U8*)SvPV(src,srclen);
730 for (p = e; s < p; ) { /* p starts at e and the loop runs while p is above s */
733 croak(ErrHopBeforeStart);
734 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); /* decode the character at p */
735 if (getCombinClass(uv) == 0) /* Last Starter found */
739 svp = sv_2mortal(newSVpvn((char*)s, p - s)); /* mortal SV holding the bytes from s up to p */
743 svp = sv_2mortal(newSVpvn((char*)p, e - p)); /* mortal SV holding the bytes from p up to e */