6 /* These 5 files are prepared by mkheader */
15 #define uvuni_to_utf8 uv_to_utf8
16 #endif /* uvuni_to_utf8 */
/* Fallback alias: when utf8n_to_uvuni is not already defined, map it onto
 * utf8_to_uv. Code in this file decodes one character at a time through
 * the utf8n_to_uvuni name, as utf8n_to_uvuni(ptr, len, &retlen, flags). */
19 #ifndef utf8n_to_uvuni
20 #define utf8n_to_uvuni utf8_to_uv
21 #endif /* utf8n_to_uvuni */
23 /* At present, code points above 0x10ffff are left unaffected and no complaint is raised (the author marks this as an open question). */
24 #define VALID_UTF_MAX (0x10ffff) /* upper bound tested by OVER_UTF_MAX */
25 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) /* true when uv lies above VALID_UTF_MAX */
/* Hangul constants. The S range [Hangul_SBase, Hangul_SFinal] is the block
 * of precomposed Hangul syllables; L, V and T name the leading-consonant,
 * vowel and trailing-consonant parts in Unicode's Hangul terminology. */
28 #define Hangul_SBase 0xAC00
29 #define Hangul_SFinal 0xD7A3
30 #define Hangul_SCount 11172 /* == Hangul_SFinal - Hangul_SBase + 1 */
/* Number of syllables sharing one L part: Hangul_VCount * Hangul_TCount. */
32 #define Hangul_NCount 588
34 #define Hangul_LBase 0x1100
35 #define Hangul_LFinal 0x1112
36 #define Hangul_LCount 19 /* == Hangul_LFinal - Hangul_LBase + 1 */
38 #define Hangul_VBase 0x1161
39 #define Hangul_VFinal 0x1175
40 #define Hangul_VCount 21 /* == Hangul_VFinal - Hangul_VBase + 1 */
/* Hangul_TBase is one below the first T code point accepted by Hangul_IsT,
 * so a T index of 0 stands for "no T part"; Hangul_TCount counts that slot
 * too (Hangul_TFinal - Hangul_TBase + 1). */
42 #define Hangul_TBase 0x11A7
43 #define Hangul_TFinal 0x11C2
44 #define Hangul_TCount 28
/* Range predicates. Each argument is evaluated more than once, so pass an
 * expression without side effects. */
46 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* T index is 0; only meaningful together with Hangul_IsS (see Hangul_IsLV). */
47 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
/* A syllable made of L and V parts only (no T part). */
48 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
49 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
50 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* Strict lower bound: Hangul_TBase itself is not a T code point. */
51 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
54 /* One entry per combining character (c.c.) collected for canonical ordering; entries are sorted with compare_cc. */
56 U8 cc; /* combining class; compared first by compare_cc */
57 UV uv; /* codepoint */
58 STRLEN pos; /* index at which the entry was stored; compared by compare_cc when sorting */
/*
 * qsort() comparator for arrays of UNF_cc (a and b point to UNF_cc
 * entries). It computes the difference of the two combining classes and
 * compares the stored positions.
 * NOTE(review): the statement that returns ret_cc when it is nonzero is
 * not visible in this excerpt; presumably the position comparison is only
 * reached for equal combining classes, which would make the sort stable.
 */
61 int compare_cc (const void *a, const void *b)
/* difference of the combining classes of a and b */
64 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
/* (a > b) - (a < b) on the positions: yields 1, 0 or -1 */
68 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
69 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
/*
 * Looks uv up in the three-level UNF_canon table and returns the stored
 * U8 pointer, or NULL when the row is absent. Callers in this file append
 * a non-NULL result as a NUL-terminated string and treat NULL as "no
 * canonical decomposition".
 * NOTE(review): any range check on uv or NULL check on plane sits on
 * lines that are not visible in this excerpt.
 */
72 U8* dec_canonical (UV uv)
/* first level: indexed by uv >> 16 */
77 plane = (U8***)UNF_canon[uv >> 16];
/* second level: bits 8..15 of uv */
80 row = plane[(uv >> 8) & 0xff];
/* third level: low 8 bits of uv; a missing row yields NULL */
81 return row ? row[uv & 0xff] : NULL;
/*
 * Same lookup as dec_canonical, but against the UNF_compat table: returns
 * the stored U8 pointer for uv, or NULL when the row is absent. Callers in
 * this file select it instead of dec_canonical when compatibility
 * decomposition is requested.
 * NOTE(review): any range check on uv or NULL check on plane sits on
 * lines that are not visible in this excerpt.
 */
84 U8* dec_compat (UV uv)
/* first level: indexed by uv >> 16 */
89 plane = (U8***)UNF_compat[uv >> 16];
/* second level: bits 8..15 of uv */
92 row = plane[(uv >> 8) & 0xff];
/* third level: low 8 bits of uv; a missing row yields NULL */
93 return row ? row[uv & 0xff] : NULL;
/*
 * Finds the composite of the pair (uv, uv2). Hangul pairs are handled
 * arithmetically; any other pair is searched for in the UNF_compos table.
 * Callers in this file treat a zero (false) result as "no composite".
 * NOTE(review): the return statements for the guard, for the LV+T case
 * and for the list search are on lines not visible in this excerpt.
 */
96 UV composite_uv (UV uv, UV uv2)
98 UNF_complist ***plane, **row, *cell, *i;
/* guard: uv2 == 0 or either code point above VALID_UTF_MAX */
100 if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
/* Hangul L + V: build the LV syllable from the two indices */
103 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
104 uv -= Hangul_LBase; /* lindex */
105 uv2 -= Hangul_VBase; /* vindex */
106 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
/* Hangul LV syllable + T */
108 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
109 uv2 -= Hangul_TBase; /* tindex */
/* table lookup keyed by uv: uv >> 16, then bits 8..15, then the low 8 bits */
112 plane = UNF_compos[uv >> 16];
115 row = plane[(uv >> 8) & 0xff];
118 cell = row[uv & 0xff];
/* scan the cell's list for uv2; an entry whose nextchar is 0 ends the list */
121 for (i = cell; i->nextchar; i++) {
122 if (uv2 == i->nextchar)
/*
 * Returns the combining class stored for uv in the UNF_combin table, or 0
 * when the row is absent. Callers in this file treat a nonzero result as
 * "uv is a combining character".
 * NOTE(review): the value returned for code points above VALID_UTF_MAX,
 * and any NULL check on plane, are on lines not visible in this excerpt.
 */
128 U8 getCombinClass (UV uv)
/* guard for code points above VALID_UTF_MAX */
131 if (OVER_UTF_MAX(uv))
/* first level: indexed by uv >> 16 */
133 plane = (U8**)UNF_combin[uv >> 16];
/* second level: bits 8..15 of uv */
136 row = plane[(uv >> 8) & 0xff];
/* low 8 bits select the class; a missing row means class 0 */
137 return row ? row[uv & 0xff] : 0;
/*
 * Appends the decomposition of the Hangul syllable uv to sv. The syllable
 * index is split into L, V and T indices, each is turned back into a code
 * point and encoded with uvuni_to_utf8 into a local buffer, and the buffer
 * is appended with sv_catpvn. Code points outside the syllable range are
 * rejected by the Hangul_IsS guard.
 */
140 void sv_cat_decompHangul (SV* sv, UV uv)
142 UV sindex, lindex, vindex, tindex;
/* room for up to three encoded characters; the extra byte is presumably for a terminating NUL, since strlen() is applied to tmp below */
143 U8 *t, tmp[3 * UTF8_MAXLEN + 1];
/* guard: uv is not a Hangul syllable */
145 if (! Hangul_IsS(uv))
/* split the zero-based syllable index into its L, V and T indices */
148 sindex = uv - Hangul_SBase;
149 lindex = sindex / Hangul_NCount;
150 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
151 tindex = sindex % Hangul_TCount;
/* encode the parts one after another; t tracks the write position */
154 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
155 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
/* NOTE(review): a tindex of 0 would encode Hangul_TBase, which Hangul_IsT excludes; confirm the full source guards this line on tindex being nonzero */
157 t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
/* the length is taken with strlen(), so tmp must be NUL-terminated by this point (the terminating store is not visible in this excerpt) */
159 sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
162 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
165 decompose(arg, compat = &PL_sv_no)
172 STRLEN srclen, retlen;
/* the UTF-8 upgrade is applied to a mortal copy, not to arg itself */
179 src = sv_mortalcopy(arg);
180 sv_utf8_upgrade(src);
/* compat defaults to &PL_sv_no; a true value selects dec_compat over dec_canonical below */
182 iscompat = SvTRUE(compat);
185 (void)SvPOK_only(dst);
/* s points at the source bytes; srclen receives their length */
188 s = (U8*)SvPV(src,srclen);
/* walk the source one character at a time (the assignment of e is not visible in this excerpt; presumably the end of the source buffer) */
190 for (p = s; p < e;) {
/* decode one character; retlen receives its length in bytes */
191 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
/* Hangul syllable branch: append its decomposition computed by sv_cat_decompHangul */
194 sv_cat_decompHangul(dst, uv);
/* otherwise consult the decomposition table selected by iscompat */
196 r = iscompat ? dec_compat(uv) : dec_canonical(uv);
/* table entry present: append it as a NUL-terminated string */
198 sv_catpv(dst, (char *)r);
/* no table entry: copy the character's own bytes; p - retlen is their start only if p has already been advanced past the character (that advance is not visible in this excerpt) */
200 sv_catpvn(dst, (char *)p - retlen, retlen);
215 STRLEN srclen, dstlen, retlen, stk_cc_max;
216 U8 *s, *e, *p, *d, curCC;
223 src = sv_mortalcopy(arg);
224 sv_utf8_upgrade(src);
227 s = (U8*)SvPV(src, srclen);
231 sv_setpvn(dst,(const char*)s,srclen);
234 stk_cc_max = 10; /* enough as an initial value? */
235 New(0, stk_cc, stk_cc_max, UNF_cc);
237 d = (U8*)SvPV(dst,dstlen);
240 for (p = d; p < e;) {
242 STRLEN cc_len, cc_iter, cc_pos;
244 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
245 curCC = getCombinClass(uv);
248 if (! (curCC && p < e))
254 stk_cc[cc_pos].cc = curCC;
255 stk_cc[cc_pos].uv = uv;
256 stk_cc[cc_pos].pos = cc_pos;
259 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
260 curCC = getCombinClass(uv);
265 if (stk_cc_max <= cc_pos) { /* extend if need */
266 stk_cc_max = cc_pos + 1;
267 Renew(stk_cc, stk_cc_max, UNF_cc);
269 stk_cc[cc_pos].cc = curCC;
270 stk_cc[cc_pos].uv = uv;
271 stk_cc[cc_pos].pos = cc_pos;
274 /* only one c.c. in cc_len from cc_in, no need of reordering */
278 qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
282 for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
283 p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
299 U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
301 STRLEN srclen, dstlen, tmplen, retlen;
302 bool beginning = TRUE;
307 src = sv_mortalcopy(arg);
308 sv_utf8_upgrade(src);
311 s = (U8*)SvPV(src, srclen);
315 (void)SvPOK_only(dst);
319 /* for uncomposed combining char */
320 tmp = sv_2mortal(newSV(dstlen));
321 (void)SvPOK_only(tmp);
324 for (p = s; p < e;) {
326 uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
329 if (getCombinClass(uvS)) { /* no Starter found yet */
330 d = uvuni_to_utf8(d, uvS);
337 t = tmp_start = (U8*)SvPVX(tmp);
340 /* to the next Starter */
342 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
344 curCC = getCombinClass(uv);
346 if (preCC && preCC == curCC) {
348 t = uvuni_to_utf8(t, uv);
350 uvComp = composite_uv(uvS, uv);
352 if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
353 STRLEN leftcur, rightcur, dstcur;
354 leftcur = UNISKIP(uvComp);
355 rightcur = UNISKIP(uvS) + UNISKIP(uv);
357 if (leftcur > rightcur) {
358 dstcur = d - (U8*)SvPVX(dst);
359 dstlen += leftcur - rightcur;
360 d = (U8*)SvGROW(dst,dstlen) + dstcur;
362 /* preCC not changed to curCC */
364 } else if (! curCC && p < e) { /* blocked */
368 t = uvuni_to_utf8(t, uv);
372 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
373 tmplen = t - tmp_start;
374 if (tmplen) { /* uncomposed combining char */
382 SvCUR_set(dst, d - (U8*)SvPVX(dst));
398 STRLEN srclen, retlen;
399 U8 *s, *e, *p, curCC, preCC;
404 src = sv_mortalcopy(arg);
405 sv_utf8_upgrade(src);
408 s = (U8*)SvPV(src,srclen);
412 for (p = s; p < e; p += retlen) {
413 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
414 curCC = getCombinClass(uv);
415 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
417 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
434 STRLEN srclen, retlen;
435 U8 *s, *e, *p, curCC, preCC;
441 src = sv_mortalcopy(arg);
442 sv_utf8_upgrade(src);
445 s = (U8*)SvPV(src,srclen);
450 for (p = s; p < e; p += retlen) {
451 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
452 curCC = getCombinClass(uv);
454 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
457 /* get NFC/NFKC property */
458 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
460 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
462 else if (isComp2nd(uv))
465 char *canon, *compat;
466 /* NFKC_NO when having compatibility mapping. */
467 canon = (char *) dec_canonical(uv);
468 compat = (char *) dec_compat(uv);
469 if (compat && !(canon && strEQ(canon, compat)))
471 } /* end of get NFC/NFKC property */
519 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
520 XSRETURN_YES; /* NFD_NO or NFKD_NO */
534 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
535 XSRETURN_YES; /* NFC_NO or NFKC_NO */
537 char *canon, *compat;
538 canon = (char *) dec_canonical(uv);
539 compat = (char *) dec_compat(uv);
540 if (compat && (!canon || strNE(canon, compat)))
541 XSRETURN_YES; /* NFC_NO or NFKC_NO */
551 getComposite(uv, uv2)
/* ask composite_uv for the composite of the pair (uv, uv2) */
558 composite = composite_uv(uv, uv2);
/* a zero result means there is no composite: return undef, otherwise a new unsigned-integer SV holding the code point */
559 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
574 if (Hangul_IsS(uv)) {
577 (void)SvPOK_only(dst);
578 sv_cat_decompHangul(dst, uv);
581 rstr = ix ? dec_compat(uv) : dec_canonical(uv);
584 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));