6 /* These 5 files are prepared by mkheader */
/* Fallback aliases: provide the uvuni-style names used by the code below
 * in terms of uv_to_utf8 / utf8_to_uv.
 * NOTE(review): the opening guard for uvuni_to_utf8 is not visible in this
 * excerpt; presumably it mirrors the utf8n_to_uvuni guard below - confirm. */
15 #define uvuni_to_utf8 uv_to_utf8
16 #endif /* uvuni_to_utf8 */
/* utf8n_to_uvuni is defined as utf8_to_uv only when not already defined. */
19 #ifndef utf8n_to_uvuni
20 #define utf8n_to_uvuni utf8_to_uv
21 #endif /* utf8n_to_uvuni */
23 /* At present, code points above 0x10ffff appear to be left unaffected, without any complaint (to be confirmed). */
/* VALID_UTF_MAX is the upper bound tested before the table lookups below;
 * OVER_UTF_MAX(uv) is true when uv is strictly greater than that bound. */
24 #define VALID_UTF_MAX (0x10ffff)
25 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
/* Constants for the arithmetic composition / decomposition of Hangul
 * syllables. S is the precomposed syllable range; L, V and T are the three
 * ranges whose indices (lindex, vindex, tindex) are combined in
 * composite_uv and split apart in sv_cat_decompHangul.
 * For every range, Final == Base + Count - 1. */
/* S range: SCount == LCount * VCount * TCount (19 * 21 * 28). */
28 #define Hangul_SBase 0xAC00
29 #define Hangul_SFinal 0xD7A3
30 #define Hangul_SCount 11172
/* NCount == VCount * TCount: number of syllables sharing one lindex. */
32 #define Hangul_NCount 588
/* L range. */
34 #define Hangul_LBase 0x1100
35 #define Hangul_LFinal 0x1112
36 #define Hangul_LCount 19
/* V range. */
38 #define Hangul_VBase 0x1161
39 #define Hangul_VFinal 0x1175
40 #define Hangul_VCount 21
/* T range. TBase is one below the first real T value, so that tindex 0
 * can stand for "no T part"; TCount includes that empty slot. */
42 #define Hangul_TBase 0x11A7
43 #define Hangul_TFinal 0x11C2
44 #define Hangul_TCount 28
/* Predicates on a code point u. */
/* IsS: u lies within [SBase, SFinal]. */
46 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* IsN: the offset from SBase is a multiple of TCount, i.e. tindex is 0.
 * It performs no range check itself; IsLV combines it with IsS. */
47 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
/* IsLV: a syllable whose tindex is 0. */
48 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
/* IsL / IsV: u lies within the inclusive L / V range. */
49 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
50 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* IsT: note the strict '<' - TBase itself is excluded from the T range. */
51 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
54 /* This record is used for canonical ordering of combining characters (c.c.). */
56 U8 cc; /* combining class; compare_cc takes the difference of this field first */
57 UV uv; /* codepoint */
58 STRLEN pos; /* position at which the entry was stored; compare_cc returns the difference of positions */
/* Comparator passed to qsort() for an array of UNF_cc records.
 * Only part of the body is visible in this excerpt: the difference of the
 * combining classes is stored in ret_cc, and the visible return statement
 * yields the difference of the recorded positions.
 * NOTE(review): presumably ret_cc is returned when it is non-zero, which
 * would make pos a tie-breaker keeping equal-class entries in their
 * original order - confirm against the full source. */
61 int compare_cc(const void *a, const void *b)
64 ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
67 return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
/* Looks up uv in the UNF_canon table and returns the stored entry.
 * The table is indexed in three steps: plane (uv >> 16), row (bits 8-15 of
 * uv), then cell (low 8 bits of uv). Returns NULL when the row is absent.
 * Callers in this file treat a non-NULL result as a NUL-terminated string.
 * NOTE(review): any range check on uv and any NULL check on the plane are
 * not visible in this excerpt. */
70 U8* dec_canonical (UV uv)
75 plane = (U8***)UNF_canon[uv >> 16];
78 row = plane[(uv >> 8) & 0xff];
79 return row ? row[uv & 0xff] : NULL;
/* Looks up uv in the UNF_compat table and returns the stored entry.
 * Indexing is the same three-step scheme as dec_canonical: plane
 * (uv >> 16), row (bits 8-15 of uv), cell (low 8 bits of uv). Returns NULL
 * when the row is absent. Callers in this file treat a non-NULL result as
 * a NUL-terminated string.
 * NOTE(review): any range check on uv and any NULL check on the plane are
 * not visible in this excerpt. */
82 U8* dec_compat (UV uv)
87 plane = (U8***)UNF_compat[uv >> 16];
90 row = plane[(uv >> 8) & 0xff];
91 return row ? row[uv & 0xff] : NULL;
/* Computes the composite of the pair (uv, uv2).
 * Hangul pairs are handled arithmetically; everything else goes through
 * the UNF_compos table. The XS wrapper getComposite treats a result of 0
 * as "no composite".
 * NOTE(review): several return statements and NULL checks are not visible
 * in this excerpt. */
94 UV composite_uv (UV uv, UV uv2)
96 UNF_complist ***plane, **row, *cell, *i;
/* test for a zero uv2 or an operand beyond VALID_UTF_MAX
 * (the action taken is not visible here) */
98 if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
/* Hangul L + V: build the syllable whose tindex is 0 */
101 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
102 uv -= Hangul_LBase; /* lindex */
103 uv2 -= Hangul_VBase; /* vindex */
104 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
/* Hangul LV + T: uv2 becomes the tindex (the return is not visible here) */
106 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
107 uv2 -= Hangul_TBase; /* tindex */
/* table lookup keyed on uv: plane, then row, then cell */
110 plane = UNF_compos[uv >> 16];
113 row = plane[(uv >> 8) & 0xff];
116 cell = row[uv & 0xff];
/* the cell is an array ended by an entry whose nextchar is 0;
 * scan it for the entry matching uv2 */
119 for (i = cell; i->nextchar; i++) {
120 if (uv2 == i->nextchar)
/* Returns the combining class stored for uv in the UNF_combin table,
 * indexed by plane (uv >> 16), row (bits 8-15 of uv) and cell (low 8 bits
 * of uv). Yields 0 when the row is absent.
 * NOTE(review): the result for uv beyond VALID_UTF_MAX and any NULL check
 * on the plane are not visible in this excerpt. */
126 U8 getCombinClass (UV uv)
129 if (OVER_UTF_MAX(uv))
131 plane = (U8**)UNF_combin[uv >> 16];
134 row = plane[(uv >> 8) & 0xff];
135 return row ? row[uv & 0xff] : 0;
/* Appends the decomposition of the Hangul syllable uv to sv as UTF-8.
 * The offset of uv from Hangul_SBase is split into lindex, vindex and
 * tindex; each index is added to its base and encoded into a local buffer,
 * which is then appended to sv with sv_catpvn().
 * NOTE(review): not visible in this excerpt are the action taken for
 * non-syllables, the initialisation of t, any condition guarding the T
 * part, and the NUL terminator that the strlen() call relies on. */
138 void sv_cat_decompHangul (SV* sv, UV uv)
140 UV sindex, lindex, vindex, tindex;
/* room for three encoded characters plus one extra byte */
141 U8 *t, tmp[3 * UTF8_MAXLEN + 1];
/* test for code points outside [Hangul_SBase, Hangul_SFinal] */
143 if (! Hangul_IsS(uv))
/* split the syllable offset into its three indices */
146 sindex = uv - Hangul_SBase;
147 lindex = sindex / Hangul_NCount;
148 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
149 tindex = sindex % Hangul_TCount;
/* encode each part; uvuni_to_utf8 returns the advanced write pointer */
152 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
153 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
155 t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
/* the length is taken with strlen(), so tmp must be NUL-terminated here */
157 sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
160 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
163 decompose(arg, compat = &PL_sv_no)
170 STRLEN srclen, retlen;
/* XSUB body. compat defaults to PL_sv_no, so iscompat is false unless the
 * caller passes a true value. The result is accumulated in dst.
 * NOTE(review): the conditions selecting between the three append calls
 * below, and the statement advancing p, are not visible in this excerpt. */
/* work on a mortal copy upgraded to UTF-8, leaving arg itself untouched */
177 src = sv_mortalcopy(arg);
178 sv_utf8_upgrade(src);
180 iscompat = SvTRUE(compat);
183 (void)SvPOK_only(dst);
186 s = (U8*)SvPV(src,srclen);
/* decode one code point per iteration */
188 for (p = s; p < e;) {
189 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
/* Hangul syllables are decomposed arithmetically */
192 sv_cat_decompHangul(dst, uv);
/* other code points use the table selected by iscompat */
194 r = iscompat ? dec_compat(uv) : dec_canonical(uv);
/* a table entry is appended as a NUL-terminated string */
196 sv_catpv(dst, (char *)r);
/* otherwise the original bytes are copied; "p - retlen" implies that p
 * has already been moved past the character at this point */
198 sv_catpvn(dst, (char *)p - retlen, retlen);
213 STRLEN srclen, dstlen, retlen, stk_cc_max;
214 U8 *s, *e, *p, *d, curCC;
221 src = sv_mortalcopy(arg);
222 sv_utf8_upgrade(src);
225 s = (U8*)SvPV(src, srclen);
229 sv_setpvn(dst,(const char*)s,srclen);
232 stk_cc_max = 10; /* enough as an initial value? */
233 New(0, stk_cc, stk_cc_max, UNF_cc);
235 d = (U8*)SvPV(dst,dstlen);
238 for (p = d; p < e;) {
240 STRLEN cc_len, cc_iter, cc_pos;
242 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
243 curCC = getCombinClass(uv);
246 if (! (curCC && p < e))
252 stk_cc[cc_pos].cc = curCC;
253 stk_cc[cc_pos].uv = uv;
254 stk_cc[cc_pos].pos = cc_pos;
257 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
258 curCC = getCombinClass(uv);
263 if (stk_cc_max <= cc_pos) { /* extend if need */
264 stk_cc_max = cc_pos + 1;
265 Renew(stk_cc, stk_cc_max, UNF_cc);
267 stk_cc[cc_pos].cc = curCC;
268 stk_cc[cc_pos].uv = uv;
269 stk_cc[cc_pos].pos = cc_pos;
272 /* only one c.c. was collected in this run, so no reordering is needed */
276 qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
280 for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
281 p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
297 U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
299 STRLEN srclen, dstlen, tmplen, retlen;
300 bool beginning = TRUE;
305 src = sv_mortalcopy(arg);
306 sv_utf8_upgrade(src);
309 s = (U8*)SvPV(src, srclen);
313 (void)SvPOK_only(dst);
317 /* for uncomposed combining char */
318 tmp = sv_2mortal(newSV(dstlen));
319 (void)SvPOK_only(tmp);
322 for (p = s; p < e;) {
324 uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
327 if (getCombinClass(uvS)) { /* no Starter found yet */
328 d = uvuni_to_utf8(d, uvS);
335 t = tmp_start = (U8*)SvPVX(tmp);
338 /* to the next Starter */
340 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
342 curCC = getCombinClass(uv);
344 if (preCC && preCC == curCC) {
346 t = uvuni_to_utf8(t, uv);
348 uvComp = composite_uv(uvS, uv);
350 if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
351 STRLEN leftcur, rightcur, dstcur;
352 leftcur = UNISKIP(uvComp);
353 rightcur = UNISKIP(uvS) + UNISKIP(uv);
355 if (leftcur > rightcur) {
356 dstcur = d - (U8*)SvPVX(dst);
357 dstlen += leftcur - rightcur;
358 d = (U8*)SvGROW(dst,dstlen) + dstcur;
360 /* preCC not changed to curCC */
362 } else if (! curCC && p < e) { /* blocked */
366 t = uvuni_to_utf8(t, uv);
370 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
371 tmplen = t - tmp_start;
372 if (tmplen) { /* uncomposed combining char */
379 SvCUR_set(dst, d - (U8*)SvPVX(dst));
395 STRLEN srclen, retlen;
396 U8 *s, *e, *p, curCC, preCC;
401 src = sv_mortalcopy(arg);
402 sv_utf8_upgrade(src);
405 s = (U8*)SvPV(src,srclen);
409 for (p = s; p < e; p += retlen) {
410 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
411 curCC = getCombinClass(uv);
412 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
414 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
431 STRLEN srclen, retlen;
432 U8 *s, *e, *p, curCC, preCC;
438 src = sv_mortalcopy(arg);
439 sv_utf8_upgrade(src);
442 s = (U8*)SvPV(src,srclen);
447 for (p = s; p < e; p += retlen) {
448 uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
449 curCC = getCombinClass(uv);
451 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
454 /* get NFC/NFKC property */
455 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
457 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
459 else if (isComp2nd(uv))
462 char *canon, *compat;
464 * NFKC_NO when the character has a compatibility mapping,
465 * i.e. dec_compat(uv) is defined and differs from dec_canonical(uv).
467 canon = (char *) dec_canonical(uv);
468 compat = (char *) dec_compat(uv);
469 if (compat && (!canon || strNE(canon, compat)))
471 } /* end of get NFC/NFKC property */
519 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
520 XSRETURN_YES; /* NFD_NO or NFKD_NO */
534 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
535 XSRETURN_YES; /* NFC_NO or NFKC_NO */
537 char *canon, *compat;
538 canon = (char *) dec_canonical(uv);
539 compat = (char *) dec_compat(uv);
540 if (compat && (!canon || strNE(canon, compat)))
541 XSRETURN_YES; /* NFC_NO or NFKC_NO */
551 getComposite(uv, uv2)
/* XSUB body: delegates to composite_uv(); a non-zero result is returned as
 * a new unsigned-integer SV, and a zero result is returned as undef. */
558 composite = composite_uv(uv, uv2);
559 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
574 if (Hangul_IsS(uv)) {
577 (void)SvPOK_only(dst);
578 sv_cat_decompHangul(dst, uv);
581 rstr = ix ? dec_compat(uv) : dec_canonical(uv);
584 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));