6 /* These 5 files are prepared by mkheader */
/* Fallback alias: uvuni_to_utf8 is mapped onto uv_to_utf8.
 * NOTE(review): the #ifndef that opens this guard is not visible in this
 * excerpt; the #endif below closes it. Presumably a compatibility shim for
 * perls that only provide the older name -- confirm. */
15 #define uvuni_to_utf8 uv_to_utf8
16 #endif /* uvuni_to_utf8 */
/* Fallback alias: when utf8n_to_uvchr is not already defined, map it onto
 * utf8_to_uv. */
19 #ifndef utf8n_to_uvchr
20 #define utf8n_to_uvchr utf8_to_uv
21 #endif /* utf8n_to_uvchr */
23 /* At present, characters above 0x10ffff are passed through unchanged and without complaint (unverified). */
/* VALID_UTF_MAX: largest code point accepted by the table lookup helpers. */
24 #define VALID_UTF_MAX (0x10ffff)
/* OVER_UTF_MAX(uv): true when uv is strictly greater than VALID_UTF_MAX. */
25 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
/* Constants for Hangul syllable arithmetic.
 * S = precomposed syllable, L = leading consonant, V = vowel,
 * T = trailing consonant.  Each group gives first code point (Base),
 * last code point (Final) and the number of slots (Count). */
28 #define Hangul_SBase 0xAC00
29 #define Hangul_SFinal 0xD7A3
/* SCount = LCount * VCount * TCount = 19 * 21 * 28 = SFinal - SBase + 1. */
30 #define Hangul_SCount 11172
/* NCount = VCount * TCount = 21 * 28: number of syllables per leading
 * consonant (sindex / NCount yields the leading index). */
32 #define Hangul_NCount 588
34 #define Hangul_LBase 0x1100
35 #define Hangul_LFinal 0x1112
36 #define Hangul_LCount 19
38 #define Hangul_VBase 0x1161
39 #define Hangul_VFinal 0x1175
40 #define Hangul_VCount 21
/* TBase is one below the first code point accepted by Hangul_IsT (which uses
 * a strict '<'): trailing index 0 means "no trailing consonant", and
 * TCount = TFinal - TBase + 1 = 28 includes that empty slot. */
42 #define Hangul_TBase 0x11A7
43 #define Hangul_TFinal 0x11C2
44 #define Hangul_TCount 28
/* Predicates.  Every macro evaluates its argument more than once, so do not
 * pass expressions with side effects. */
/* IsS: u lies in [SBase, SFinal]. */
46 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* IsN: (u - SBase) is a multiple of TCount, i.e. the trailing index is 0.
 * Only meaningful once u is known to be in the syllable range; see IsLV. */
47 #define Hangul_IsN(u) (! (((u) - Hangul_SBase) % Hangul_TCount))
/* IsLV: u is a syllable with no trailing consonant. */
48 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
/* IsL / IsV: u lies in the inclusive leading-consonant / vowel range. */
49 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
50 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* IsT: u lies in (TBase, TFinal]; TBase itself is deliberately excluded. */
51 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
54 /* Record describing one character of a run that is being put into canonical order (c.c. = combining characters). */
/* NOTE(review): the line that opens this struct and the line that closes it
 * are not visible in this excerpt; the other code in this file refers to the
 * record type as UNF_cc. */
56 U8 cc; /* combining class; primary sort key in compare_cc */
57 UV uv; /* codepoint */
58 STRLEN pos; /* position within the run; tie-breaker in compare_cc */
61 int compare_cc(const void *a, const void *b)
64 ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
65 if(ret_cc) return ret_cc;
66 return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
69 U8* dec_canonical (UV uv)
72 if(OVER_UTF_MAX(uv)) return NULL;
73 plane = (U8***)UNF_canon[uv >> 16];
74 if(! plane) return NULL;
75 row = plane[(uv >> 8) & 0xff];
76 return row ? row[uv & 0xff] : NULL;
79 U8* dec_compat (UV uv)
82 if(OVER_UTF_MAX(uv)) return NULL;
83 plane = (U8***)UNF_compat[uv >> 16];
84 if(! plane) return NULL;
85 row = plane[(uv >> 8) & 0xff];
86 return row ? row[uv & 0xff] : NULL;
/*
 * getComposite: find the composite character for the pair (uv, uv2).
 *
 * Returns 0 when uv2 is 0, when either argument is above VALID_UTF_MAX, or
 * when UNF_compos has no plane for uv.  Hangul L+V and LV+T pairs are handled
 * arithmetically before the table is consulted.
 *
 * NOTE(review): several lines of this function are not visible in this
 * excerpt (local braces, the return for the LV+T case, any NULL checks on
 * row and cell, and the final return).  The comments below describe only
 * the statements that are visible.
 */
89 UV getComposite (UV uv, UV uv2)
91 UNF_complist ***plane, **row, *cell, *i;
/* Nothing composes with a zero second character or with out-of-range input. */
93 if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0;
/* Hangul leading consonant + vowel: compute the LV syllable directly. */
95 if(Hangul_IsL(uv) && Hangul_IsV(uv2)) {
96 uv -= Hangul_LBase; /* lindex */
97 uv2 -= Hangul_VBase; /* vindex */
/* SBase + (lindex * VCount + vindex) * TCount */
98 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
/* Hangul LV syllable + trailing consonant: uv2 becomes the trailing index. */
100 if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
101 uv2 -= Hangul_TBase; /* tindex */
/* General case: three-level table indexed by uv >> 16, bits 8..15 of uv,
 * and the low 8 bits of uv. */
104 plane = UNF_compos[uv >> 16];
105 if(! plane) return 0;
106 row = plane[(uv >> 8) & 0xff];
108 cell = row[uv & 0xff];
/* The cell is a list terminated by an entry whose nextchar is 0; the entry
 * whose nextchar equals uv2 supplies the composite. */
110 for(i = cell; i->nextchar; i++) {
111 if(uv2 == i->nextchar) return i->composite;
116 U8 getCombinClass (UV uv)
119 if(OVER_UTF_MAX(uv)) return 0;
120 plane = (U8**)UNF_combin[uv >> 16];
121 if(! plane) return 0;
122 row = plane[(uv >> 8) & 0xff];
123 return row ? row[uv & 0xff] : 0;
126 void sv_cat_decompHangul (SV* sv, UV uv)
128 UV sindex, lindex, vindex, tindex;
129 U8 *t, temp[3 * UTF8_MAXLEN + 1];
131 if(! Hangul_IsS(uv)) return;
133 sindex = uv - Hangul_SBase;
134 lindex = sindex / Hangul_NCount;
135 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
136 tindex = sindex % Hangul_TCount;
139 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
140 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
141 if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
143 sv_catpvn(sv, (char *)temp, strlen((char *)temp));
146 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
150 decompose(arg, compat)
156 STRLEN srclen, dstlen, retlen;
157 U8 *s, *e, *p, *d, *r;
/*
 * decompose: builds in dst the decomposed form of arg, one character at a
 * time.  A true `compat` selects the dec_compat table, otherwise
 * dec_canonical is used.
 * NOTE(review): many lines of this XSUB are not visible in this excerpt
 * (parameter typing, creation of dst, the loop header, the advance of p,
 * the else joining the Hangul and table branches, the return).  The comments
 * below describe only the visible statements.
 */
/* Work on a mortal copy upgraded to UTF-8 rather than on arg itself. */
164 src = sv_mortalcopy(arg);
165 sv_utf8_upgrade(src);
168 iscompat = SvTRUE(compat);
171 (void)SvPOK_only(dst);
174 s = (U8*)SvPV(src,srclen);
/* Decode the character at p, reading at most e - p bytes; retlen receives
 * its encoded length. */
177 uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
/* Hangul syllables are decomposed arithmetically, not via the tables. */
179 if(Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv);
/* Table lookup; r is a NUL-terminated string when a decomposition exists. */
181 r = iscompat ? dec_compat(uv) : dec_canonical(uv);
182 if(r) sv_catpv(dst, (char *)r);
/* No decomposition: copy the original bytes.  p - retlen is the start of the
 * character, which implies p was already advanced past it (advance not
 * visible here). */
183 else sv_catpvn(dst, (char *)p - retlen, retlen);
198 STRLEN srclen, retlen, stk_cc_max;
199 U8 *s, *e, *p, curCC;
/*
 * Body of the XSUB that puts runs of combining characters into canonical
 * order: each run of characters with a non-zero combining class is collected
 * into stk_cc, sorted with compare_cc, and re-encoded.
 * NOTE(review): the XSUB signature, the creation of src, the loop headers,
 * the advance of p, the updates of cc_pos and the release of stk_cc are not
 * visible in this excerpt.  The comments below describe only the visible
 * statements.
 */
/* Upgrade src to UTF-8 only when arg is not already flagged as UTF-8. */
204 if(! SvUTF8(arg)) sv_utf8_upgrade(src);
206 stk_cc_max = 10; /* initial capacity of stk_cc; grown on demand below */
207 New(0, stk_cc, stk_cc_max, UNF_cc);
209 s = (U8*)SvPV(src,srclen);
213 STRLEN cc_len, cc_iter, cc_pos;
215 uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
218 curCC = getCombinClass(uv);
/* Skip characters with class 0 and a combining character that is the last
 * character of the string (nothing to reorder it with).  Otherwise remember
 * where the run starts: p - retlen is the start of the current character. */
219 if(! (curCC && p < e)) continue; else cc_in = p - retlen;
/* First record of the run; pos stores the index so compare_cc can keep
 * equal-class characters in their original order. */
221 stk_cc[cc_pos].cc = curCC;
222 stk_cc[cc_pos].uv = uv;
223 stk_cc[cc_pos].pos = cc_pos;
/* Following characters of the run. */
226 uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
227 curCC = getCombinClass(uv);
/* NOTE(review): capacity grows to exactly cc_pos + 1, so once the initial
 * capacity is exceeded every further record triggers another Renew. */
231 if(stk_cc_max <= cc_pos) { /* extend if needed */
232 stk_cc_max = cc_pos + 1;
233 Renew(stk_cc, stk_cc_max, UNF_cc);
235 stk_cc[cc_pos].cc = curCC;
236 stk_cc[cc_pos].uv = uv;
237 stk_cc[cc_pos].pos = cc_pos;
240 /* the run holds a single combining character: nothing to reorder */
241 if(!cc_pos) continue;
/* cc_pos is the index of the last record, hence cc_pos + 1 elements. */
243 qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
/* Re-encode the sorted code points at p.
 * NOTE(review): p is presumably reset to cc_in before this loop -- confirm. */
247 for(cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
248 p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
264 U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
266 STRLEN srclen, dstlen, tmplen, dstcur, retlen;
267 bool beginning = TRUE;
/*
 * Body of the XSUB that composes characters: a starter (uvS) is combined
 * with following characters through getComposite; characters that cannot be
 * combined are parked in tmp and copied out after the starter.
 * NOTE(review): the XSUB signature, the loop headers, the advance of p, the
 * updates of uvS / preCC / beginning and several braces are not visible in
 * this excerpt.  The comments below describe only the visible statements.
 */
/* Work on a mortal copy upgraded to UTF-8 rather than on arg itself. */
272 src = sv_mortalcopy(arg);
273 sv_utf8_upgrade(src);
275 s = (U8*)SvPV(src, srclen);
277 dstlen = srclen + 1; /* assumes the result is never longer than the source; flagged XXX by the original author */
278 dst = sv_2mortal(newSV(dstlen));
279 (void)SvPOK_only(dst);
283 /* scratch buffer for combining characters that were not composed */
284 tmp = sv_2mortal(newSV(dstlen));
285 (void)SvPOK_only(tmp);
290 uvS = utf8n_to_uvchr(p, e - p, &retlen, 0);
/* A non-zero combining class means uvS is not a starter: emit it as is. */
293 if (getCombinClass(uvS)){ /* no Starter found yet */
294 d = uvuni_to_utf8(d, uvS);
/* Start with an empty scratch buffer for this starter. */
301 t = tmp_start = (U8*)SvPVX(tmp);
304 /* scan forward to the next Starter */
306 uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
308 curCC = getCombinClass(uv);
/* Same non-zero class as the previous character: park uv in the scratch
 * buffer. */
310 if(preCC && preCC == curCC) {
312 t = uvuni_to_utf8(t, uv);
314 uvComp = getComposite(uvS, uv);
316 /* S + C + S => S-S + C would be also blocked. */
/* Accept the composite only if one exists, getExclusion does not reject it,
 * and the class did not decrease (preCC <= curCC). */
317 if( uvComp && ! getExclusion(uvComp) && preCC <= curCC)
319 /* preCC not changed to curCC */
/* A class-0 character that is not the last character of the string. */
321 } else if (! curCC && p < e) { /* blocked */
325 t = uvuni_to_utf8(t, uv);
329 d = uvuni_to_utf8(d, uvS); /* composed char */
/* The assignment inside the condition is intentional: tmplen is the number
 * of bytes parked in the scratch buffer. */
330 if(tmplen = t - tmp_start) { /* uncomposed combining char */
/* NOTE(review): t must point back at tmp_start before this copy; the reset
 * is not visible in this excerpt -- confirm. */
332 while(tmplen--) *d++ = *t++;
/* Record how many bytes were written to dst. */
336 dstcur = d - (U8*)SvPVX(dst);
337 SvCUR_set(dst, dstcur);
351 getComposite(uv, uv2)
367 (void)SvPOK_only(dst);
368 sv_cat_decompHangul(dst, uv);
371 rstr = ix ? dec_compat(uv) : dec_canonical(uv);
372 if(!rstr) XSRETURN_UNDEF;
373 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));