Upgrade to Unicode::Normalize 0.12.
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.xs
CommitLineData
ac5ea531 1
2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
5
6/* These 5 files are prepared by mkheader */
7#include "unfcmb.h"
8#include "unfcan.h"
9#include "unfcpt.h"
10#include "unfcmp.h"
11#include "unfexc.h"
12
13/* Perl 5.6.1 ? */
14#ifndef uvuni_to_utf8
15#define uvuni_to_utf8 uv_to_utf8
16#endif /* uvuni_to_utf8 */
17
18/* Perl 5.6.1 ? */
19#ifndef utf8n_to_uvchr
20#define utf8n_to_uvchr utf8_to_uv
21#endif /* utf8n_to_uvchr */
22
23/* Code points above 0x10ffff are never looked up in the tables below, so they pass through unaffected and without complaint. */
24#define VALID_UTF_MAX (0x10ffff)
25#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
26
/* HANGUL_H */
/*
 * Constants for the arithmetic (de)composition of Hangul syllables.
 * Each *Final value is the last code point of its range and is derived
 * from the corresponding base and count.
 */

/* leading consonants (L) */
#define Hangul_LBase  0x1100
#define Hangul_LCount 19
#define Hangul_LFinal (Hangul_LBase + Hangul_LCount - 1)   /* 0x1112 */

/* vowels (V) */
#define Hangul_VBase  0x1161
#define Hangul_VCount 21
#define Hangul_VFinal (Hangul_VBase + Hangul_VCount - 1)   /* 0x1175 */

/* trailing consonants (T); index 0 (TBase itself) is not a trailing jamo */
#define Hangul_TBase  0x11A7
#define Hangul_TCount 28
#define Hangul_TFinal (Hangul_TBase + Hangul_TCount - 1)   /* 0x11C2 */

/* number of syllables sharing one leading consonant */
#define Hangul_NCount (Hangul_VCount * Hangul_TCount)      /* 588 */

/* precomposed syllables (S) */
#define Hangul_SBase  0xAC00
#define Hangul_SCount (Hangul_LCount * Hangul_NCount)      /* 11172 */
#define Hangul_SFinal (Hangul_SBase + Hangul_SCount - 1)   /* 0xD7A3 */

/* range predicates; every bound is inclusive except the lower bound of IsT */
#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* IsN only tests the offset from SBase; pair it with IsS (see IsLV) */
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL_H */
53
54/* this is used for canonical ordering of combining characters (c.c.). */
55typedef struct {
56 U8 cc; /* combining class */
57 UV uv; /* codepoint */
58 STRLEN pos; /* position */
59} UNF_cc;
60
61int compare_cc(const void *a, const void *b)
62{
63 int ret_cc;
64 ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
65 if(ret_cc) return ret_cc;
66 return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
67}
68
69U8* dec_canonical (UV uv)
70{
71 U8 ***plane, **row;
72 if(OVER_UTF_MAX(uv)) return NULL;
73 plane = (U8***)UNF_canon[uv >> 16];
74 if(! plane) return NULL;
75 row = plane[(uv >> 8) & 0xff];
76 return row ? row[uv & 0xff] : NULL;
77}
78
79U8* dec_compat (UV uv)
80{
81 U8 ***plane, **row;
82 if(OVER_UTF_MAX(uv)) return NULL;
83 plane = (U8***)UNF_compat[uv >> 16];
84 if(! plane) return NULL;
85 row = plane[(uv >> 8) & 0xff];
86 return row ? row[uv & 0xff] : NULL;
87}
88
2a204b45 89UV composite_uv (UV uv, UV uv2)
ac5ea531 90{
91 UNF_complist ***plane, **row, *cell, *i;
92
93 if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0;
94
95 if(Hangul_IsL(uv) && Hangul_IsV(uv2)) {
96 uv -= Hangul_LBase; /* lindex */
97 uv2 -= Hangul_VBase; /* vindex */
98 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
99 }
100 if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
101 uv2 -= Hangul_TBase; /* tindex */
2a204b45 102 return(uv + uv2);
ac5ea531 103 }
104 plane = UNF_compos[uv >> 16];
105 if(! plane) return 0;
106 row = plane[(uv >> 8) & 0xff];
2a204b45 107 if(! row) return 0;
ac5ea531 108 cell = row[uv & 0xff];
2a204b45 109 if(! cell) return 0;
ac5ea531 110 for(i = cell; i->nextchar; i++) {
111 if(uv2 == i->nextchar) return i->composite;
112 }
113 return 0;
114}
115
116U8 getCombinClass (UV uv)
117{
118 U8 **plane, *row;
119 if(OVER_UTF_MAX(uv)) return 0;
120 plane = (U8**)UNF_combin[uv >> 16];
121 if(! plane) return 0;
122 row = plane[(uv >> 8) & 0xff];
123 return row ? row[uv & 0xff] : 0;
124}
125
126void sv_cat_decompHangul (SV* sv, UV uv)
127{
128 UV sindex, lindex, vindex, tindex;
2a204b45 129 U8 *t, tmp[3 * UTF8_MAXLEN + 1];
ac5ea531 130
131 if(! Hangul_IsS(uv)) return;
132
133 sindex = uv - Hangul_SBase;
134 lindex = sindex / Hangul_NCount;
135 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
136 tindex = sindex % Hangul_TCount;
137
2a204b45 138 t = tmp;
ac5ea531 139 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
140 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
141 if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
142 *t = '\0';
2a204b45 143 sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
ac5ea531 144}
145
146MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
147
# decompose(arg, compat)
#   Returns a new UTF-8 string built from arg, one character at a time:
#   Hangul syllables are replaced by their jamo, characters that have a
#   table entry are replaced by that entry, and all others are copied.
#   A true compat selects the compatibility table (dec_compat), otherwise
#   the canonical table (dec_canonical) is used.
#   compat is optional and defaults to false; the prototype was "$", which
#   did not match the two declared parameters.
SV*
decompose(arg, compat = &PL_sv_no)
    SV * arg
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    UV uv;
    SV *src, *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    bool iscompat;
  CODE:
    /* work on a UTF-8 view of the argument without modifying the caller's SV */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }
    iscompat = SvTRUE(compat);

    /* start from a real empty string so that the result is a valid,
       NUL-terminated PV even when nothing gets appended */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    for (p = s; p < e; ) {
        uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
        p += retlen;
        if (Hangul_IsS(uv)) {
            sv_cat_decompHangul(dst, uv);
        } else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else    /* no mapping: copy the original bytes of this character */
                sv_catpvn(dst, (char *)p - retlen, retlen);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
187
188
189
# reorder(arg)
#   Returns a copy of arg (upgraded to UTF-8 when necessary) in which every
#   run of two or more consecutive characters with a non-zero combining
#   class has been sorted by combining class; characters of equal class
#   keep their relative order.
SV*
reorder(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src;
    STRLEN srclen, retlen, stk_cc_max;
    U8 *s, *e, *p, curCC;
    UV uv;
    UNF_cc * stk_cc;
  CODE:
    src = newSVsv(arg);
    if (! SvUTF8(arg))
        sv_utf8_upgrade(src);

    stk_cc_max = 10; /* enough as an initial value? */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;

    p = s;
    while (p < e) {
        U8 *run_start;      /* first byte of the current combining run */
        STRLEN last, k;     /* index of the last stacked entry; iterator */

        uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);
        p += retlen;

        /* class 0, or a combining character that ends the string:
           nothing to reorder */
        if (curCC == 0 || p >= e)
            continue;
        run_start = p - retlen;

        last = 0;
        stk_cc[last].cc  = curCC;
        stk_cc[last].uv  = uv;
        stk_cc[last].pos = last;

        /* collect the remaining characters of the run; the character of
           class 0 that ends it is left for the outer loop */
        while (p < e) {
            uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
            curCC = getCombinClass(uv);
            if (curCC == 0)
                break;
            p += retlen;
            last++;
            if (last >= stk_cc_max) { /* extend if need */
                stk_cc_max = last + 1;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[last].cc  = curCC;
            stk_cc[last].uv  = uv;
            stk_cc[last].pos = last;
        }

        /* a run of a single combining character needs no reordering */
        if (last == 0)
            continue;

        qsort((void*)stk_cc, last + 1, sizeof(UNF_cc), compare_cc);

        /* write the sorted run back over the bytes it came from */
        p = run_start;
        for (k = 0; k <= last; k++)
            p = uvuni_to_utf8(p, stk_cc[k].uv);
    }
    Safefree(stk_cc);
    RETVAL = src;
  OUTPUT:
    RETVAL
255
256
257
# compose(arg)
#   Returns a new UTF-8 string in which each starter (combining class 0)
#   has been combined with the following characters wherever composite_uv()
#   yields a composite that is not excluded by isExclusion() and the pair
#   is not blocked.  Characters that do not compose are kept after their
#   starter.  Combining characters before the first starter are copied
#   unchanged.
SV*
compose(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src, *dst, *tmp;
    U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv = 0, uvS = 0, uvComp;  /* uv is read after the inner loop even
                                    when that loop never ran */
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1; /* equal or shorter, XXX */
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e; ) {
        if (beginning) {
            uvS = utf8n_to_uvchr(p, e - p, &retlen, 0);
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
            p += retlen;
            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                /* same class as the previous uncomposed character: keep it */
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);

                /* S + C + S => S-S + C would be also blocked. */
                if (uvComp && ! isExclusion(uvComp) && preCC <= curCC)
                {
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    /* uv is the next starter; it becomes uvS below */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        tmplen = t - tmp_start;
        if (tmplen) { /* uncomposed combining char */
            t = (U8*)SvPVX(tmp);
            while (tmplen--) *d++ = *t++;
        }
        uvS = uv;
    } /* for */

    /* newSV() leaves the buffer uninitialised: terminate the string before
       publishing its length */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 343
344
345
# XSUB without a CODE section: exposes the C function getCombinClass()
# defined earlier in this file.  Takes a code point (UV), returns a U8.
346U8
347getCombinClass(uv)
348 UV uv
349
# XSUB without a CODE section: exposes isExclusion() for a code point (UV),
# returning a bool.  NOTE(review): isExclusion is not defined in the visible
# part of this file; presumably it comes from one of the headers prepared
# by mkheader -- confirm.
350bool
2a204b45 351isExclusion(uv)
ac5ea531 352 UV uv
353
# getComposite(uv, uv2)
#   Returns the code point that the pair composes to according to
#   composite_uv(), or undef when composite_uv() returns 0.
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV comp;
  CODE:
    comp = composite_uv(uv, uv2);
    if (comp)
        RETVAL = newSVuv(comp);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 366
# getCanon(uv) / getCompat(uv)
#   Returns the decomposition of a single code point as a UTF-8 string:
#   the jamo sequence for a Hangul syllable, otherwise the entry found by
#   dec_canonical() (getCanon) or dec_compat() (getCompat, alias index 1).
#   Returns undef when the table has no entry.
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    if (! Hangul_IsS(uv)) {
        /* ix is the alias index: 0 for getCanon, 1 for getCompat */
        rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (rstr == NULL)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    } else {
        RETVAL = newSV(1);
        (void)SvPOK_only(RETVAL);
        sv_cat_decompHangul(RETVAL, uv);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
390