switch unfcmb.h to employ 'STDCHAR' in place of char
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.xs
CommitLineData
ac5ea531 1
2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
5
6/* These 5 files are prepared by mkheader */
7#include "unfcmb.h"
8#include "unfcan.h"
9#include "unfcpt.h"
10#include "unfcmp.h"
11#include "unfexc.h"
12
/*
 * Compatibility aliases, supplied only when the perl headers have not
 * already defined the name (the original note tentatively attributes
 * this to Perl 5.6.1).
 */
#if !defined(uvuni_to_utf8)
#  define uvuni_to_utf8 uv_to_utf8
#endif /* uvuni_to_utf8 */

#if !defined(utf8n_to_uvchr)
#  define utf8n_to_uvchr utf8_to_uv
#endif /* utf8n_to_uvchr */
22
/*
 * Highest code point the lookup tables are consulted for.  The helpers in
 * this file test OVER_UTF_MAX() first and simply report "no entry" for
 * anything above the limit, without raising any complaint.
 */
#define VALID_UTF_MAX    (0x10FFFF)
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)
26
/* HANGUL_H */

/* first code point of each range */
#define Hangul_SBase  0xAC00	/* precomposed syllables */
#define Hangul_LBase  0x1100	/* leading consonants */
#define Hangul_VBase  0x1161	/* vowels */
#define Hangul_TBase  0x11A7	/* trailing consonants; TBase itself is excluded by Hangul_IsT */

/* last code point of each range */
#define Hangul_SFinal 0xD7A3
#define Hangul_LFinal 0x1112
#define Hangul_VFinal 0x1175
#define Hangul_TFinal 0x11C2

/* sizes */
#define Hangul_SCount 11172
#define Hangul_NCount 588
#define Hangul_LCount 19
#define Hangul_VCount 21
#define Hangul_TCount 28

/* range predicates; note that each one evaluates its argument more than once */
#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
/* true when the offset from SBase is a multiple of TCount; only meaningful for syllables */
#define Hangul_IsN(u)  ((((u) - Hangul_SBase) % Hangul_TCount) == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  (((u) > Hangul_TBase) && ((u) <= Hangul_TFinal))
/* HANGUL_H */
53
54/* this is used for canonical ordering of combining characters (c.c.). */
55typedef struct {
56 U8 cc; /* combining class */
57 UV uv; /* codepoint */
58 STRLEN pos; /* position */
59} UNF_cc;
60
61int compare_cc(const void *a, const void *b)
62{
63 int ret_cc;
64 ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
65 if(ret_cc) return ret_cc;
66 return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
67}
68
69U8* dec_canonical (UV uv)
70{
71 U8 ***plane, **row;
72 if(OVER_UTF_MAX(uv)) return NULL;
73 plane = (U8***)UNF_canon[uv >> 16];
74 if(! plane) return NULL;
75 row = plane[(uv >> 8) & 0xff];
76 return row ? row[uv & 0xff] : NULL;
77}
78
79U8* dec_compat (UV uv)
80{
81 U8 ***plane, **row;
82 if(OVER_UTF_MAX(uv)) return NULL;
83 plane = (U8***)UNF_compat[uv >> 16];
84 if(! plane) return NULL;
85 row = plane[(uv >> 8) & 0xff];
86 return row ? row[uv & 0xff] : NULL;
87}
88
89UV getComposite (UV uv, UV uv2)
90{
91 UNF_complist ***plane, **row, *cell, *i;
92
93 if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0;
94
95 if(Hangul_IsL(uv) && Hangul_IsV(uv2)) {
96 uv -= Hangul_LBase; /* lindex */
97 uv2 -= Hangul_VBase; /* vindex */
98 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
99 }
100 if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
101 uv2 -= Hangul_TBase; /* tindex */
102 return (uv + uv2);
103 }
104 plane = UNF_compos[uv >> 16];
105 if(! plane) return 0;
106 row = plane[(uv >> 8) & 0xff];
107 if(! row) return 0;
108 cell = row[uv & 0xff];
109 if(! cell) return 0;
110 for(i = cell; i->nextchar; i++) {
111 if(uv2 == i->nextchar) return i->composite;
112 }
113 return 0;
114}
115
116U8 getCombinClass (UV uv)
117{
118 U8 **plane, *row;
119 if(OVER_UTF_MAX(uv)) return 0;
120 plane = (U8**)UNF_combin[uv >> 16];
121 if(! plane) return 0;
122 row = plane[(uv >> 8) & 0xff];
123 return row ? row[uv & 0xff] : 0;
124}
125
126void sv_cat_decompHangul (SV* sv, UV uv)
127{
128 UV sindex, lindex, vindex, tindex;
129 U8 *t, temp[3 * UTF8_MAXLEN + 1];
130
131 if(! Hangul_IsS(uv)) return;
132
133 sindex = uv - Hangul_SBase;
134 lindex = sindex / Hangul_NCount;
135 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
136 tindex = sindex % Hangul_TCount;
137
138 t = temp;
139 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
140 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
141 if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
142 *t = '\0';
143 sv_catpvn(sv, (char *)temp, strlen((char *)temp));
144}
145
146MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
147
148
# Decompose every character of ARG and return the result as a new
# UTF-8 string.  COMPAT selects the table: true means compatibility
# decomposition, false means canonical decomposition.  Hangul syllables
# are decomposed arithmetically; characters without a table entry are
# copied through unchanged.
SV*
decompose(arg, compat)
    SV * arg
    SV * compat
  PROTOTYPE: $$
  PREINIT:
    SV *src, *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    UV uv;
    bool iscompat;
  CODE:
    /* work on UTF-8; upgrade a mortal copy so the caller's SV is untouched */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    iscompat = SvTRUE(compat);

    /* start from a real, NUL-terminated empty string so that an empty
     * input still yields a well-formed result */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;
    for (p = s; p < e; ) {
        uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
        p += retlen;
        if (Hangul_IsS(uv)) {
            sv_cat_decompHangul(dst, uv);
        } else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else	/* no mapping: copy the original bytes */
                sv_catpvn(dst, (char *)p - retlen, retlen);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
189
190
191
# Return a copy of ARG (upgraded to UTF-8 when necessary) in which each
# run of two or more consecutive characters with a non-zero combining
# class has been sorted by combining class.  Characters of equal class
# keep their original relative order.
SV*
reorder(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src;
    STRLEN srclen, retlen, stk_cc_max;
    U8 *s, *e, *p, curCC;
    UV uv;
    UNF_cc * stk_cc;
  CODE:
    src = newSVsv(arg);
    if(! SvUTF8(arg)) sv_utf8_upgrade(src);

    /* the buffer is rewritten in place below, so ask for a writable
     * string buffer rather than a read-only view */
    s = (U8*)SvPV_force(src, srclen);
    e = s + srclen;

    stk_cc_max = 10; /* enough as an initial value? */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    for (p = s; p < e; ) {
        U8 *cc_in;
        STRLEN cc_iter, cc_pos;

        uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
        p += retlen;
        curCC = getCombinClass(uv);

        /* a starter, or a combining character that ends the string:
         * nothing to reorder */
        if (! (curCC && p < e))
            continue;

        /* first combining character of a run */
        cc_in = p - retlen;
        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;

        /* collect the rest of the run, up to the next starter */
        while (p < e) {
            uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
            curCC = getCombinClass(uv);
            if (!curCC)
                break;
            p += retlen;
            cc_pos++;
            if (stk_cc_max <= cc_pos) { /* extend if need */
                /* cc_pos == stk_cc_max here, so doubling always makes
                 * room and avoids one Renew per character */
                stk_cc_max *= 2;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[cc_pos].cc  = curCC;
            stk_cc[cc_pos].uv  = uv;
            stk_cc[cc_pos].pos = cc_pos;
        }

        /* only one c.c. starting at cc_in, no need of reordering */
        if (!cc_pos)
            continue;

        qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);

        /* write the sorted run back over the bytes it came from */
        p = cc_in;
        for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
            p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
        }
    }
    Safefree(stk_cc);
    RETVAL = src;
  OUTPUT:
    RETVAL
255
256
257
# Compose ARG and push the result, a new mortal UTF-8 string, on the
# Perl stack.  Each starter is combined with the characters that follow
# it whenever getComposite() yields a composite that getExclusion() does
# not reject and the combination is not blocked; characters that could
# not be combined are emitted after the (possibly composed) starter.
void
compose(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src, *dst, *tmp;
    U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv = 0, uvS = 0, uvComp;
    STRLEN srclen, dstlen, tmplen, dstcur, retlen;
    bool beginning = TRUE;
  PPCODE:
    /* work on UTF-8; upgrade a mortal copy so the caller's SV is untouched */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }
    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1; /* equal or shorter, XXX */
    dst = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e; ) {
        if (beginning) {
            uvS = utf8n_to_uvchr(p, e - p, &retlen, 0);
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
            p += retlen;
            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = getComposite(uvS, uv);

                /* S + C + S => S-S + C would be also blocked. */
                if (uvComp && ! getExclusion(uvComp) && preCC <= curCC) {
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* composed char */
        tmplen = t - tmp_start;
        if (tmplen != 0) { /* uncomposed combining char */
            t = tmp_start;
            while (tmplen--) *d++ = *t++;
        }
        /* the character that ended the inner loop starts the next round */
        uvS = uv;
    } /* for */

    /* keep the usual trailing NUL; newSV(dstlen) reserved dstlen + 1 bytes */
    *d = '\0';
    dstcur = d - (U8*)SvPVX(dst);
    SvCUR_set(dst, dstcur);
    XPUSHs(dst);
339
340
341
# Perl-visible wrappers around the C helpers of the same names.  Each one
# declares its prototype explicitly, like the other XSUBs in this file.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

bool
getExclusion(uv)
    UV uv
  PROTOTYPE: $

UV
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
354
# getCanon(uv) returns the canonical decomposition of one code point and
# getCompat(uv), its alias with ix == 1, the compatibility decomposition.
# The result is a new UTF-8 flagged string; undef is returned when the
# selected table has no entry.  Hangul syllables are decomposed
# arithmetically for both names.
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * decomp;
  CODE:
    if (! Hangul_IsS(uv)) {
        /* table lookup; ix tells which alias was called */
        if (ix)
            decomp = dec_compat(uv);
        else
            decomp = dec_canonical(uv);

        if (decomp == NULL)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)decomp, strlen((char *)decomp));
    } else {
        SV * hangul;
        hangul = newSV(1);
        (void)SvPOK_only(hangul);
        sv_cat_decompHangul(hangul, uv);
        RETVAL = hangul;
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
378