Commit | Line | Data |
ac5ea531 |
1 | |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
5 | |
6 | /* These 5 files are prepared by mkheader */ |
7 | #include "unfcmb.h" |
8 | #include "unfcan.h" |
9 | #include "unfcpt.h" |
10 | #include "unfcmp.h" |
11 | #include "unfexc.h" |
12 | |
/*
 * Compatibility aliases (the original note says "Perl 5.6.1 ?"): when the
 * newer names are not available as macros, fall back to the older
 * uv_to_utf8 / utf8_to_uv names.
 */
#if !defined(uvuni_to_utf8)
#  define uvuni_to_utf8 uv_to_utf8
#endif /* uvuni_to_utf8 */

#if !defined(utf8n_to_uvchr)
#  define utf8n_to_uvchr utf8_to_uv
#endif /* utf8n_to_uvchr */
22 | |
/*
 * Upper bound of the code points covered by the lookup helpers in this
 * file.  Anything above it is rejected by those helpers (they return
 * NULL or 0) without any complaint.
 */
#define VALID_UTF_MAX (0x10ffff)
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)
26 | |
/* HANGUL_H */
/*
 * Constants and range predicates for Hangul syllables and jamo.
 * The *Final and *Count values are derived from the bases so that the
 * relationships between them are visible; the resulting numbers are
 * noted in the trailing comments.
 */

/* Leading consonants (L). */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount (Hangul_LFinal - Hangul_LBase + 1)   /* 19 */

/* Vowels (V). */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount (Hangul_VFinal - Hangul_VBase + 1)   /* 21 */

/* Trailing consonants (T); TBase itself is not a trailing consonant. */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount (Hangul_TFinal - Hangul_TBase + 1)   /* 28 */

/* Number of syllables sharing one leading consonant. */
#define Hangul_NCount (Hangul_VCount * Hangul_TCount)      /* 588 */

/* Precomposed syllables (S). */
#define Hangul_SBase  0xAC00
#define Hangul_SCount (Hangul_LCount * Hangul_NCount)      /* 11172 */
#define Hangul_SFinal (Hangul_SBase + Hangul_SCount - 1)   /* 0xD7A3 */

#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
/* Only meaningful for values already known to satisfy Hangul_IsS. */
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  (((u) > Hangul_TBase) && ((u) <= Hangul_TFinal))
/* HANGUL_H */
53 | |
54 | /* this is used for canonical ordering of combining characters (c.c.). */ |
55 | typedef struct { |
56 | U8 cc; /* combining class */ |
57 | UV uv; /* codepoint */ |
58 | STRLEN pos; /* position */ |
59 | } UNF_cc; |
60 | |
61 | int compare_cc(const void *a, const void *b) |
62 | { |
63 | int ret_cc; |
64 | ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc; |
65 | if(ret_cc) return ret_cc; |
66 | return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos; |
67 | } |
68 | |
69 | U8* dec_canonical (UV uv) |
70 | { |
71 | U8 ***plane, **row; |
72 | if(OVER_UTF_MAX(uv)) return NULL; |
73 | plane = (U8***)UNF_canon[uv >> 16]; |
74 | if(! plane) return NULL; |
75 | row = plane[(uv >> 8) & 0xff]; |
76 | return row ? row[uv & 0xff] : NULL; |
77 | } |
78 | |
79 | U8* dec_compat (UV uv) |
80 | { |
81 | U8 ***plane, **row; |
82 | if(OVER_UTF_MAX(uv)) return NULL; |
83 | plane = (U8***)UNF_compat[uv >> 16]; |
84 | if(! plane) return NULL; |
85 | row = plane[(uv >> 8) & 0xff]; |
86 | return row ? row[uv & 0xff] : NULL; |
87 | } |
88 | |
2a204b45 |
89 | UV composite_uv (UV uv, UV uv2) |
ac5ea531 |
90 | { |
91 | UNF_complist ***plane, **row, *cell, *i; |
92 | |
93 | if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0; |
94 | |
95 | if(Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
96 | uv -= Hangul_LBase; /* lindex */ |
97 | uv2 -= Hangul_VBase; /* vindex */ |
98 | return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount); |
99 | } |
100 | if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
101 | uv2 -= Hangul_TBase; /* tindex */ |
2a204b45 |
102 | return(uv + uv2); |
ac5ea531 |
103 | } |
104 | plane = UNF_compos[uv >> 16]; |
105 | if(! plane) return 0; |
106 | row = plane[(uv >> 8) & 0xff]; |
2a204b45 |
107 | if(! row) return 0; |
ac5ea531 |
108 | cell = row[uv & 0xff]; |
2a204b45 |
109 | if(! cell) return 0; |
ac5ea531 |
110 | for(i = cell; i->nextchar; i++) { |
111 | if(uv2 == i->nextchar) return i->composite; |
112 | } |
113 | return 0; |
114 | } |
115 | |
116 | U8 getCombinClass (UV uv) |
117 | { |
118 | U8 **plane, *row; |
119 | if(OVER_UTF_MAX(uv)) return 0; |
120 | plane = (U8**)UNF_combin[uv >> 16]; |
121 | if(! plane) return 0; |
122 | row = plane[(uv >> 8) & 0xff]; |
123 | return row ? row[uv & 0xff] : 0; |
124 | } |
125 | |
126 | void sv_cat_decompHangul (SV* sv, UV uv) |
127 | { |
128 | UV sindex, lindex, vindex, tindex; |
2a204b45 |
129 | U8 *t, tmp[3 * UTF8_MAXLEN + 1]; |
ac5ea531 |
130 | |
131 | if(! Hangul_IsS(uv)) return; |
132 | |
133 | sindex = uv - Hangul_SBase; |
134 | lindex = sindex / Hangul_NCount; |
135 | vindex = (sindex % Hangul_NCount) / Hangul_TCount; |
136 | tindex = sindex % Hangul_TCount; |
137 | |
2a204b45 |
138 | t = tmp; |
ac5ea531 |
139 | t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); |
140 | t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); |
141 | if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); |
142 | *t = '\0'; |
2a204b45 |
143 | sv_catpvn(sv, (char *)tmp, strlen((char *)tmp)); |
ac5ea531 |
144 | } |
145 | |
146 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
147 | |
ac5ea531 |
SV*
decompose(arg, compat = &PL_sv_no)
    SV * arg
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    UV uv;
    SV *src, *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    bool iscompat;
  CODE:
    /*
     * Return a new UTF-8 string in which every character of arg is
     * replaced by its decomposition: Hangul syllables are decomposed
     * arithmetically, other characters through dec_compat() when compat
     * is true and through dec_canonical() otherwise.  Characters without
     * a decomposition are copied unchanged.
     *
     * compat is optional and defaults to false; the prototype now
     * matches the parameter list (it used to be "$" although two
     * arguments were required).
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        /* work on an upgraded copy so that the caller's SV is untouched */
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }
    iscompat = SvTRUE(compat);

    /*
     * Start from a real empty string: newSV() alone leaves the buffer
     * unterminated, which would be returned as-is for empty input.
     */
    dst = newSVpvn("", 0);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    for (p = s; p < e; ) {
        U8 *start = p;   /* first byte of the current character */

        uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
        p += retlen;

        if (Hangul_IsS(uv)) {
            sv_cat_decompHangul(dst, uv);
            continue;
        }

        r = iscompat ? dec_compat(uv) : dec_canonical(uv);
        if (r)
            sv_catpv(dst, (char *)r);
        else
            sv_catpvn(dst, (char *)start, retlen);
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
187 | |
188 | |
189 | |
SV*
reorder(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src;
    STRLEN srclen, retlen, stk_cc_max;
    U8 *s, *e, *p, curCC;
    UV uv;
    UNF_cc * stk_cc;
  CODE:
    /*
     * Return a copy of arg (upgraded to UTF-8 if necessary) in which
     * every run of characters with a non-zero combining class has been
     * sorted by combining class; characters of equal class keep their
     * relative order.  The copy is rewritten in place.
     */
    src = newSVsv(arg);
    if (! SvUTF8(arg)) sv_utf8_upgrade(src);

    stk_cc_max = 10; /* initial capacity; grown on demand below */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    /*
     * The string buffer is modified directly below, so the copy must
     * own a private, writable buffer: use SvPV_force, not SvPV.
     */
    s = (U8*)SvPV_force(src, srclen);
    e = s + srclen;

    for (p = s; p < e; ) {
        U8 *cc_in;               /* start of the current run */
        STRLEN cc_iter, cc_pos;  /* cc_pos: index of the last stacked item */

        uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);
        p += retlen;

        /* a run needs a combining character followed by something */
        if (! curCC || p >= e)
            continue;
        cc_in = p - retlen;

        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;

        /* collect the following combining characters of the run */
        while (p < e) {
            uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
            curCC = getCombinClass(uv);
            if (! curCC)
                break;
            p += retlen;
            cc_pos++;
            if (stk_cc_max <= cc_pos) {
                /* double the capacity instead of growing by one item */
                stk_cc_max *= 2;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[cc_pos].cc  = curCC;
            stk_cc[cc_pos].uv  = uv;
            stk_cc[cc_pos].pos = cc_pos;
        }

        /* a single combining character needs no reordering */
        if (! cc_pos)
            continue;

        qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);

        /* write the sorted run back over the original bytes */
        p = cc_in;
        for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
            p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
        }
    }
    Safefree(stk_cc);
    RETVAL = src;
  OUTPUT:
    RETVAL
255 | |
256 | |
257 | |
2a204b45 |
SV*
compose(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src, *dst, *tmp;
    U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv = 0, uvS = 0, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    /*
     * Return a new UTF-8 string in which each starter of arg is combined
     * with the following characters wherever composite_uv() yields a
     * composite that is not excluded by isExclusion() and the character
     * is not blocked.  Characters that could not be composed are kept,
     * in order, after their starter.
     *
     * uv is initialised because "uvS = uv" at the end of the outer loop
     * is reached even when the inner loop never ran (for example when
     * the input ends right after a starter).
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        /* work on an upgraded copy so that the caller's SV is untouched */
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1; /* equal or shorter, XXX */
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    /* scratch buffer for uncomposed combining characters */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e; ) {
        if (beginning) {
            uvS = utf8n_to_uvchr(p, e - p, &retlen, 0);
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
            p += retlen;
            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                /* same class as the previous uncomposed character */
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);

                /* S + C + S => S-S + C would be also blocked. */
                if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }

        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        if ((tmplen = t - tmp_start)) { /* uncomposed combining char */
            t = tmp_start;
            while (tmplen--)
                *d++ = *t++;
        }
        uvS = uv; /* after a break, uv is the next starter */
    } /* for */

    /* Perl string buffers are expected to end with a NUL byte. */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 |
343 | |
344 | |
345 | |
# Perl-callable binding for the C helper getCombinClass() defined earlier
# in this file: takes one code point and returns its U8 result.
U8
getCombinClass(uv)
    UV uv
349 | |
# Perl-callable binding for the C function isExclusion(), which is not
# defined in this file (presumably it comes from one of the headers
# prepared by mkheader): takes one code point and returns a boolean.
bool
isExclusion(uv)
    UV uv
353 | |
2a204b45 |
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV comp;
  CODE:
    /*
     * Return the composite of uv and uv2 as an unsigned integer, or
     * undef when composite_uv() reports that the pair does not compose.
     *
     * undef is returned through XSRETURN_UNDEF, as getCanon does,
     * instead of assigning &PL_sv_undef to RETVAL and sending the
     * shared undef value through the SV* output typemap.
     */
    comp = composite_uv(uv, uv2);
    if (! comp)
        XSRETURN_UNDEF;
    RETVAL = newSVuv(comp);
  OUTPUT:
    RETVAL
ac5ea531 |
366 | |
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /*
     * Return the decomposition of the single code point uv as a UTF-8
     * string, or undef when there is none.  Called as getCanon (ix == 0)
     * it consults dec_canonical(); called as getCompat (ix == 1) it
     * consults dec_compat().  Hangul syllables are decomposed
     * arithmetically under either name.
     */
    if (! Hangul_IsS(uv)) {
        if (ix)
            rstr = dec_compat(uv);
        else
            rstr = dec_canonical(uv);

        if (rstr == NULL)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    } else {
        SV * syllable;

        syllable = newSV(1);
        (void)SvPOK_only(syllable);
        sv_cat_decompHangul(syllable, uv);
        RETVAL = syllable;
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
390 | |