Commit | Line | Data |
ac5ea531 |
1 | |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
5 | |
6 | /* These 5 files are prepared by mkheader */ |
7 | #include "unfcmb.h" |
8 | #include "unfcan.h" |
9 | #include "unfcpt.h" |
10 | #include "unfcmp.h" |
11 | #include "unfexc.h" |
12 | |
/*
 * Compatibility shims (for Perl 5.6.1?): when the newer UTF-8 helper names
 * are not available as macros, map them onto uv_to_utf8 / utf8_to_uv.
 */
#if !defined(uvuni_to_utf8)
#  define uvuni_to_utf8 uv_to_utf8
#endif /* uvuni_to_utf8 */

#if !defined(utf8n_to_uvchr)
#  define utf8n_to_uvchr utf8_to_uv
#endif /* utf8n_to_uvchr */
22 | |
/*
 * Highest code point handled by the lookup helpers in this file.  Values
 * above it are currently left unaffected, without complaint (the original
 * author marked this as an open question).
 */
#define VALID_UTF_MAX (0x10ffff)
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)
26 | |
/* HANGUL_H */

/* Precomposed syllables: [SBase, SFinal]. */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount (Hangul_LCount * Hangul_NCount)   /* 11172 */

/* Number of syllables sharing one leading consonant. */
#define Hangul_NCount (Hangul_VCount * Hangul_TCount)   /* 588 */

/* Leading consonants: [LBase, LFinal]. */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

/* Vowels: [VBase, VFinal]. */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

/* Trailing consonants: (TBase, TFinal]; index 0 stands for "no trailing". */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* Range predicates.  Note that each one evaluates its argument twice. */
#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  ((((u) - Hangul_SBase) % Hangul_TCount) == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  (((u) > Hangul_TBase) && ((u) <= Hangul_TFinal))
/* HANGUL_H */
53 | |
54 | /* this is used for canonical ordering of combining characters (c.c.). */ |
55 | typedef struct { |
56 | U8 cc; /* combining class */ |
57 | UV uv; /* codepoint */ |
58 | STRLEN pos; /* position */ |
59 | } UNF_cc; |
60 | |
61 | int compare_cc(const void *a, const void *b) |
62 | { |
63 | int ret_cc; |
64 | ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc; |
65 | if(ret_cc) return ret_cc; |
66 | return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos; |
67 | } |
68 | |
69 | U8* dec_canonical (UV uv) |
70 | { |
71 | U8 ***plane, **row; |
72 | if(OVER_UTF_MAX(uv)) return NULL; |
73 | plane = (U8***)UNF_canon[uv >> 16]; |
74 | if(! plane) return NULL; |
75 | row = plane[(uv >> 8) & 0xff]; |
76 | return row ? row[uv & 0xff] : NULL; |
77 | } |
78 | |
79 | U8* dec_compat (UV uv) |
80 | { |
81 | U8 ***plane, **row; |
82 | if(OVER_UTF_MAX(uv)) return NULL; |
83 | plane = (U8***)UNF_compat[uv >> 16]; |
84 | if(! plane) return NULL; |
85 | row = plane[(uv >> 8) & 0xff]; |
86 | return row ? row[uv & 0xff] : NULL; |
87 | } |
88 | |
89 | UV getComposite (UV uv, UV uv2) |
90 | { |
91 | UNF_complist ***plane, **row, *cell, *i; |
92 | |
93 | if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0; |
94 | |
95 | if(Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
96 | uv -= Hangul_LBase; /* lindex */ |
97 | uv2 -= Hangul_VBase; /* vindex */ |
98 | return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount); |
99 | } |
100 | if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
101 | uv2 -= Hangul_TBase; /* tindex */ |
102 | return (uv + uv2); |
103 | } |
104 | plane = UNF_compos[uv >> 16]; |
105 | if(! plane) return 0; |
106 | row = plane[(uv >> 8) & 0xff]; |
107 | if(! row) return 0; |
108 | cell = row[uv & 0xff]; |
109 | if(! cell) return 0; |
110 | for(i = cell; i->nextchar; i++) { |
111 | if(uv2 == i->nextchar) return i->composite; |
112 | } |
113 | return 0; |
114 | } |
115 | |
116 | U8 getCombinClass (UV uv) |
117 | { |
118 | U8 **plane, *row; |
119 | if(OVER_UTF_MAX(uv)) return 0; |
120 | plane = (U8**)UNF_combin[uv >> 16]; |
121 | if(! plane) return 0; |
122 | row = plane[(uv >> 8) & 0xff]; |
123 | return row ? row[uv & 0xff] : 0; |
124 | } |
125 | |
126 | void sv_cat_decompHangul (SV* sv, UV uv) |
127 | { |
128 | UV sindex, lindex, vindex, tindex; |
129 | U8 *t, temp[3 * UTF8_MAXLEN + 1]; |
130 | |
131 | if(! Hangul_IsS(uv)) return; |
132 | |
133 | sindex = uv - Hangul_SBase; |
134 | lindex = sindex / Hangul_NCount; |
135 | vindex = (sindex % Hangul_NCount) / Hangul_TCount; |
136 | tindex = sindex % Hangul_TCount; |
137 | |
138 | t = temp; |
139 | t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); |
140 | t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); |
141 | if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); |
142 | *t = '\0'; |
143 | sv_catpvn(sv, (char *)temp, strlen((char *)temp)); |
144 | } |
145 | |
146 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
147 | |
148 | |
149 | SV* |
150 | decompose(arg, compat) |
151 | SV * arg |
152 | SV * compat |
153 | PROTOTYPE: $ |
154 | PREINIT: |
155 | SV *src, *dst; |
156 | STRLEN srclen, dstlen, retlen; |
157 | U8 *s, *e, *p, *d, *r; |
158 | UV uv; |
159 | bool iscompat; |
160 | CODE: |
161 | if(SvUTF8(arg)) { |
162 | src = arg; |
163 | } else { |
164 | src = sv_mortalcopy(arg); |
165 | sv_utf8_upgrade(src); |
166 | } |
167 | |
168 | iscompat = SvTRUE(compat); |
169 | |
170 | dst = newSV(1); |
171 | (void)SvPOK_only(dst); |
172 | SvUTF8_on(dst); |
173 | |
174 | s = (U8*)SvPV(src,srclen); |
175 | e = s + srclen; |
176 | for(p = s; p < e;){ |
177 | uv = utf8n_to_uvchr(p, e - p, &retlen, 0); |
178 | p += retlen; |
179 | if(Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv); |
180 | else { |
181 | r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
182 | if(r) sv_catpv(dst, (char *)r); |
183 | else sv_catpvn(dst, (char *)p - retlen, retlen); |
184 | } |
185 | } |
186 | RETVAL = dst; |
187 | OUTPUT: |
188 | RETVAL |
189 | |
190 | |
191 | |
192 | SV* |
193 | reorder(arg) |
194 | SV * arg |
195 | PROTOTYPE: $ |
196 | PREINIT: |
197 | SV *src; |
198 | STRLEN srclen, retlen, stk_cc_max; |
199 | U8 *s, *e, *p, curCC; |
200 | UV uv; |
201 | UNF_cc * stk_cc; |
202 | CODE: |
203 | src = newSVsv(arg); |
204 | if(! SvUTF8(arg)) sv_utf8_upgrade(src); |
205 | |
206 | stk_cc_max = 10; /* enough as an initial value? */ |
207 | New(0, stk_cc, stk_cc_max, UNF_cc); |
208 | |
209 | s = (U8*)SvPV(src,srclen); |
210 | e = s + srclen; |
211 | for(p = s; p < e;){ |
212 | U8 *cc_in; |
213 | STRLEN cc_len, cc_iter, cc_pos; |
214 | |
215 | uv = utf8n_to_uvchr(p, e - p, &retlen, 0); |
216 | p += retlen; |
217 | cc_pos = 0; |
218 | curCC = getCombinClass(uv); |
219 | if(! (curCC && p < e)) continue; else cc_in = p - retlen; |
220 | |
221 | stk_cc[cc_pos].cc = curCC; |
222 | stk_cc[cc_pos].uv = uv; |
223 | stk_cc[cc_pos].pos = cc_pos; |
224 | |
225 | while(p < e) { |
226 | uv = utf8n_to_uvchr(p, e - p, &retlen, 0); |
227 | curCC = getCombinClass(uv); |
228 | if(!curCC) break; |
229 | p += retlen; |
230 | cc_pos++; |
231 | if(stk_cc_max <= cc_pos) { /* extend if need */ |
232 | stk_cc_max = cc_pos + 1; |
233 | Renew(stk_cc, stk_cc_max, UNF_cc); |
234 | } |
235 | stk_cc[cc_pos].cc = curCC; |
236 | stk_cc[cc_pos].uv = uv; |
237 | stk_cc[cc_pos].pos = cc_pos; |
238 | } |
239 | |
240 | /* only one c.c. in cc_len from cc_in, no need of reordering */ |
241 | if(!cc_pos) continue; |
242 | |
243 | qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc); |
244 | |
245 | cc_len = p - cc_in; |
246 | p = cc_in; |
247 | for(cc_iter = 0; cc_iter <= cc_pos; cc_iter++) { |
248 | p = uvuni_to_utf8(p, stk_cc[cc_iter].uv); |
249 | } |
250 | } |
251 | Safefree(stk_cc); |
252 | RETVAL = src; |
253 | OUTPUT: |
254 | RETVAL |
255 | |
256 | |
257 | |
258 | void |
259 | compose(arg) |
260 | SV * arg |
261 | PROTOTYPE: $ |
262 | PREINIT: |
263 | SV *src, *dst, *tmp; |
264 | U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; |
265 | UV uv, uvS, uvComp; |
266 | STRLEN srclen, dstlen, tmplen, dstcur, retlen; |
267 | bool beginning = TRUE; |
268 | PPCODE: |
269 | if(SvUTF8(arg)) { |
270 | src = arg; |
271 | } else { |
272 | src = sv_mortalcopy(arg); |
273 | sv_utf8_upgrade(src); |
274 | } |
275 | s = (U8*)SvPV(src, srclen); |
276 | e = s + srclen; |
277 | dstlen = srclen + 1; /* equal or shorter, XXX */ |
278 | dst = sv_2mortal(newSV(dstlen)); |
279 | (void)SvPOK_only(dst); |
280 | SvUTF8_on(dst); |
281 | d = (U8*)SvPVX(dst); |
282 | |
283 | /* for uncomposed combining char */ |
284 | tmp = sv_2mortal(newSV(dstlen)); |
285 | (void)SvPOK_only(tmp); |
286 | SvUTF8_on(tmp); |
287 | |
288 | for(p = s; p < e;){ |
289 | if(beginning) { |
290 | uvS = utf8n_to_uvchr(p, e - p, &retlen, 0); |
291 | p += retlen; |
292 | |
293 | if (getCombinClass(uvS)){ /* no Starter found yet */ |
294 | d = uvuni_to_utf8(d, uvS); |
295 | continue; |
296 | } |
297 | beginning = FALSE; |
298 | } |
299 | |
300 | /* Starter */ |
301 | t = tmp_start = (U8*)SvPVX(tmp); |
302 | preCC = 0; |
303 | |
304 | /* to the next Starter */ |
305 | while(p < e) { |
306 | uv = utf8n_to_uvchr(p, e - p, &retlen, 0); |
307 | p += retlen; |
308 | curCC = getCombinClass(uv); |
309 | |
310 | if(preCC && preCC == curCC) { |
311 | preCC = curCC; |
312 | t = uvuni_to_utf8(t, uv); |
313 | } else { |
314 | uvComp = getComposite(uvS, uv); |
315 | |
316 | /* S + C + S => S-S + C would be also blocked. */ |
317 | if( uvComp && ! getExclusion(uvComp) && preCC <= curCC) |
318 | { |
319 | /* preCC not changed to curCC */ |
320 | uvS = uvComp; |
321 | } else if (! curCC && p < e) { /* blocked */ |
322 | break; |
323 | } else { |
324 | preCC = curCC; |
325 | t = uvuni_to_utf8(t, uv); |
326 | } |
327 | } |
328 | } |
329 | d = uvuni_to_utf8(d, uvS); /* composed char */ |
330 | if(tmplen = t - tmp_start) { /* uncomposed combining char */ |
331 | t = (U8*)SvPVX(tmp); |
332 | while(tmplen--) *d++ = *t++; |
333 | } |
334 | uvS = uv; |
335 | } /* for */ |
336 | dstcur = d - (U8*)SvPVX(dst); |
337 | SvCUR_set(dst, dstcur); |
338 | XPUSHs(dst); |
339 | |
340 | |
341 | |
# Perl-callable entry for the C function getCombinClass(); there is no
# CODE section, so the generated glue calls the C function of the same name.
U8
getCombinClass(uv)
    UV uv
345 | |
# Perl-callable entry for getExclusion(); there is no CODE section.  The C
# function is not defined in the visible part of this file (presumably it
# comes from one of the generated headers -- to be confirmed).
bool
getExclusion(uv)
    UV uv
349 | |
# Perl-callable entry for the C function getComposite(); there is no CODE
# section, so the generated glue calls the C function of the same name.
UV
getComposite(uv, uv2)
    UV uv
    UV uv2
354 | |
355 | SV* |
356 | getCanon(uv) |
357 | UV uv |
358 | PROTOTYPE: $ |
359 | ALIAS: |
360 | getCompat = 1 |
361 | PREINIT: |
362 | U8 * rstr; |
363 | CODE: |
364 | if(Hangul_IsS(uv)) { |
365 | SV * dst; |
366 | dst = newSV(1); |
367 | (void)SvPOK_only(dst); |
368 | sv_cat_decompHangul(dst, uv); |
369 | RETVAL = dst; |
370 | } else { |
371 | rstr = ix ? dec_compat(uv) : dec_canonical(uv); |
372 | if(!rstr) XSRETURN_UNDEF; |
373 | RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); |
374 | } |
375 | SvUTF8_on(RETVAL); |
376 | OUTPUT: |
377 | RETVAL |
378 | |