Commit | Line | Data |
ac5ea531 |
1 | |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
5 | |
6 | /* These 5 files are prepared by mkheader */ |
7 | #include "unfcmb.h" |
8 | #include "unfcan.h" |
9 | #include "unfcpt.h" |
10 | #include "unfcmp.h" |
11 | #include "unfexc.h" |
12 | |
/*
 * Compatibility shims: when the newer names are not available as macros
 * (the original note guesses this is the Perl 5.6.1 situation), map them
 * onto the older uv_to_utf8() / utf8_to_uv() names.
 */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8 uv_to_utf8
#endif /* uvuni_to_utf8 */

#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni utf8_to_uv
#endif /* utf8n_to_uvuni */

/*
 * Flag set handed to utf8n_to_uvuni() by every decoding loop in this file.
 * UTF8_ALLOW_BOM is used before Perl 5.8.0, so it is only OR-ed in when
 * the macro exists.
 */
#ifndef UTF8_ALLOW_BOM
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#else
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
#endif

/* croak() message for the case where utf8n_to_uvuni() sets retlen to 0 */
#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"

/* croak() message for utf8_hop() hopping back before the start of the
 * buffer (maybe broken UTF-8) */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
35 | |
ac5ea531 |
/*
 * Largest code point the lookup tables are consulted for.  Callers in this
 * file treat anything above it as "no data" (NULL / 0) rather than an error.
 */
#define VALID_UTF_MAX (0x10ffff)

/* True when uv lies above VALID_UTF_MAX. */
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)
39 | |
/* ---- Hangul syllable arithmetic: constants ---- */

/* precomposed syllables: [SBase, SFinal], SCount code points */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount 11172

/* syllables per leading consonant (VCount * TCount) */
#define Hangul_NCount 588

/* leading consonants (L): [LBase, LFinal] */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

/* vowels (V): [VBase, VFinal] */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

/* trailing consonants (T): (TBase, TFinal]; index 0 means "no trailing
 * consonant", so TBase itself is excluded by Hangul_IsT() */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* ---- Hangul syllable arithmetic: predicates (each evaluates u twice) ---- */

#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
/* offset from SBase is a multiple of TCount, i.e. trailing index is 0;
 * only meaningful together with Hangul_IsS() */
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  (((u) > Hangul_TBase) && ((u) <= Hangul_TFinal))
66 | |
67 | /* this is used for canonical ordering of combining characters (c.c.). */ |
68 | typedef struct { |
69 | U8 cc; /* combining class */ |
70 | UV uv; /* codepoint */ |
71 | STRLEN pos; /* position */ |
72 | } UNF_cc; |
73 | |
e524f5b2 |
74 | static int compare_cc (const void *a, const void *b) |
ac5ea531 |
75 | { |
76 | int ret_cc; |
6c941e0c |
77 | ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
8f118dcd |
78 | if (ret_cc) |
79 | return ret_cc; |
6c941e0c |
80 | |
81 | return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) |
82 | - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); |
ac5ea531 |
83 | } |
84 | |
e524f5b2 |
85 | static U8* dec_canonical (UV uv) |
ac5ea531 |
86 | { |
87 | U8 ***plane, **row; |
8f118dcd |
88 | if (OVER_UTF_MAX(uv)) |
89 | return NULL; |
ac5ea531 |
90 | plane = (U8***)UNF_canon[uv >> 16]; |
8f118dcd |
91 | if (! plane) |
92 | return NULL; |
ac5ea531 |
93 | row = plane[(uv >> 8) & 0xff]; |
94 | return row ? row[uv & 0xff] : NULL; |
95 | } |
96 | |
e524f5b2 |
97 | static U8* dec_compat (UV uv) |
ac5ea531 |
98 | { |
99 | U8 ***plane, **row; |
8f118dcd |
100 | if (OVER_UTF_MAX(uv)) |
101 | return NULL; |
ac5ea531 |
102 | plane = (U8***)UNF_compat[uv >> 16]; |
8f118dcd |
103 | if (! plane) |
104 | return NULL; |
ac5ea531 |
105 | row = plane[(uv >> 8) & 0xff]; |
106 | return row ? row[uv & 0xff] : NULL; |
107 | } |
108 | |
e524f5b2 |
109 | static UV composite_uv (UV uv, UV uv2) |
ac5ea531 |
110 | { |
111 | UNF_complist ***plane, **row, *cell, *i; |
112 | |
8f118dcd |
113 | if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) |
114 | return 0; |
ac5ea531 |
115 | |
8f118dcd |
116 | if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
ac5ea531 |
117 | uv -= Hangul_LBase; /* lindex */ |
118 | uv2 -= Hangul_VBase; /* vindex */ |
119 | return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount); |
120 | } |
8f118dcd |
121 | if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
ac5ea531 |
122 | uv2 -= Hangul_TBase; /* tindex */ |
2a204b45 |
123 | return(uv + uv2); |
ac5ea531 |
124 | } |
125 | plane = UNF_compos[uv >> 16]; |
8f118dcd |
126 | if (! plane) |
127 | return 0; |
ac5ea531 |
128 | row = plane[(uv >> 8) & 0xff]; |
8f118dcd |
129 | if (! row) |
130 | return 0; |
ac5ea531 |
131 | cell = row[uv & 0xff]; |
8f118dcd |
132 | if (! cell) |
133 | return 0; |
134 | for (i = cell; i->nextchar; i++) { |
135 | if (uv2 == i->nextchar) |
136 | return i->composite; |
ac5ea531 |
137 | } |
138 | return 0; |
139 | } |
140 | |
e524f5b2 |
141 | static U8 getCombinClass (UV uv) |
ac5ea531 |
142 | { |
143 | U8 **plane, *row; |
8f118dcd |
144 | if (OVER_UTF_MAX(uv)) |
145 | return 0; |
ac5ea531 |
146 | plane = (U8**)UNF_combin[uv >> 16]; |
8f118dcd |
147 | if (! plane) |
148 | return 0; |
ac5ea531 |
149 | row = plane[(uv >> 8) & 0xff]; |
150 | return row ? row[uv & 0xff] : 0; |
151 | } |
152 | |
e524f5b2 |
153 | static void sv_cat_decompHangul (SV* sv, UV uv) |
ac5ea531 |
154 | { |
155 | UV sindex, lindex, vindex, tindex; |
2a204b45 |
156 | U8 *t, tmp[3 * UTF8_MAXLEN + 1]; |
ac5ea531 |
157 | |
8f118dcd |
158 | if (! Hangul_IsS(uv)) |
159 | return; |
ac5ea531 |
160 | |
161 | sindex = uv - Hangul_SBase; |
162 | lindex = sindex / Hangul_NCount; |
163 | vindex = (sindex % Hangul_NCount) / Hangul_TCount; |
164 | tindex = sindex % Hangul_TCount; |
165 | |
2a204b45 |
166 | t = tmp; |
ac5ea531 |
167 | t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); |
168 | t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); |
8f118dcd |
169 | if (tindex) |
170 | t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); |
ac5ea531 |
171 | *t = '\0'; |
e524f5b2 |
172 | sv_catpvn(sv, (char *)tmp, t - tmp); |
173 | } |
174 | |
175 | static void sv_cat_uvuni (SV* sv, UV uv) |
176 | { |
177 | U8 *t, tmp[UTF8_MAXLEN + 1]; |
178 | |
179 | t = tmp; |
180 | t = uvuni_to_utf8(t, uv); |
181 | *t = '\0'; |
182 | sv_catpvn(sv, (char *)tmp, t - tmp); |
ac5ea531 |
183 | } |
184 | |
185 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
186 | |
ac5ea531 |
187 | SV* |
d85850a7 |
188 | decompose(arg, compat = &PL_sv_no) |
ac5ea531 |
189 | SV * arg |
190 | SV * compat |
d85850a7 |
191 | PROTOTYPE: $;$ |
ac5ea531 |
192 | PREINIT: |
2a204b45 |
193 | UV uv; |
ac5ea531 |
194 | SV *src, *dst; |
73263a9c |
195 | STRLEN srclen, retlen; |
196 | U8 *s, *e, *p, *r; |
ac5ea531 |
197 | bool iscompat; |
198 | CODE: |
8f118dcd |
199 | if (SvUTF8(arg)) { |
ac5ea531 |
200 | src = arg; |
201 | } else { |
202 | src = sv_mortalcopy(arg); |
203 | sv_utf8_upgrade(src); |
204 | } |
ac5ea531 |
205 | iscompat = SvTRUE(compat); |
206 | |
207 | dst = newSV(1); |
208 | (void)SvPOK_only(dst); |
209 | SvUTF8_on(dst); |
210 | |
211 | s = (U8*)SvPV(src,srclen); |
212 | e = s + srclen; |
82e740b6 |
213 | for (p = s; p < e; p += retlen) { |
e524f5b2 |
214 | uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
215 | if (!retlen) |
216 | croak(ErrRetlenIsZero); |
217 | |
8f118dcd |
218 | if (Hangul_IsS(uv)) |
219 | sv_cat_decompHangul(dst, uv); |
ac5ea531 |
220 | else { |
221 | r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
8f118dcd |
222 | if (r) |
223 | sv_catpv(dst, (char *)r); |
224 | else |
e524f5b2 |
225 | sv_cat_uvuni(dst, uv); |
ac5ea531 |
226 | } |
227 | } |
228 | RETVAL = dst; |
229 | OUTPUT: |
230 | RETVAL |
231 | |
232 | |
233 | |
234 | SV* |
235 | reorder(arg) |
236 | SV * arg |
237 | PROTOTYPE: $ |
238 | PREINIT: |
8f118dcd |
239 | SV *src, *dst; |
240 | STRLEN srclen, dstlen, retlen, stk_cc_max; |
241 | U8 *s, *e, *p, *d, curCC; |
e524f5b2 |
242 | UV uv, uvlast; |
ac5ea531 |
243 | UNF_cc * stk_cc; |
e524f5b2 |
244 | STRLEN i, cc_pos; |
245 | bool valid_uvlast; |
ac5ea531 |
246 | CODE: |
8f118dcd |
247 | if (SvUTF8(arg)) { |
248 | src = arg; |
249 | } else { |
250 | src = sv_mortalcopy(arg); |
251 | sv_utf8_upgrade(src); |
252 | } |
253 | |
254 | s = (U8*)SvPV(src, srclen); |
e524f5b2 |
255 | e = s + srclen; |
8f118dcd |
256 | dstlen = srclen + 1; |
257 | dst = newSV(dstlen); |
e524f5b2 |
258 | (void)SvPOK_only(dst); |
8f118dcd |
259 | SvUTF8_on(dst); |
e524f5b2 |
260 | d = (U8*)SvPVX(dst); |
ac5ea531 |
261 | |
262 | stk_cc_max = 10; /* enough as an initial value? */ |
263 | New(0, stk_cc, stk_cc_max, UNF_cc); |
264 | |
e524f5b2 |
265 | for (p = s; p < e;) { |
266 | uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
267 | if (!retlen) |
268 | croak(ErrRetlenIsZero); |
2a204b45 |
269 | p += retlen; |
270 | |
82e740b6 |
271 | curCC = getCombinClass(uv); |
e524f5b2 |
272 | if (curCC == 0) { |
273 | d = uvuni_to_utf8(d, uv); |
8f118dcd |
274 | continue; |
e524f5b2 |
275 | } |
ac5ea531 |
276 | |
2a204b45 |
277 | cc_pos = 0; |
ac5ea531 |
278 | stk_cc[cc_pos].cc = curCC; |
279 | stk_cc[cc_pos].uv = uv; |
280 | stk_cc[cc_pos].pos = cc_pos; |
281 | |
e524f5b2 |
282 | valid_uvlast = FALSE; |
8f118dcd |
283 | while (p < e) { |
e524f5b2 |
284 | uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
285 | if (!retlen) |
286 | croak(ErrRetlenIsZero); |
287 | p += retlen; |
288 | |
ac5ea531 |
289 | curCC = getCombinClass(uv); |
e524f5b2 |
290 | if (curCC == 0) { |
291 | uvlast = uv; |
292 | valid_uvlast = TRUE; |
8f118dcd |
293 | break; |
e524f5b2 |
294 | } |
82e740b6 |
295 | |
ac5ea531 |
296 | cc_pos++; |
8f118dcd |
297 | if (stk_cc_max <= cc_pos) { /* extend if need */ |
ac5ea531 |
298 | stk_cc_max = cc_pos + 1; |
299 | Renew(stk_cc, stk_cc_max, UNF_cc); |
300 | } |
301 | stk_cc[cc_pos].cc = curCC; |
302 | stk_cc[cc_pos].uv = uv; |
303 | stk_cc[cc_pos].pos = cc_pos; |
304 | } |
305 | |
e524f5b2 |
306 | /* reordered if there are two c.c.'s */ |
307 | if (cc_pos) { |
308 | qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc); |
309 | } |
ac5ea531 |
310 | |
e524f5b2 |
311 | for (i = 0; i <= cc_pos; i++) { |
312 | d = uvuni_to_utf8(d, stk_cc[i].uv); |
313 | } |
314 | if (valid_uvlast) |
315 | { |
316 | d = uvuni_to_utf8(d, uvlast); |
ac5ea531 |
317 | } |
318 | } |
e524f5b2 |
319 | *d = '\0'; |
320 | SvCUR_set(dst, d - (U8*)SvPVX(dst)); |
ac5ea531 |
321 | Safefree(stk_cc); |
8f118dcd |
322 | RETVAL = dst; |
ac5ea531 |
323 | OUTPUT: |
324 | RETVAL |
325 | |
326 | |
327 | |
2a204b45 |
328 | SV* |
ac5ea531 |
329 | compose(arg) |
330 | SV * arg |
331 | PROTOTYPE: $ |
82e740b6 |
332 | ALIAS: |
333 | composeContiguous = 1 |
ac5ea531 |
334 | PREINIT: |
335 | SV *src, *dst, *tmp; |
336 | U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; |
337 | UV uv, uvS, uvComp; |
2a204b45 |
338 | STRLEN srclen, dstlen, tmplen, retlen; |
ac5ea531 |
339 | bool beginning = TRUE; |
2a204b45 |
340 | CODE: |
8f118dcd |
341 | if (SvUTF8(arg)) { |
ac5ea531 |
342 | src = arg; |
343 | } else { |
344 | src = sv_mortalcopy(arg); |
345 | sv_utf8_upgrade(src); |
346 | } |
2a204b45 |
347 | |
ac5ea531 |
348 | s = (U8*)SvPV(src, srclen); |
349 | e = s + srclen; |
d85850a7 |
350 | dstlen = srclen + 1; |
2a204b45 |
351 | dst = newSV(dstlen); |
ac5ea531 |
352 | (void)SvPOK_only(dst); |
353 | SvUTF8_on(dst); |
354 | d = (U8*)SvPVX(dst); |
355 | |
356 | /* for uncomposed combining char */ |
357 | tmp = sv_2mortal(newSV(dstlen)); |
358 | (void)SvPOK_only(tmp); |
359 | SvUTF8_on(tmp); |
360 | |
8f118dcd |
361 | for (p = s; p < e;) { |
362 | if (beginning) { |
e524f5b2 |
363 | uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
364 | if (!retlen) |
365 | croak(ErrRetlenIsZero); |
ac5ea531 |
366 | p += retlen; |
367 | |
8f118dcd |
368 | if (getCombinClass(uvS)) { /* no Starter found yet */ |
ac5ea531 |
369 | d = uvuni_to_utf8(d, uvS); |
370 | continue; |
371 | } |
372 | beginning = FALSE; |
373 | } |
374 | |
375 | /* Starter */ |
376 | t = tmp_start = (U8*)SvPVX(tmp); |
377 | preCC = 0; |
378 | |
379 | /* to the next Starter */ |
8f118dcd |
380 | while (p < e) { |
e524f5b2 |
381 | uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
382 | if (!retlen) |
383 | croak(ErrRetlenIsZero); |
ac5ea531 |
384 | p += retlen; |
82e740b6 |
385 | |
ac5ea531 |
386 | curCC = getCombinClass(uv); |
387 | |
8f118dcd |
388 | if (preCC && preCC == curCC) { |
ac5ea531 |
389 | preCC = curCC; |
390 | t = uvuni_to_utf8(t, uv); |
391 | } else { |
2a204b45 |
392 | uvComp = composite_uv(uvS, uv); |
ac5ea531 |
393 | |
82e740b6 |
394 | if (uvComp && ! isExclusion(uvComp) && |
395 | (ix ? (t == tmp_start) : (preCC <= curCC))) { |
d85850a7 |
396 | STRLEN leftcur, rightcur, dstcur; |
397 | leftcur = UNISKIP(uvComp); |
398 | rightcur = UNISKIP(uvS) + UNISKIP(uv); |
399 | |
400 | if (leftcur > rightcur) { |
401 | dstcur = d - (U8*)SvPVX(dst); |
402 | dstlen += leftcur - rightcur; |
403 | d = (U8*)SvGROW(dst,dstlen) + dstcur; |
404 | } |
ac5ea531 |
405 | /* preCC not changed to curCC */ |
406 | uvS = uvComp; |
1efaba7f |
407 | } else if (! curCC && p < e) { /* blocked */ |
ac5ea531 |
408 | break; |
409 | } else { |
410 | preCC = curCC; |
411 | t = uvuni_to_utf8(t, uv); |
412 | } |
413 | } |
414 | } |
2a204b45 |
415 | d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ |
d85850a7 |
416 | tmplen = t - tmp_start; |
417 | if (tmplen) { /* uncomposed combining char */ |
ac5ea531 |
418 | t = (U8*)SvPVX(tmp); |
8f118dcd |
419 | while (tmplen--) |
420 | *d++ = *t++; |
ac5ea531 |
421 | } |
422 | uvS = uv; |
423 | } /* for */ |
00f2676f |
424 | *d = '\0'; |
d85850a7 |
425 | SvCUR_set(dst, d - (U8*)SvPVX(dst)); |
2a204b45 |
426 | RETVAL = dst; |
427 | OUTPUT: |
428 | RETVAL |
ac5ea531 |
429 | |
430 | |
8f118dcd |
431 | void |
432 | checkNFD(arg) |
433 | SV * arg |
434 | PROTOTYPE: $ |
435 | ALIAS: |
436 | checkNFKD = 1 |
437 | PREINIT: |
438 | UV uv; |
439 | SV *src; |
440 | STRLEN srclen, retlen; |
441 | U8 *s, *e, *p, curCC, preCC; |
82e740b6 |
442 | CODE: |
8f118dcd |
443 | if (SvUTF8(arg)) { |
444 | src = arg; |
445 | } else { |
446 | src = sv_mortalcopy(arg); |
447 | sv_utf8_upgrade(src); |
448 | } |
449 | |
450 | s = (U8*)SvPV(src,srclen); |
451 | e = s + srclen; |
452 | |
453 | preCC = 0; |
454 | for (p = s; p < e; p += retlen) { |
e524f5b2 |
455 | uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
456 | if (!retlen) |
457 | croak(ErrRetlenIsZero); |
458 | |
8f118dcd |
459 | curCC = getCombinClass(uv); |
460 | if (preCC > curCC && curCC != 0) /* canonical ordering violated */ |
461 | XSRETURN_NO; |
462 | if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) |
463 | XSRETURN_NO; |
464 | preCC = curCC; |
465 | } |
466 | XSRETURN_YES; |
467 | |
468 | |
469 | |
470 | void |
471 | checkNFC(arg) |
472 | SV * arg |
473 | PROTOTYPE: $ |
474 | ALIAS: |
475 | checkNFKC = 1 |
476 | PREINIT: |
477 | UV uv; |
478 | SV *src; |
479 | STRLEN srclen, retlen; |
480 | U8 *s, *e, *p, curCC, preCC; |
481 | bool isMAYBE; |
82e740b6 |
482 | CODE: |
8f118dcd |
483 | if (SvUTF8(arg)) { |
484 | src = arg; |
485 | } else { |
486 | src = sv_mortalcopy(arg); |
487 | sv_utf8_upgrade(src); |
488 | } |
489 | |
490 | s = (U8*)SvPV(src,srclen); |
491 | e = s + srclen; |
492 | |
493 | preCC = 0; |
494 | isMAYBE = FALSE; |
495 | for (p = s; p < e; p += retlen) { |
e524f5b2 |
496 | uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
497 | if (!retlen) |
498 | croak(ErrRetlenIsZero); |
499 | |
8f118dcd |
500 | curCC = getCombinClass(uv); |
501 | |
502 | if (preCC > curCC && curCC != 0) /* canonical ordering violated */ |
503 | XSRETURN_NO; |
504 | |
505 | /* get NFC/NFKC property */ |
506 | if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ |
507 | ; /* YES */ |
508 | else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) |
509 | XSRETURN_NO; |
510 | else if (isComp2nd(uv)) |
511 | isMAYBE = TRUE; |
512 | else if (ix) { |
513 | char *canon, *compat; |
6c941e0c |
514 | /* NFKC_NO when having compatibility mapping. */ |
8f118dcd |
515 | canon = (char *) dec_canonical(uv); |
516 | compat = (char *) dec_compat(uv); |
6c941e0c |
517 | if (compat && !(canon && strEQ(canon, compat))) |
8f118dcd |
518 | XSRETURN_NO; |
519 | } /* end of get NFC/NFKC property */ |
520 | |
521 | preCC = curCC; |
522 | } |
523 | if (isMAYBE) |
524 | XSRETURN_UNDEF; |
525 | else |
526 | XSRETURN_YES; |
527 | |
528 | |
529 | |
82e740b6 |
530 | void |
531 | checkFCD(arg) |
532 | SV * arg |
533 | PROTOTYPE: $ |
534 | ALIAS: |
535 | checkFCC = 1 |
536 | PREINIT: |
537 | UV uv, uvLead, uvTrail; |
538 | SV *src; |
539 | STRLEN srclen, retlen, canlen, canret; |
540 | U8 *s, *e, *p, curCC, preCC; |
541 | U8 *sCan, *pCan, *eCan; |
542 | bool isMAYBE; |
543 | CODE: |
544 | if (SvUTF8(arg)) { |
545 | src = arg; |
546 | } else { |
547 | src = sv_mortalcopy(arg); |
548 | sv_utf8_upgrade(src); |
549 | } |
550 | |
551 | s = (U8*)SvPV(src,srclen); |
552 | e = s + srclen; |
553 | |
554 | preCC = 0; |
555 | isMAYBE = FALSE; |
556 | for (p = s; p < e; p += retlen) { |
e524f5b2 |
557 | uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
558 | if (!retlen) |
559 | croak(ErrRetlenIsZero); |
560 | |
561 | sCan = (U8*) dec_canonical(uv); |
562 | |
563 | if (sCan) { |
564 | canlen = (STRLEN)strlen((char *) sCan); |
e524f5b2 |
565 | uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF); |
82e740b6 |
566 | } |
567 | else { |
568 | uvLead = uv; |
569 | } |
570 | |
571 | curCC = getCombinClass(uvLead); |
572 | |
573 | if (curCC != 0 && curCC < preCC) /* canonical ordering violated */ |
574 | XSRETURN_NO; |
575 | |
576 | if (ix) { |
577 | if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) |
578 | XSRETURN_NO; |
579 | else if (isComp2nd(uv)) |
580 | isMAYBE = TRUE; |
581 | } |
582 | |
583 | if (sCan) { |
584 | eCan = sCan + canlen; |
585 | pCan = utf8_hop(eCan, -1); |
586 | if (pCan < sCan) |
587 | croak(ErrHopBeforeStart); |
e524f5b2 |
588 | uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF); |
82e740b6 |
589 | preCC = getCombinClass(uvTrail); |
590 | } |
591 | else { |
592 | preCC = curCC; |
593 | } |
594 | } |
595 | if (isMAYBE) |
596 | XSRETURN_UNDEF; |
597 | else |
598 | XSRETURN_YES; |
599 | |
600 | |
601 | |
ac5ea531 |
602 | U8 |
603 | getCombinClass(uv) |
604 | UV uv |
8f118dcd |
605 | PROTOTYPE: $ |
ac5ea531 |
606 | |
607 | bool |
2a204b45 |
608 | isExclusion(uv) |
ac5ea531 |
609 | UV uv |
8f118dcd |
610 | PROTOTYPE: $ |
611 | |
612 | bool |
613 | isSingleton(uv) |
614 | UV uv |
615 | PROTOTYPE: $ |
616 | |
617 | bool |
618 | isNonStDecomp(uv) |
619 | UV uv |
620 | PROTOTYPE: $ |
621 | |
622 | bool |
623 | isComp2nd(uv) |
624 | UV uv |
625 | PROTOTYPE: $ |
626 | ALIAS: |
627 | isNFC_MAYBE = 1 |
628 | isNFKC_MAYBE = 2 |
629 | |
630 | |
631 | |
632 | void |
633 | isNFD_NO(uv) |
634 | UV uv |
635 | PROTOTYPE: $ |
636 | ALIAS: |
637 | isNFKD_NO = 1 |
82e740b6 |
638 | CODE: |
8f118dcd |
639 | if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) |
640 | XSRETURN_YES; /* NFD_NO or NFKD_NO */ |
641 | else |
642 | XSRETURN_NO; |
643 | |
644 | |
645 | |
646 | void |
647 | isComp_Ex(uv) |
648 | UV uv |
649 | PROTOTYPE: $ |
650 | ALIAS: |
651 | isNFC_NO = 0 |
652 | isNFKC_NO = 1 |
82e740b6 |
653 | CODE: |
8f118dcd |
654 | if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) |
655 | XSRETURN_YES; /* NFC_NO or NFKC_NO */ |
656 | else if (ix) { |
657 | char *canon, *compat; |
658 | canon = (char *) dec_canonical(uv); |
659 | compat = (char *) dec_compat(uv); |
660 | if (compat && (!canon || strNE(canon, compat))) |
661 | XSRETURN_YES; /* NFC_NO or NFKC_NO */ |
662 | else |
663 | XSRETURN_NO; |
664 | } |
665 | else |
666 | XSRETURN_NO; |
667 | |
668 | |
ac5ea531 |
669 | |
2a204b45 |
670 | SV* |
ac5ea531 |
671 | getComposite(uv, uv2) |
672 | UV uv |
673 | UV uv2 |
2a204b45 |
674 | PROTOTYPE: $$ |
675 | PREINIT: |
bcdb689b |
676 | UV composite; |
2a204b45 |
677 | CODE: |
bcdb689b |
678 | composite = composite_uv(uv, uv2); |
679 | RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; |
2a204b45 |
680 | OUTPUT: |
681 | RETVAL |
ac5ea531 |
682 | |
8f118dcd |
683 | |
684 | |
ac5ea531 |
685 | SV* |
686 | getCanon(uv) |
687 | UV uv |
688 | PROTOTYPE: $ |
689 | ALIAS: |
690 | getCompat = 1 |
691 | PREINIT: |
692 | U8 * rstr; |
693 | CODE: |
8f118dcd |
694 | if (Hangul_IsS(uv)) { |
ac5ea531 |
695 | SV * dst; |
696 | dst = newSV(1); |
697 | (void)SvPOK_only(dst); |
698 | sv_cat_decompHangul(dst, uv); |
699 | RETVAL = dst; |
700 | } else { |
701 | rstr = ix ? dec_compat(uv) : dec_canonical(uv); |
8f118dcd |
702 | if (!rstr) |
703 | XSRETURN_UNDEF; |
ac5ea531 |
704 | RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); |
705 | } |
706 | SvUTF8_on(RETVAL); |
707 | OUTPUT: |
708 | RETVAL |
709 | |
82e740b6 |
710 | |
711 | void |
712 | splitOnLastStarter(arg) |
713 | SV * arg |
714 | PREINIT: |
715 | UV uv; |
716 | SV *src, *svp; |
717 | STRLEN srclen, retlen; |
718 | U8 *s, *e, *p; |
719 | PPCODE: |
720 | if (SvUTF8(arg)) { |
721 | src = arg; |
722 | } else { |
723 | src = sv_mortalcopy(arg); |
724 | sv_utf8_upgrade(src); |
725 | } |
726 | |
727 | s = (U8*)SvPV(src,srclen); |
728 | e = s + srclen; |
729 | |
730 | for (p = e; s < p; ) { |
731 | p = utf8_hop(p, -1); |
732 | if (p < s) |
733 | croak(ErrHopBeforeStart); |
e524f5b2 |
734 | uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 |
735 | if (getCombinClass(uv) == 0) /* Last Starter found */ |
736 | break; |
737 | } |
738 | |
739 | svp = sv_2mortal(newSVpvn((char*)s, p - s)); |
740 | SvUTF8_on(svp); |
741 | XPUSHs(svp); |
742 | |
743 | svp = sv_2mortal(newSVpvn((char*)p, e - p)); |
744 | SvUTF8_on(svp); |
745 | XPUSHs(svp); |
746 | |