Commit | Line | Data |
ac5ea531 |
1 | |
2 | #include "EXTERN.h" |
3 | #include "perl.h" |
4 | #include "XSUB.h" |
5 | |
6 | /* These 5 files are prepared by mkheader */ |
7 | #include "unfcmb.h" |
8 | #include "unfcan.h" |
9 | #include "unfcpt.h" |
10 | #include "unfcmp.h" |
11 | #include "unfexc.h" |
12 | |
/*
 * Fallbacks for perls (5.6.1 ?) whose headers do not provide the
 * uvuni_to_utf8 / utf8n_to_uvuni macros: use the older names instead.
 */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8 uv_to_utf8
#endif /* uvuni_to_utf8 */

#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni utf8_to_uv
#endif /* utf8n_to_uvuni */

/*
 * Flags passed to utf8n_to_uvuni() by every decoder loop in this file.
 * Each UTF8_ALLOW_* flag is optional: one that the perl headers do not
 * define contributes 0 to the mask, so the file still builds when a flag
 * is missing (UTF8_ALLOW_BOM is only used before Perl 5.8.0).
 */
#ifdef UTF8_ALLOW_SURROGATE
#define UNF_ALLOW_SURROGATE (UTF8_ALLOW_SURROGATE)
#else
#define UNF_ALLOW_SURROGATE (0)
#endif

#ifdef UTF8_ALLOW_BOM
#define UNF_ALLOW_BOM (UTF8_ALLOW_BOM)
#else
#define UNF_ALLOW_BOM (0)
#endif

#ifdef UTF8_ALLOW_FFFF
#define UNF_ALLOW_FFFF (UTF8_ALLOW_FFFF)
#else
#define UNF_ALLOW_FFFF (0)
#endif

#define AllowAnyUTF (UNF_ALLOW_SURROGATE|UNF_ALLOW_BOM|UNF_ALLOW_FFFF)

/* if utf8n_to_uvuni() sets retlen to 0 (?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"

/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* At present, chars > 0x10ffff are left unaffected without complaint. */
#define VALID_UTF_MAX (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
39 | |
/* HANGUL_H */
/*
 * Hangul syllable arithmetic.  Only the bases and the L/V/T counts are
 * spelled out; the derived counts and the last code point of each range
 * are computed from them.
 */
#define Hangul_LCount 19
#define Hangul_VCount 21
#define Hangul_TCount 28
#define Hangul_NCount (Hangul_VCount * Hangul_TCount)
#define Hangul_SCount (Hangul_LCount * Hangul_NCount)

#define Hangul_SBase 0xAC00
#define Hangul_LBase 0x1100
#define Hangul_VBase 0x1161
#define Hangul_TBase 0x11A7

#define Hangul_SFinal (Hangul_SBase + Hangul_SCount - 1)
#define Hangul_LFinal (Hangul_LBase + Hangul_LCount - 1)
#define Hangul_VFinal (Hangul_VBase + Hangul_VCount - 1)
#define Hangul_TFinal (Hangul_TBase + Hangul_TCount - 1)

/* Range predicates.  Each one evaluates its argument more than once. */
#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* Offset from SBase is a multiple of TCount; meaningful together with IsS. */
#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* TBase itself is excluded: the comparison is strict on the lower bound. */
#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL_H */
66 | |
67 | /* this is used for canonical ordering of combining characters (c.c.). */ |
68 | typedef struct { |
69 | U8 cc; /* combining class */ |
70 | UV uv; /* codepoint */ |
71 | STRLEN pos; /* position */ |
72 | } UNF_cc; |
73 | |
e524f5b2 |
74 | static int compare_cc (const void *a, const void *b) |
ac5ea531 |
75 | { |
76 | int ret_cc; |
6c941e0c |
77 | ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
8f118dcd |
78 | if (ret_cc) |
79 | return ret_cc; |
6c941e0c |
80 | |
81 | return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) |
82 | - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); |
ac5ea531 |
83 | } |
84 | |
e524f5b2 |
85 | static U8* dec_canonical (UV uv) |
ac5ea531 |
86 | { |
87 | U8 ***plane, **row; |
8f118dcd |
88 | if (OVER_UTF_MAX(uv)) |
89 | return NULL; |
ac5ea531 |
90 | plane = (U8***)UNF_canon[uv >> 16]; |
8f118dcd |
91 | if (! plane) |
92 | return NULL; |
ac5ea531 |
93 | row = plane[(uv >> 8) & 0xff]; |
94 | return row ? row[uv & 0xff] : NULL; |
95 | } |
96 | |
e524f5b2 |
97 | static U8* dec_compat (UV uv) |
ac5ea531 |
98 | { |
99 | U8 ***plane, **row; |
8f118dcd |
100 | if (OVER_UTF_MAX(uv)) |
101 | return NULL; |
ac5ea531 |
102 | plane = (U8***)UNF_compat[uv >> 16]; |
8f118dcd |
103 | if (! plane) |
104 | return NULL; |
ac5ea531 |
105 | row = plane[(uv >> 8) & 0xff]; |
106 | return row ? row[uv & 0xff] : NULL; |
107 | } |
108 | |
e524f5b2 |
109 | static UV composite_uv (UV uv, UV uv2) |
ac5ea531 |
110 | { |
111 | UNF_complist ***plane, **row, *cell, *i; |
112 | |
8f118dcd |
113 | if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) |
114 | return 0; |
ac5ea531 |
115 | |
8f118dcd |
116 | if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
ac5ea531 |
117 | uv -= Hangul_LBase; /* lindex */ |
118 | uv2 -= Hangul_VBase; /* vindex */ |
119 | return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount); |
120 | } |
8f118dcd |
121 | if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
ac5ea531 |
122 | uv2 -= Hangul_TBase; /* tindex */ |
2a204b45 |
123 | return(uv + uv2); |
ac5ea531 |
124 | } |
125 | plane = UNF_compos[uv >> 16]; |
8f118dcd |
126 | if (! plane) |
127 | return 0; |
ac5ea531 |
128 | row = plane[(uv >> 8) & 0xff]; |
8f118dcd |
129 | if (! row) |
130 | return 0; |
ac5ea531 |
131 | cell = row[uv & 0xff]; |
8f118dcd |
132 | if (! cell) |
133 | return 0; |
134 | for (i = cell; i->nextchar; i++) { |
135 | if (uv2 == i->nextchar) |
136 | return i->composite; |
ac5ea531 |
137 | } |
138 | return 0; |
139 | } |
140 | |
e524f5b2 |
141 | static U8 getCombinClass (UV uv) |
ac5ea531 |
142 | { |
143 | U8 **plane, *row; |
8f118dcd |
144 | if (OVER_UTF_MAX(uv)) |
145 | return 0; |
ac5ea531 |
146 | plane = (U8**)UNF_combin[uv >> 16]; |
8f118dcd |
147 | if (! plane) |
148 | return 0; |
ac5ea531 |
149 | row = plane[(uv >> 8) & 0xff]; |
150 | return row ? row[uv & 0xff] : 0; |
151 | } |
152 | |
e524f5b2 |
153 | static void sv_cat_decompHangul (SV* sv, UV uv) |
ac5ea531 |
154 | { |
155 | UV sindex, lindex, vindex, tindex; |
2a204b45 |
156 | U8 *t, tmp[3 * UTF8_MAXLEN + 1]; |
ac5ea531 |
157 | |
8f118dcd |
158 | if (! Hangul_IsS(uv)) |
159 | return; |
ac5ea531 |
160 | |
161 | sindex = uv - Hangul_SBase; |
162 | lindex = sindex / Hangul_NCount; |
163 | vindex = (sindex % Hangul_NCount) / Hangul_TCount; |
164 | tindex = sindex % Hangul_TCount; |
165 | |
2a204b45 |
166 | t = tmp; |
ac5ea531 |
167 | t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); |
168 | t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); |
8f118dcd |
169 | if (tindex) |
170 | t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); |
ac5ea531 |
171 | *t = '\0'; |
e524f5b2 |
172 | sv_catpvn(sv, (char *)tmp, t - tmp); |
173 | } |
174 | |
175 | static void sv_cat_uvuni (SV* sv, UV uv) |
176 | { |
177 | U8 *t, tmp[UTF8_MAXLEN + 1]; |
178 | |
179 | t = tmp; |
180 | t = uvuni_to_utf8(t, uv); |
181 | *t = '\0'; |
182 | sv_catpvn(sv, (char *)tmp, t - tmp); |
ac5ea531 |
183 | } |
184 | |
a092bcfd |
185 | static char * sv_2pvunicode(SV *sv, STRLEN *lp) |
186 | { |
187 | char *s; |
188 | STRLEN len; |
189 | s = (char*)SvPV(sv,len); |
190 | if (!SvUTF8(sv)) { |
191 | SV* tmpsv = sv_mortalcopy(sv); |
192 | if (!SvPOK(tmpsv)) |
193 | (void)sv_pvn_force(tmpsv,&len); |
194 | sv_utf8_upgrade(tmpsv); |
195 | s = (char*)SvPV(tmpsv,len); |
196 | } |
197 | *lp = len; |
198 | return s; |
199 | } |
200 | |
ac5ea531 |
201 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize |
202 | |
ac5ea531 |
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    UV uv;
    bool iscompat;
  CODE:
    /*
     * Returns a new UTF-8 string in which every character of src is
     * replaced by its table decomposition: the compatibility table when
     * compat is true, the canonical table otherwise.  Hangul syllables
     * are decomposed arithmetically; characters without a table entry
     * are copied through.
     */
    iscompat = SvTRUE(compat);
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    /* start from a real empty string so that the result is
       NUL-terminated even when src is empty */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);

    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen) {
            SvREFCNT_dec(dst);  /* not mortal yet: release before dying */
            croak(ErrRetlenIsZero);
        }

        if (Hangul_IsS(uv))
            sv_cat_decompHangul(dst, uv);
        else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else
                sv_cat_uvuni(dst, uv);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
241 | |
242 | |
243 | |
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv, uvlast;
    UNF_cc * stk_cc;
    STRLEN i, cc_pos;
    bool valid_uvlast;
  CODE:
    /*
     * Returns a new UTF-8 string with each run of characters of non-zero
     * combining class sorted by (combining class, original position).
     * Characters of class 0 are copied through unchanged.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    stk_cc_max = 10; /* initial capacity; doubled on demand below */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    for (p = s; p < e;) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen) {
            /* neither buffer is owned by perl yet: release before dying */
            Safefree(stk_cc);
            SvREFCNT_dec(dst);
            croak(ErrRetlenIsZero);
        }
        p += retlen;

        curCC = getCombinClass(uv);
        if (curCC == 0) {
            d = uvuni_to_utf8(d, uv);
            continue;
        }

        /* first combining character of a run */
        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;

        valid_uvlast = FALSE;
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                Safefree(stk_cc);
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            curCC = getCombinClass(uv);
            if (curCC == 0) {
                /* class 0 ends the run; emit it after the sorted run */
                uvlast = uv;
                valid_uvlast = TRUE;
                break;
            }

            cc_pos++;
            if (stk_cc_max <= cc_pos) { /* extend if need */
                /* double rather than grow by one, so a long run does
                   not trigger a reallocation per character */
                stk_cc_max *= 2;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[cc_pos].cc  = curCC;
            stk_cc[cc_pos].uv  = uv;
            stk_cc[cc_pos].pos = cc_pos;
        }

        /* reordered if there are two c.c.'s */
        if (cc_pos) {
            qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
        }

        for (i = 0; i <= cc_pos; i++) {
            d = uvuni_to_utf8(d, stk_cc[i].uv);
        }
        if (valid_uvlast)
        {
            d = uvuni_to_utf8(d, uvlast);
        }
    }
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL
329 | |
330 | |
331 | |
2a204b45 |
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV *dst, *tmp;
    U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv = 0, uvS, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    /*
     * Returns a new UTF-8 string in which each starter is combined with
     * the following characters wherever composite_uv() yields a composite
     * that is not excluded.  Under the composeContiguous alias (ix != 0) a
     * character is composed only while nothing has been left uncomposed
     * since the current starter.
     *
     * uv starts at 0 so that the "uvS = uv" at the bottom of the outer
     * loop never reads an indeterminate value when the inner loop did not
     * run (for example when the last character of src is the only one
     * left after the starter).
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
        if (beginning) {
            uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                SvREFCNT_dec(dst);  /* tmp is mortal, dst is not */
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);

                if (uvComp && ! isExclusion(uvComp) &&
                        (ix ? (t == tmp_start) : (preCC <= curCC))) {
                    STRLEN leftcur, rightcur, dstcur;
                    leftcur  = UNISKIP(uvComp);
                    rightcur = UNISKIP(uvS) + UNISKIP(uv);

                    /* the composite may need more bytes than the pair
                       it replaces: grow dst and re-derive d */
                    if (leftcur > rightcur) {
                        dstcur = d - (U8*)SvPVX(dst);
                        dstlen += leftcur - rightcur;
                        d = (U8*)SvGROW(dst,dstlen) + dstcur;
                    }
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        tmplen = t - tmp_start;
        if (tmplen) { /* uncomposed combining char */
            t = (U8*)SvPVX(tmp);
            while (tmplen--)
                *d++ = *t++;
        }
        uvS = uv;
    } /* for */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 |
427 | |
428 | |
8f118dcd |
void
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
  CODE:
    /*
     * Returns YES unless src contains a combining-class ordering
     * violation, a Hangul syllable, or a character with a decomposition
     * (compatibility table under the checkNFKD alias, canonical table
     * otherwise); any of those returns NO.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
        p += retlen;

        curCC = getCombinClass(uv);
        if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
            XSRETURN_NO;
        if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
            XSRETURN_NO;
        preCC = curCC;
    }
    XSRETURN_YES;
457 | |
458 | |
459 | |
void
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
    bool isMAYBE;
  CODE:
    /*
     * Quick check returning NO, undef (maybe) or YES.
     * NO: combining-class ordering violated, or a character is an
     * exclusion, a singleton or a non-starter decomposition; under the
     * checkNFKC alias also a character whose compatibility mapping
     * differs from its canonical one.  undef: no NO condition was met but
     * some character satisfies isComp2nd().  YES otherwise.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    p = s;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
        p += retlen;

        curCC = getCombinClass(uv);
        if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
            XSRETURN_NO;
        preCC = curCC;

        /* get NFC/NFKC property */
        if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
            continue; /* YES */

        if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
            XSRETURN_NO;

        if (isComp2nd(uv)) {
            isMAYBE = TRUE;
            continue;
        }

        if (ix) {
            /* NFKC_NO when having compatibility mapping. */
            char *canon  = (char *) dec_canonical(uv);
            char *compat = (char *) dec_compat(uv);
            if (compat && (!canon || strNE(canon, compat)))
                XSRETURN_NO;
        } /* end of get NFC/NFKC property */
    }

    if (isMAYBE)
        XSRETURN_UNDEF;
    XSRETURN_YES;
509 | |
510 | |
511 | |
82e740b6 |
void
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen, canlen, canret;
    U8 *s, *e, *p, curCC, preCC;
    UV uv, uvLead, uvTrail;
    U8 *sCan, *pCan, *eCan;
    bool isMAYBE;
  CODE:
    /*
     * For every character, compares the combining class of the first
     * code point of its canonical decomposition (or of the character
     * itself when it has none) against the class of the last code point
     * of the previous character's decomposition; a non-zero class lower
     * than the previous one returns NO.  Under the checkFCC alias the
     * exclusion / singleton / non-starter tests also return NO and
     * isComp2nd() characters turn a final YES into undef.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    p = s;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
        p += retlen;

        sCan = (U8*) dec_canonical(uv);

        /* leading code point of the decomposition, if there is one */
        if (sCan) {
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
        }
        else {
            uvLead = uv;
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
            XSRETURN_NO;

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
                XSRETURN_NO;
            if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        /* trailing code point of the decomposition feeds the next round */
        if (sCan) {
            eCan = sCan + canlen;
            pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }

    if (isMAYBE)
        XSRETURN_UNDEF;
    XSRETURN_YES;
573 | |
574 | |
575 | |
ac5ea531 |
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $

bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    # Both alias names share this XSUB; there is no CODE: section,
    # so the alias index plays no part in the result.
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
603 | |
604 | |
605 | |
void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /*
     * YES when uv is a Hangul syllable or has a decomposition entry
     * (compatibility table under the isNFKD_NO alias, canonical table
     * otherwise); NO in every other case.
     */
    if (Hangul_IsS(uv))
        XSRETURN_YES; /* NFD_NO or NFKD_NO */
    if (ix ? dec_compat(uv) : dec_canonical(uv))
        XSRETURN_YES; /* NFD_NO or NFKD_NO */
    XSRETURN_NO;
617 | |
618 | |
619 | |
void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  CODE:
    /*
     * YES when uv is an exclusion, a singleton or a non-starter
     * decomposition.  Under the isNFKC_NO alias, also YES when uv has a
     * compatibility mapping that is not identical to its canonical one.
     * NO otherwise.
     */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
        XSRETURN_YES; /* NFC_NO or NFKC_NO */

    if (ix) {
        char *canon  = (char *) dec_canonical(uv);
        char *compat = (char *) dec_compat(uv);
        if (compat && !(canon && strEQ(canon, compat)))
            XSRETURN_YES; /* NFC_NO or NFKC_NO */
    }
    XSRETURN_NO;
641 | |
642 | |
ac5ea531 |
643 | |
2a204b45 |
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /* Returns composite_uv(uv, uv2) as an unsigned integer,
       or undef when that result is 0. */
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 |
656 | |
8f118dcd |
657 | |
658 | |
ac5ea531 |
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /*
     * Returns the decomposition of uv as a UTF-8 flagged string: the
     * arithmetic jamo decomposition for Hangul syllables, else the
     * table entry (compatibility table under the getCompat alias,
     * canonical otherwise).  Returns undef when there is no entry.
     */
    if (! Hangul_IsS(uv)) {
        rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (!rstr)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    } else {
        SV * hangul;
        hangul = newSV(1);
        (void)SvPOK_only(hangul);
        sv_cat_decompHangul(hangul, uv);
        RETVAL = hangul;
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
683 | |
82e740b6 |
684 | |
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen, retlen;
    U8 *s, *e, *p;
    UV uv;
  PPCODE:
    /*
     * Pushes two mortal UTF-8 strings: the part of src before its last
     * character of combining class 0, and the part from that character to
     * the end.  When no such character exists the first string is empty
     * and the second is the whole of src.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    p = e;
    while (s < p) {
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    /* everything before the split point */
    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    /* the split point and everything after it */
    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
713 | |