/*
 * Blame view of: [p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.xs
 * Page title (commit subject): "Re: Clock skew failures in Memoize test suite"
 * Blame columns: Commit, Line, Data
 */
ac5ea531 1
2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
5
6/* These 5 files are prepared by mkheader */
7#include "unfcmb.h"
8#include "unfcan.h"
9#include "unfcpt.h"
10#include "unfcmp.h"
11#include "unfexc.h"
12
/*
 * Compatibility shims (Perl 5.6.1 ?): when the uvuni_to_utf8 /
 * utf8n_to_uvuni names are not provided as macros, fall back to the
 * uv_to_utf8 / utf8_to_uv spellings.
 */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8 uv_to_utf8
#endif /* uvuni_to_utf8 */

#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni utf8_to_uv
#endif /* utf8n_to_uvuni */
ac5ea531 22
/*
 * Largest code point the lookup functions in this file will index with.
 * At present, code points above 0x10ffff are left unaffected without
 * complaint.
 */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
26
/* HANGUL_H */
/*
 * Constants for arithmetic Hangul (de)composition.
 *   S = precomposed syllable, L = leading consonant, V = vowel,
 *   T = trailing consonant.
 * Each *Final equals *Base + *Count - 1.  NCount = VCount * TCount and
 * SCount = LCount * NCount.
 */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount 11172

#define Hangul_NCount 588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

/* TBase itself is not a trailing consonant: a tindex of 0 means "no T". */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* Range predicates.  Note that every one of them evaluates (u) twice. */
#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* True when the offset from SBase has tindex 0; only meaningful for a syllable. */
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
/* Syllable made of L and V only (no trailing consonant). */
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* Strictly greater than TBase, see the note on Hangul_TBase. */
#define Hangul_IsT(u)  ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL_H */
53
54/* this is used for canonical ordering of combining characters (c.c.). */
55typedef struct {
56 U8 cc; /* combining class */
57 UV uv; /* codepoint */
58 STRLEN pos; /* position */
59} UNF_cc;
60
61int compare_cc(const void *a, const void *b)
62{
63 int ret_cc;
64 ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
8f118dcd 65 if (ret_cc)
66 return ret_cc;
ac5ea531 67 return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
68}
69
70U8* dec_canonical (UV uv)
71{
72 U8 ***plane, **row;
8f118dcd 73 if (OVER_UTF_MAX(uv))
74 return NULL;
ac5ea531 75 plane = (U8***)UNF_canon[uv >> 16];
8f118dcd 76 if (! plane)
77 return NULL;
ac5ea531 78 row = plane[(uv >> 8) & 0xff];
79 return row ? row[uv & 0xff] : NULL;
80}
81
82U8* dec_compat (UV uv)
83{
84 U8 ***plane, **row;
8f118dcd 85 if (OVER_UTF_MAX(uv))
86 return NULL;
ac5ea531 87 plane = (U8***)UNF_compat[uv >> 16];
8f118dcd 88 if (! plane)
89 return NULL;
ac5ea531 90 row = plane[(uv >> 8) & 0xff];
91 return row ? row[uv & 0xff] : NULL;
92}
93
2a204b45 94UV composite_uv (UV uv, UV uv2)
ac5ea531 95{
96 UNF_complist ***plane, **row, *cell, *i;
97
8f118dcd 98 if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
99 return 0;
ac5ea531 100
8f118dcd 101 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
ac5ea531 102 uv -= Hangul_LBase; /* lindex */
103 uv2 -= Hangul_VBase; /* vindex */
104 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
105 }
8f118dcd 106 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
ac5ea531 107 uv2 -= Hangul_TBase; /* tindex */
2a204b45 108 return(uv + uv2);
ac5ea531 109 }
110 plane = UNF_compos[uv >> 16];
8f118dcd 111 if (! plane)
112 return 0;
ac5ea531 113 row = plane[(uv >> 8) & 0xff];
8f118dcd 114 if (! row)
115 return 0;
ac5ea531 116 cell = row[uv & 0xff];
8f118dcd 117 if (! cell)
118 return 0;
119 for (i = cell; i->nextchar; i++) {
120 if (uv2 == i->nextchar)
121 return i->composite;
ac5ea531 122 }
123 return 0;
124}
125
126U8 getCombinClass (UV uv)
127{
128 U8 **plane, *row;
8f118dcd 129 if (OVER_UTF_MAX(uv))
130 return 0;
ac5ea531 131 plane = (U8**)UNF_combin[uv >> 16];
8f118dcd 132 if (! plane)
133 return 0;
ac5ea531 134 row = plane[(uv >> 8) & 0xff];
135 return row ? row[uv & 0xff] : 0;
136}
137
138void sv_cat_decompHangul (SV* sv, UV uv)
139{
140 UV sindex, lindex, vindex, tindex;
2a204b45 141 U8 *t, tmp[3 * UTF8_MAXLEN + 1];
ac5ea531 142
8f118dcd 143 if (! Hangul_IsS(uv))
144 return;
ac5ea531 145
146 sindex = uv - Hangul_SBase;
147 lindex = sindex / Hangul_NCount;
148 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
149 tindex = sindex % Hangul_TCount;
150
2a204b45 151 t = tmp;
ac5ea531 152 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
153 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
8f118dcd 154 if (tindex)
155 t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
ac5ea531 156 *t = '\0';
2a204b45 157 sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
ac5ea531 158}
159
MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize
161
SV*
decompose(arg, compat = &PL_sv_no)
    SV * arg
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    UV uv;
    SV *src, *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    bool iscompat;
  CODE:
    /*
     * Returns a new UTF-8 flagged string built from arg character by
     * character: Hangul syllables are decomposed arithmetically, other
     * characters are replaced by their dec_compat() entry when compat is
     * true and by their dec_canonical() entry otherwise.  Characters
     * without an entry are copied unchanged.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        /* upgrade a mortal copy so that the caller's scalar is untouched */
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }
    iscompat = SvTRUE(compat);

    /*
     * Start from a genuine empty string so that the result is a valid,
     * NUL-terminated string even when arg is empty and nothing is appended.
     */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;
    for (p = s; p < e;) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        p += retlen;
        if (Hangul_IsS(uv))
            sv_cat_decompHangul(dst, uv);
        else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else    /* no mapping: copy the bytes just consumed */
                sv_catpvn(dst, (char *)p - retlen, retlen);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
204
205
206
SV*
reorder(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src, *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv;
    UNF_cc * stk_cc;
  CODE:
    /*
     * Returns a copy of arg (as UTF-8) in which every run of two or more
     * consecutive characters with a non-zero combining class has been
     * sorted by combining class, keeping the original order inside a class
     * (see compare_cc).  The copy is rewritten in place.
     */
    src = arg;
    if (! SvUTF8(arg)) {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    sv_setpvn(dst,(const char*)s,srclen);
    SvUTF8_on(dst);

    stk_cc_max = 10; /* enough as an initial value? */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    d = (U8*)SvPV(dst,dstlen);
    e = d + dstlen;

    p = d;
    while (p < e) {
        U8 *run_start = p;      /* first byte of the candidate run */
        STRLEN last, i;

        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);
        p += retlen;

        /* a run starts at a combining character that is not the last one */
        if (curCC == 0 || p >= e)
            continue;

        last = 0;
        stk_cc[last].cc  = curCC;
        stk_cc[last].uv  = uv;
        stk_cc[last].pos = last;

        /* collect the following combining characters */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
            curCC = getCombinClass(uv);
            if (curCC == 0)
                break;          /* not consumed: the outer loop re-reads it */
            p += retlen;
            last++;
            if (last >= stk_cc_max) { /* extend if need */
                stk_cc_max = last + 1;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[last].cc  = curCC;
            stk_cc[last].uv  = uv;
            stk_cc[last].pos = last;
        }

        /* only one c.c. in the run, no need of reordering */
        if (last == 0)
            continue;

        qsort((void*)stk_cc, last + 1, sizeof(UNF_cc), compare_cc);

        /* write the sorted characters back over the run */
        p = run_start;
        for (i = 0; i <= last; i++)
            p = uvuni_to_utf8(p, stk_cc[i].uv);
    }
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL
288
289
290
SV*
compose(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src, *dst, *tmp;
    U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv = 0, uvS, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    /*
     * Returns a new UTF-8 flagged string in which each starter of arg has
     * been combined, via composite_uv(), with the following characters
     * wherever the composite is not excluded and the combination is not
     * blocked.  Characters that could not be combined are kept after
     * their starter.
     *
     * uv is initialised because "uvS = uv" at the bottom of the outer loop
     * is also reached when the inner loop never ran (e.g. a string made of
     * a single starter).
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
        if (beginning) {
            uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
            p += retlen;
            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                /* same class as the previous uncomposed char: kept as is */
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);

                if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
                    STRLEN leftcur, rightcur, dstcur;
                    leftcur  = UNISKIP(uvComp);
                    rightcur = UNISKIP(uvS) + UNISKIP(uv);

                    /* the composite may need more bytes than its parts */
                    if (leftcur > rightcur) {
                        dstcur = d - (U8*)SvPVX(dst);
                        dstlen += leftcur - rightcur;
                        d = (U8*)SvGROW(dst,dstlen) + dstcur;
                    }
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        tmplen = t - tmp_start;
        if (tmplen) { /* uncomposed combining char */
            t = (U8*)SvPVX(tmp);
            while (tmplen--)
                *d++ = *t++;
        }
        uvS = uv;   /* the starter that ended the inner loop, if any */
    } /* for */

    /*
     * Terminate the buffer before publishing its length.  At most
     * dstlen - 1 bytes have been written and at least dstlen bytes are
     * allocated, so the NUL always fits.
     */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 383
384
385
void
checkNFD(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
  PPCODE:
    /*
     * Returns YES when arg needs no decomposition and is canonically
     * ordered, NO otherwise.  Called as checkNFKD (ix == 1) the
     * compatibility table is consulted instead of the canonical one.
     */
    src = arg;
    if (! SvUTF8(arg)) {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);

        /* canonical ordering violated */
        if (curCC != 0 && curCC < preCC) {
            XSRETURN_NO;
        }

        /* the character itself is decomposable */
        if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
            XSRETURN_NO;
        }

        preCC = curCC;
        p += retlen;
    }
    XSRETURN_YES;
419
420
421
void
checkNFC(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool isMAYBE;
  PPCODE:
    /*
     * Quick check: returns NO as soon as a character rules the form out,
     * undef (MAYBE) when no character did but at least one satisfies
     * isComp2nd(), and YES otherwise.
     * Called as checkNFKC (ix == 1), compatibility mappings also yield NO.
     */
    src = arg;
    if (! SvUTF8(arg)) {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    p = s;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);

        /* canonical ordering violated */
        if (curCC != 0 && curCC < preCC) {
            XSRETURN_NO;
        }

        /* get NFC/NFKC property */
        /* Hangul syllables are canonical composites: always YES */
        if (! Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                XSRETURN_NO;
            }
            if (isComp2nd(uv)) {
                isMAYBE = TRUE;
            }
            else if (ix) {
                char *canon, *compat;
                /*
                 * NFKC_NO when having compatibility mapping;
                 * i.e. dec_compat(uv) defined & different with dec_canonical(uv).
                 */
                canon  = (char *) dec_canonical(uv);
                compat = (char *) dec_compat(uv);
                if (compat && (!canon || strNE(canon, compat))) {
                    XSRETURN_NO;
                }
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
        p += retlen;
    }

    if (! isMAYBE) {
        XSRETURN_YES;
    }
    XSRETURN_UNDEF;
479
480
481
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
  # No CODE: section; presumably xsubpp generates the call to the C
  # function getCombinClass() - confirm against the perlxs documentation.
ac5ea531 486
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
  # No CODE: section; isExclusion() itself is not defined in this part of
  # the file (presumably it comes from a mkheader-generated header).
491
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
  # No CODE: section; isSingleton() itself is not defined in this part of
  # the file (presumably it comes from a mkheader-generated header).
496
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
  # No CODE: section; isNonStDecomp() itself is not defined in this part
  # of the file (presumably it comes from a mkheader-generated header).
501
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
  # No CODE: section, so nothing here reads the alias index: the two
  # *_MAYBE names are declared as plain aliases of isComp2nd.
509
510
511
void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PPCODE:
    /*
     * YES when uv is a Hangul syllable or has a decomposition entry;
     * isNFKD_NO (ix == 1) consults dec_compat(), isNFD_NO dec_canonical().
     */
    if (! Hangul_IsS(uv) && ! (ix ? dec_compat(uv) : dec_canonical(uv))) {
        XSRETURN_NO;
    }
    XSRETURN_YES; /* NFD_NO or NFKD_NO */
523
524
525
void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PPCODE:
    /*
     * YES when uv is a composition exclusion, a singleton or a non-starter
     * decomposition.  As isNFKC_NO (ix == 1) it is also YES when uv has a
     * compatibility mapping that differs from its canonical one.
     */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
        XSRETURN_YES; /* NFC_NO or NFKC_NO */
    }
    if (ix) {
        char *canon, *compat;
        canon  = (char *) dec_canonical(uv);
        compat = (char *) dec_compat(uv);
        if (compat && (!canon || strNE(canon, compat))) {
            XSRETURN_YES; /* NFC_NO or NFKC_NO */
        }
    }
    XSRETURN_NO;
547
548
ac5ea531 549
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /* composite_uv() reports "no composite" as 0, returned here as undef */
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 562
8f118dcd 563
564
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /*
     * Returns the decomposition of uv as a UTF-8 flagged string, or undef
     * when there is none.  getCompat (ix == 1) uses dec_compat(), getCanon
     * dec_canonical(); Hangul syllables are decomposed arithmetically.
     */
    if (! Hangul_IsS(uv)) {
        rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (!rstr)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    } else {
        RETVAL = newSV(1);
        (void)SvPOK_only(RETVAL);
        sv_cat_decompHangul(RETVAL, uv);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
589