/*
 * Normalize.xs -- ext/Unicode/Normalize
 * Repository: p5sagit/p5-mst-13.2.git
 * Commit subject: "Upgrade to Unicode::Normalize 0.23."
 */

2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
5
6/* These 5 files are prepared by mkheader */
7#include "unfcmb.h"
8#include "unfcan.h"
9#include "unfcpt.h"
10#include "unfcmp.h"
11#include "unfexc.h"
12
/*
 * Fallback names for perls whose headers do not define the uvuni-style
 * functions (the original note guesses "Perl 5.6.1 ?"): map them onto
 * uv_to_utf8 / utf8_to_uv.
 */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8 uv_to_utf8
#endif /* uvuni_to_utf8 */

#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni utf8_to_uv
#endif /* utf8n_to_uvuni */
ac5ea531 22
/*
 * Upper bound used by the table lookups in this file: anything above
 * VALID_UTF_MAX is rejected by OVER_UTF_MAX() before a table is indexed.
 * NOTE(review): the original author's comment asks whether characters
 * above 0x10ffff really pass through unaffected and without complaint.
 */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
26
/* HANGUL_H */
/*
 * Constants and range tests for the arithmetic (de)composition of Hangul
 * syllables.  S = precomposed syllable, L = leading consonant, V = vowel,
 * T = trailing consonant.  Each *Final value equals *Base + *Count - 1,
 * and Hangul_NCount equals Hangul_VCount * Hangul_TCount.
 *
 * The predicates may evaluate their argument more than once, so do not
 * pass an expression with side effects.
 */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount 11172

#define Hangul_NCount 588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* u is a precomposed syllable */
#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* offset from SBase is a multiple of TCount; only meaningful when
   combined with Hangul_IsS(), as Hangul_IsLV() does */
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
/* u is a syllable without a trailing consonant */
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* Hangul_TBase itself is excluded on purpose: it corresponds to
   tindex 0, which the decomposition code treats as "no T" */
#define Hangul_IsT(u)  ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL_H */
53
54/* this is used for canonical ordering of combining characters (c.c.). */
55typedef struct {
56 U8 cc; /* combining class */
57 UV uv; /* codepoint */
58 STRLEN pos; /* position */
59} UNF_cc;
60
6c941e0c 61int compare_cc (const void *a, const void *b)
ac5ea531 62{
63 int ret_cc;
6c941e0c 64 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
8f118dcd 65 if (ret_cc)
66 return ret_cc;
6c941e0c 67
68 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
69 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
ac5ea531 70}
71
72U8* dec_canonical (UV uv)
73{
74 U8 ***plane, **row;
8f118dcd 75 if (OVER_UTF_MAX(uv))
76 return NULL;
ac5ea531 77 plane = (U8***)UNF_canon[uv >> 16];
8f118dcd 78 if (! plane)
79 return NULL;
ac5ea531 80 row = plane[(uv >> 8) & 0xff];
81 return row ? row[uv & 0xff] : NULL;
82}
83
84U8* dec_compat (UV uv)
85{
86 U8 ***plane, **row;
8f118dcd 87 if (OVER_UTF_MAX(uv))
88 return NULL;
ac5ea531 89 plane = (U8***)UNF_compat[uv >> 16];
8f118dcd 90 if (! plane)
91 return NULL;
ac5ea531 92 row = plane[(uv >> 8) & 0xff];
93 return row ? row[uv & 0xff] : NULL;
94}
95
2a204b45 96UV composite_uv (UV uv, UV uv2)
ac5ea531 97{
98 UNF_complist ***plane, **row, *cell, *i;
99
8f118dcd 100 if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
101 return 0;
ac5ea531 102
8f118dcd 103 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
ac5ea531 104 uv -= Hangul_LBase; /* lindex */
105 uv2 -= Hangul_VBase; /* vindex */
106 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
107 }
8f118dcd 108 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
ac5ea531 109 uv2 -= Hangul_TBase; /* tindex */
2a204b45 110 return(uv + uv2);
ac5ea531 111 }
112 plane = UNF_compos[uv >> 16];
8f118dcd 113 if (! plane)
114 return 0;
ac5ea531 115 row = plane[(uv >> 8) & 0xff];
8f118dcd 116 if (! row)
117 return 0;
ac5ea531 118 cell = row[uv & 0xff];
8f118dcd 119 if (! cell)
120 return 0;
121 for (i = cell; i->nextchar; i++) {
122 if (uv2 == i->nextchar)
123 return i->composite;
ac5ea531 124 }
125 return 0;
126}
127
128U8 getCombinClass (UV uv)
129{
130 U8 **plane, *row;
8f118dcd 131 if (OVER_UTF_MAX(uv))
132 return 0;
ac5ea531 133 plane = (U8**)UNF_combin[uv >> 16];
8f118dcd 134 if (! plane)
135 return 0;
ac5ea531 136 row = plane[(uv >> 8) & 0xff];
137 return row ? row[uv & 0xff] : 0;
138}
139
140void sv_cat_decompHangul (SV* sv, UV uv)
141{
142 UV sindex, lindex, vindex, tindex;
2a204b45 143 U8 *t, tmp[3 * UTF8_MAXLEN + 1];
ac5ea531 144
8f118dcd 145 if (! Hangul_IsS(uv))
146 return;
ac5ea531 147
148 sindex = uv - Hangul_SBase;
149 lindex = sindex / Hangul_NCount;
150 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
151 tindex = sindex % Hangul_TCount;
152
2a204b45 153 t = tmp;
ac5ea531 154 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
155 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
8f118dcd 156 if (tindex)
157 t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
ac5ea531 158 *t = '\0';
2a204b45 159 sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
ac5ea531 160}
161
MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize
163
SV*
decompose(arg, compat = &PL_sv_no)
    SV * arg
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    UV uv;
    SV *src, *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    bool iscompat;
  CODE:
    /*
     * Returns a new UTF-8 flagged string in which every character of arg
     * is replaced by its decomposition: Hangul syllables arithmetically,
     * other characters through dec_compat() (when compat is true) or
     * dec_canonical().  Characters without a mapping are copied as-is.
     * No reordering is done here.
     *
     * arg itself is never modified; when it is not UTF-8 flagged a
     * mortal copy is upgraded and used instead.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }
    iscompat = SvTRUE(compat);

    /* start from a real, NUL-terminated empty string, so that the
       result is well formed even when nothing gets appended */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;
    for (p = s; p < e;) {
        /* NOTE(review): retlen is used unchecked below; confirm that
           utf8n_to_uvuni cannot report a length that runs past e for
           malformed input */
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        p += retlen;
        if (Hangul_IsS(uv))
            sv_cat_decompHangul(dst, uv);
        else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else /* no mapping: copy the original bytes */
                sv_catpvn(dst, (char *)p - retlen, retlen);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
206
207
208
SV*
reorder(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src, *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv;
    UNF_cc * stk_cc;
  CODE:
    /*
     * Returns a UTF-8 flagged copy of arg in which every run of two or
     * more consecutive characters with a non-zero combining class is
     * sorted by combining class; characters of equal class keep their
     * relative order (see compare_cc).  arg itself is not modified.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    sv_setpvn(dst,(const char*)s,srclen);
    SvUTF8_on(dst);

    stk_cc_max = 10; /* enough as an initial value? */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    /* from here on the copy is scanned and rewritten in place */
    d = (U8*)SvPV(dst,dstlen);
    e = d + dstlen;

    for (p = d; p < e;) {
        U8 *cc_in;
        STRLEN cc_iter, cc_pos;

        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);
        p += retlen;

        /* a class-0 character, or a combining character that ends the
           string, cannot start a run that needs sorting */
        if (! (curCC && p < e))
            continue;
        else
            cc_in = p - retlen;	/* first byte of the run */

        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;

        /* collect the rest of the run, up to the next class-0 character */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
            curCC = getCombinClass(uv);
            if (!curCC)
                break;
            p += retlen;
            cc_pos++;
            if (stk_cc_max <= cc_pos) { /* extend if need */
                /* cc_pos >= 10 here, so doubling always makes room
                   for index cc_pos */
                stk_cc_max = cc_pos * 2;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[cc_pos].cc  = curCC;
            stk_cc[cc_pos].uv  = uv;
            stk_cc[cc_pos].pos = cc_pos;
        }

        /* only one c.c. in the run starting at cc_in, no need of reordering */
        if (!cc_pos)
            continue;

        qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);

        /* write the sorted code points back over the run; this relies
           on re-encoding the same code points taking the same number of
           bytes, so that p ends where the run ended */
        p = cc_in;
        for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
            p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
        }
    }
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL
290
291
292
SV*
compose(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV  *src, *dst, *tmp;
    U8  *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    /* uv starts at 0 because it is copied into uvS at the end of every
       outer iteration, including ones where the inner loop never ran */
    UV uv = 0, uvS, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    /*
     * Returns a new UTF-8 flagged string built from arg by composition.
     *
     * Combining characters before the first starter (a character whose
     * combining class is 0) are copied through.  After that, each
     * starter uvS is tried against the characters that follow it: when
     * composite_uv() gives a composite that is not an exclusion and the
     * character is not blocked, the composite replaces uvS; characters
     * that do not compose are held in tmp and written out after the
     * (possibly composed) starter.  arg itself is not modified.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);	/* write cursor into dst */

    /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
        if (beginning) {
            uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
            p += retlen;
            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                /* same class as the previous uncomposed character:
                   kept as-is, no composition is attempted */
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);

                if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
                    STRLEN leftcur, rightcur, dstcur;
                    leftcur  = UNISKIP(uvComp);
                    rightcur = UNISKIP(uvS) + UNISKIP(uv);

                    /* the composite needs more bytes than its two
                       parts did: enlarge dst and re-derive the cursor,
                       since the buffer may have moved */
                    if (leftcur > rightcur) {
                        dstcur = d - (U8*)SvPVX(dst);
                        dstlen += leftcur - rightcur;
                        d = (U8*)SvGROW(dst,dstlen) + dstcur;
                    }
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    /* uv is a starter with more input after it: it
                       becomes the next uvS below */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        tmplen = t - tmp_start;
        if (tmplen) { /* uncomposed combining char */
            t = (U8*)SvPVX(tmp);
            while (tmplen--)
                *d++ = *t++;
        }
        uvS = uv;
    } /* for */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 386
387
388
void
checkNFD(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
  PPCODE:
    /*
     * Returns NO as soon as a character is found that
     *   - has a non-zero combining class lower than that of the
     *     character before it, or
     *   - is a Hangul syllable or has a decomposition mapping
     *     (dec_compat() when called as checkNFKD, dec_canonical()
     *     when called as checkNFD);
     * returns YES when the whole string passes.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) /* canonical ordering violated */
            XSRETURN_NO;
        if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
            XSRETURN_NO;
        preCC = curCC;
    }
    XSRETURN_YES;
422
423
424
void
checkNFC(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool isMAYBE;
  PPCODE:
    /*
     * Three-way check.  Returns
     *   NO    as soon as a character violates canonical ordering, is an
     *         exclusion / singleton / non-starter decomposition, or
     *         (checkNFKC only) has a compatibility mapping that differs
     *         from its canonical one;
     *   undef when no such character was found but at least one
     *         character satisfies isComp2nd();
     *   YES   otherwise.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);

        if (preCC > curCC && curCC != 0) /* canonical ordering violated */
            XSRETURN_NO;

        /* get NFC/NFKC property */
        if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
            ; /* YES */
        else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
            XSRETURN_NO;
        else if (isComp2nd(uv))
            isMAYBE = TRUE;
        else if (ix) {
            char *canon, *compat;
            /* NFKC_NO when having compatibility mapping. */
            canon  = (char *) dec_canonical(uv);
            compat = (char *) dec_compat(uv);
            if (compat && !(canon && strEQ(canon, compat)))
                XSRETURN_NO;
        } /* end of get NFC/NFKC property */

        preCC = curCC;
    }
    if (isMAYBE)
        XSRETURN_UNDEF;
    else
        XSRETURN_YES;
479
480
481
ac5ea531 482U8
483getCombinClass(uv)
484 UV uv
8f118dcd 485 PROTOTYPE: $
ac5ea531 486
487bool
2a204b45 488isExclusion(uv)
ac5ea531 489 UV uv
8f118dcd 490 PROTOTYPE: $
491
492bool
493isSingleton(uv)
494 UV uv
495 PROTOTYPE: $
496
497bool
498isNonStDecomp(uv)
499 UV uv
500 PROTOTYPE: $
501
502bool
503isComp2nd(uv)
504 UV uv
505 PROTOTYPE: $
506 ALIAS:
507 isNFC_MAYBE = 1
508 isNFKC_MAYBE = 2
509
510
511
void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PPCODE:
    /*
     * Returns YES when uv is a Hangul syllable or has a decomposition
     * mapping (dec_compat() when called as isNFKD_NO, dec_canonical()
     * when called as isNFD_NO); NO otherwise.
     */
    if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
        XSRETURN_YES; /* NFD_NO or NFKD_NO */
    else
        XSRETURN_NO;
523
524
525
void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PPCODE:
    /*
     * Returns YES when uv is an exclusion, a singleton or a non-starter
     * decomposition.  When called as isNFKC_NO it also returns YES for
     * a uv whose compatibility mapping exists and is not the same
     * string as its canonical mapping.  Returns NO otherwise.
     */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
        XSRETURN_YES; /* NFC_NO or NFKC_NO */
    else if (ix) {
        char *canon, *compat;
        canon  = (char *) dec_canonical(uv);
        compat = (char *) dec_compat(uv);
        /* same test as the NFKC branch of checkNFC */
        if (compat && !(canon && strEQ(canon, compat)))
            XSRETURN_YES; /* NFC_NO or NFKC_NO */
        else
            XSRETURN_NO;
    }
    else
        XSRETURN_NO;
547
548
ac5ea531 549
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /*
     * Returns the result of composite_uv(uv, uv2) as an unsigned
     * integer scalar, or undef when composite_uv() returns 0.
     */
    composite = composite_uv(uv, uv2);
    RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 562
8f118dcd 563
564
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /*
     * Returns the decomposition of uv as a UTF-8 flagged string:
     * computed by sv_cat_decompHangul() for Hangul syllables, otherwise
     * taken from dec_compat() (when called as getCompat) or
     * dec_canonical() (when called as getCanon).  Returns undef when
     * there is no mapping.
     */
    if (Hangul_IsS(uv)) {
        SV * dst;
        dst = newSV(1);
        (void)SvPOK_only(dst);
        sv_cat_decompHangul(dst, uv);
        RETVAL = dst;
    } else {
        rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (!rstr)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
589