Upgrade to Encode 1.92.
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.xs
CommitLineData
ac5ea531 1
2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
5
6/* These 5 files are prepared by mkheader */
7#include "unfcmb.h"
8#include "unfcan.h"
9#include "unfcpt.h"
10#include "unfcmp.h"
11#include "unfexc.h"
12
/*
 * Compatibility aliases: if this perl does not already define
 * uvuni_to_utf8 / utf8n_to_uvuni, map them onto uv_to_utf8 / utf8_to_uv.
 * (Believed to be needed for Perl 5.6.1 -- unconfirmed.)
 */
#if !defined(uvuni_to_utf8)
#  define uvuni_to_utf8   uv_to_utf8
#endif

#if !defined(utf8n_to_uvuni)
#  define utf8n_to_uvuni  utf8_to_uv
#endif

/*
 * Upper bound used by the table lookups in this file: code points above
 * 0x10ffff are not looked up and are left unaffected without complaint.
 */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)
26
/* HANGUL_H: constants and range tests for algorithmic Hangul handling */

/* precomposed syllables */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount 11172

/* syllables per leading consonant */
#define Hangul_NCount 588

/* leading consonants (L) */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

/* vowels (V) */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

/* trailing consonants (T); TBase itself is one below the first real T */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* u is a precomposed syllable */
#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && (Hangul_SFinal >= (u)))
/* offset of u from SBase is a multiple of TCount (no trailing consonant) */
#define Hangul_IsN(u)  (0 == ((u) - Hangul_SBase) % Hangul_TCount)
/* u is a syllable made of L and V only */
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && (Hangul_LFinal >= (u)))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && (Hangul_VFinal >= (u)))
/* strictly greater than TBase: TBase is excluded */
#define Hangul_IsT(u)  (((u) > Hangul_TBase) && (Hangul_TFinal >= (u)))
/* HANGUL_H */
53
54/* this is used for canonical ordering of combining characters (c.c.). */
55typedef struct {
56 U8 cc; /* combining class */
57 UV uv; /* codepoint */
58 STRLEN pos; /* position */
59} UNF_cc;
60
6c941e0c 61int compare_cc (const void *a, const void *b)
ac5ea531 62{
63 int ret_cc;
6c941e0c 64 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
8f118dcd 65 if (ret_cc)
66 return ret_cc;
6c941e0c 67
68 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
69 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
ac5ea531 70}
71
72U8* dec_canonical (UV uv)
73{
74 U8 ***plane, **row;
8f118dcd 75 if (OVER_UTF_MAX(uv))
76 return NULL;
ac5ea531 77 plane = (U8***)UNF_canon[uv >> 16];
8f118dcd 78 if (! plane)
79 return NULL;
ac5ea531 80 row = plane[(uv >> 8) & 0xff];
81 return row ? row[uv & 0xff] : NULL;
82}
83
84U8* dec_compat (UV uv)
85{
86 U8 ***plane, **row;
8f118dcd 87 if (OVER_UTF_MAX(uv))
88 return NULL;
ac5ea531 89 plane = (U8***)UNF_compat[uv >> 16];
8f118dcd 90 if (! plane)
91 return NULL;
ac5ea531 92 row = plane[(uv >> 8) & 0xff];
93 return row ? row[uv & 0xff] : NULL;
94}
95
2a204b45 96UV composite_uv (UV uv, UV uv2)
ac5ea531 97{
98 UNF_complist ***plane, **row, *cell, *i;
99
8f118dcd 100 if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
101 return 0;
ac5ea531 102
8f118dcd 103 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
ac5ea531 104 uv -= Hangul_LBase; /* lindex */
105 uv2 -= Hangul_VBase; /* vindex */
106 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
107 }
8f118dcd 108 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
ac5ea531 109 uv2 -= Hangul_TBase; /* tindex */
2a204b45 110 return(uv + uv2);
ac5ea531 111 }
112 plane = UNF_compos[uv >> 16];
8f118dcd 113 if (! plane)
114 return 0;
ac5ea531 115 row = plane[(uv >> 8) & 0xff];
8f118dcd 116 if (! row)
117 return 0;
ac5ea531 118 cell = row[uv & 0xff];
8f118dcd 119 if (! cell)
120 return 0;
121 for (i = cell; i->nextchar; i++) {
122 if (uv2 == i->nextchar)
123 return i->composite;
ac5ea531 124 }
125 return 0;
126}
127
128U8 getCombinClass (UV uv)
129{
130 U8 **plane, *row;
8f118dcd 131 if (OVER_UTF_MAX(uv))
132 return 0;
ac5ea531 133 plane = (U8**)UNF_combin[uv >> 16];
8f118dcd 134 if (! plane)
135 return 0;
ac5ea531 136 row = plane[(uv >> 8) & 0xff];
137 return row ? row[uv & 0xff] : 0;
138}
139
140void sv_cat_decompHangul (SV* sv, UV uv)
141{
142 UV sindex, lindex, vindex, tindex;
2a204b45 143 U8 *t, tmp[3 * UTF8_MAXLEN + 1];
ac5ea531 144
8f118dcd 145 if (! Hangul_IsS(uv))
146 return;
ac5ea531 147
148 sindex = uv - Hangul_SBase;
149 lindex = sindex / Hangul_NCount;
150 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
151 tindex = sindex % Hangul_TCount;
152
2a204b45 153 t = tmp;
ac5ea531 154 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
155 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
8f118dcd 156 if (tindex)
157 t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
ac5ea531 158 *t = '\0';
2a204b45 159 sv_catpvn(sv, (char *)tmp, strlen((char *)tmp));
ac5ea531 160}
161
162MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
163
SV*
decompose(arg, compat = &PL_sv_no)
    SV * arg
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    UV uv;
    SV *src, *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    bool iscompat;
  CODE:
    /*
     * Returns a new UTF-8 string in which every character of arg is
     * replaced by its decomposition: Hangul syllables are decomposed
     * arithmetically, other characters via dec_compat() when compat is
     * true and dec_canonical() otherwise.  Characters without a table
     * entry are copied unchanged.  No reordering is done here.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        /* work on an upgraded mortal copy so arg itself is not modified */
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }
    iscompat = SvTRUE(compat);

    /*
     * Start from a real empty string: newSV() leaves its buffer
     * uninitialised, so an empty input would otherwise return a POK
     * scalar whose buffer is not NUL-terminated.
     */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;
    for (p = s; p < e;) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        p += retlen;
        if (Hangul_IsS(uv))
            sv_cat_decompHangul(dst, uv);
        else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else
                /* no mapping: copy the original bytes of this character */
                sv_catpvn(dst, (char *)p - retlen, retlen);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
206
207
208
SV*
reorder(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src, *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv;
    UNF_cc * stk_cc;
  CODE:
    /*
     * Returns a copy of arg (as UTF-8) in which every run of two or more
     * characters with a non-zero combining class has been sorted by
     * combining class, keeping the original order within one class.
     * Each run is re-encoded in place inside the copy.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    sv_setpvn(dst,(const char*)s,srclen);
    SvUTF8_on(dst);

    stk_cc_max = 10; /* initial capacity; grown on demand below */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    d = (U8*)SvPV(dst,dstlen);
    e = d + dstlen;

    for (p = d; p < e;) {
        U8 *cc_in;
        STRLEN cc_iter, cc_pos;

        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);
        p += retlen;

        /* a starter, or a combining char at the very end: nothing to do */
        if (! (curCC && p < e))
            continue;
        else
            cc_in = p - retlen;   /* first byte of this run */

        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;

        /* collect the rest of the run, up to the next starter */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
            curCC = getCombinClass(uv);
            if (!curCC)
                break;
            p += retlen;
            cc_pos++;
            if (stk_cc_max <= cc_pos) {
                /* double the capacity instead of growing by one entry */
                stk_cc_max = 2 * (cc_pos + 1);
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[cc_pos].cc  = curCC;
            stk_cc[cc_pos].uv  = uv;
            stk_cc[cc_pos].pos = cc_pos;
        }

        /* only one combining char in the run: no need of reordering */
        if (!cc_pos)
            continue;

        qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);

        /* write the sorted run back over the original bytes */
        p = cc_in;
        for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
            p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
        }
    }
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL
290
291
292
SV*
compose(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV  *src, *dst, *tmp;
    U8  *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    /* uv starts at 0 so the trailing "uvS = uv" never reads garbage */
    UV uv = 0, uvS = 0, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    /*
     * Returns a new UTF-8 string in which each starter of arg is combined,
     * via composite_uv(), with the following characters that compose with
     * it and are not blocked.  Characters that could not be composed are
     * emitted after their starter in their original order.
     */
    if (SvUTF8(arg)) {
        src = arg;
    } else {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
        if (beginning) {
            uvS = utf8n_to_uvuni(p, e - p, &retlen, 0);
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
            p += retlen;
            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                /* blocked by a previous char of the same combining class */
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);

                if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
                    STRLEN leftcur, rightcur, dstcur;
                    leftcur  = UNISKIP(uvComp);
                    rightcur = UNISKIP(uvS) + UNISKIP(uv);

                    /* composite is longer than its parts: grow dst */
                    if (leftcur > rightcur) {
                        dstcur = d - (U8*)SvPVX(dst);
                        dstlen += leftcur - rightcur;
                        d = (U8*)SvGROW(dst,dstlen) + dstcur;
                    }
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        tmplen = t - tmp_start;
        if (tmplen) { /* uncomposed combining char */
            t = (U8*)SvPVX(tmp);
            while (tmplen--)
                *d++ = *t++;
        }
        uvS = uv; /* the starter that ended the inner loop, if any */
    } /* for */

    /* a POK buffer must be NUL-terminated; dst always has room for it */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 385
386
387
void
checkNFD(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
  PPCODE:
    /*
     * Quick check: returns NO as soon as a character is found that is out
     * of canonical order, is a Hangul syllable, or has a decomposition
     * (compatibility table when called as checkNFKD, canonical table
     * otherwise); returns YES if the whole string passes.
     */
    if (! SvUTF8(arg)) {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    } else {
        src = arg;
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);

        /* canonical ordering violated */
        if (curCC != 0 && curCC < preCC)
            XSRETURN_NO;

        /* decomposable: Hangul syllable ... */
        if (Hangul_IsS(uv))
            XSRETURN_NO;
        /* ... or a character with a table entry */
        if (ix ? dec_compat(uv) : dec_canonical(uv))
            XSRETURN_NO;

        preCC = curCC;
        p += retlen;
    }
    XSRETURN_YES;
421
422
423
void
checkNFC(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool isMAYBE;
  PPCODE:
    /*
     * Quick check: returns NO on the first character that rules the form
     * out, undef (MAYBE) if none did but some character satisfies
     * isComp2nd(), and YES otherwise.  The checkNFKC alias additionally
     * rejects characters whose compatibility mapping differs from their
     * canonical one.
     */
    if (! SvUTF8(arg)) {
        src = sv_mortalcopy(arg);
        sv_utf8_upgrade(src);
    } else {
        src = arg;
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    isMAYBE = FALSE;
    preCC = 0;
    p = s;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
        curCC = getCombinClass(uv);

        /* canonical ordering violated */
        if (curCC != 0 && curCC < preCC)
            XSRETURN_NO;

        /* get NFC/NFKC property;
           Hangul syllables are canonical composites: always YES */
        if (! Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
                XSRETURN_NO;

            if (isComp2nd(uv)) {
                isMAYBE = TRUE;
            } else if (ix) {
                char *canon, *compat;
                /* NFKC_NO when having compatibility mapping. */
                canon  = (char *) dec_canonical(uv);
                compat = (char *) dec_compat(uv);
                if (compat && (!canon || strNE(canon, compat)))
                    XSRETURN_NO;
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
        p += retlen;
    }

    if (isMAYBE)
        XSRETURN_UNDEF;
    XSRETURN_YES;
478
479
480
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-level entry point: forwards to the C function of the same name */
    RETVAL = getCombinClass(uv);
  OUTPUT:
    RETVAL
ac5ea531 485
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-level entry point: forwards to the C-level isExclusion() */
    RETVAL = isExclusion(uv);
  OUTPUT:
    RETVAL
490
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-level entry point: forwards to the C-level isSingleton() */
    RETVAL = isSingleton(uv);
  OUTPUT:
    RETVAL
495
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-level entry point: forwards to the C-level isNonStDecomp() */
    RETVAL = isNonStDecomp(uv);
  OUTPUT:
    RETVAL
500
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
  CODE:
    /* All three names give the same answer: the alias index is not used */
    RETVAL = isComp2nd(uv);
  OUTPUT:
    RETVAL
508
509
510
void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PPCODE:
    /*
     * YES when uv is decomposable: a Hangul syllable, or a character with
     * an entry in the compatibility table (isNFKD_NO) or the canonical
     * table (isNFD_NO).  NO otherwise.
     */
    if (Hangul_IsS(uv))
        XSRETURN_YES; /* NFD_NO or NFKD_NO */
    if (ix ? dec_compat(uv) : dec_canonical(uv))
        XSRETURN_YES; /* NFD_NO or NFKD_NO */
    XSRETURN_NO;
522
523
524
void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PPCODE:
    /*
     * YES when uv is an exclusion, a singleton or a non-starter
     * decomposition.  When called as isNFKC_NO, also YES when uv has a
     * compatibility mapping that differs from (or lacks) a canonical one.
     * NO otherwise.
     */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
        XSRETURN_YES; /* NFC_NO or NFKC_NO */

    if (ix) {
        char *canon, *compat;
        canon  = (char *) dec_canonical(uv);
        compat = (char *) dec_compat(uv);
        if (compat && !(canon && strEQ(canon, compat)))
            XSRETURN_YES; /* NFC_NO or NFKC_NO */
    }
    XSRETURN_NO;
546
547
ac5ea531 548
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /* The composed code point for the pair, or undef when there is none */
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 561
8f118dcd 562
563
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /*
     * Returns the decomposition of a single code point as a UTF-8 string:
     * arithmetic for Hangul syllables, otherwise the compatibility table
     * (getCompat) or the canonical table (getCanon).  Returns undef when
     * the table has no entry.
     */
    if (! Hangul_IsS(uv)) {
        rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (rstr == NULL)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    } else {
        SV * dst;
        dst = newSV(1);
        (void)SvPOK_only(dst);
        sv_cat_decompHangul(dst, uv);
        RETVAL = dst;
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
588