/*
 * ext/Unicode/Normalize/Normalize.xs
 * Repository: p5sagit/p5-mst-13.2.git
 * Commit subject: Upgrade to Unicode::Normalize 1.02
 */

2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
5
6/* These 5 files are prepared by mkheader */
7#include "unfcmb.h"
8#include "unfcan.h"
9#include "unfcpt.h"
10#include "unfcmp.h"
11#include "unfexc.h"
12
/* Fall back to the older function name when uvuni_to_utf8 is not
   available as a macro (Perl 5.6.1 ?). */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8   uv_to_utf8
#endif /* uvuni_to_utf8 */

/* Fall back to the older function name when utf8n_to_uvuni is not
   available as a macro (Perl 5.6.1 ?). */
#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni  utf8_to_uv
#endif /* utf8n_to_uvuni */

/* Flags passed to utf8n_to_uvuni() throughout this file.
   UTF8_ALLOW_BOM is used before Perl 5.8.0. */
#ifdef UTF8_ALLOW_BOM
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
#else
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#endif

/* if utf8n_to_uvuni() sets retlen to 0 (?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"

/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC;
   according to Versioning and Stability in UAX#15, no new composition
   should come in future. */
#define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source"

/* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */
#define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough"

/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))

/* Size of the on-stack array for combining characters (initial capacity)
   and the increment used when that capacity is exceeded. */
#define CC_SEQ_SIZE (10)
#define CC_SEQ_STEP (5)

/* HANGUL begin */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

#define Hangul_NCount    588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

/* Range predicates.  Hangul_IsN() is only meaningful for a value already
   known to satisfy Hangul_IsS(); Hangul_IsT() deliberately excludes
   Hangul_TBase itself (the comparison is strict). */
#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL end */
ac5ea531 79
80/* this is used for canonical ordering of combining characters (c.c.). */
81typedef struct {
82 U8 cc; /* combining class */
83 UV uv; /* codepoint */
84 STRLEN pos; /* position */
85} UNF_cc;
86
fe067ad9 87static int compare_cc(const void *a, const void *b)
ac5ea531 88{
89 int ret_cc;
6c941e0c 90 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
8f118dcd 91 if (ret_cc)
92 return ret_cc;
6c941e0c 93
94 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
95 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
ac5ea531 96}
97
fe067ad9 98static U8* dec_canonical(UV uv)
ac5ea531 99{
100 U8 ***plane, **row;
8f118dcd 101 if (OVER_UTF_MAX(uv))
102 return NULL;
ac5ea531 103 plane = (U8***)UNF_canon[uv >> 16];
8f118dcd 104 if (! plane)
105 return NULL;
ac5ea531 106 row = plane[(uv >> 8) & 0xff];
107 return row ? row[uv & 0xff] : NULL;
108}
109
fe067ad9 110static U8* dec_compat(UV uv)
ac5ea531 111{
112 U8 ***plane, **row;
8f118dcd 113 if (OVER_UTF_MAX(uv))
114 return NULL;
ac5ea531 115 plane = (U8***)UNF_compat[uv >> 16];
8f118dcd 116 if (! plane)
117 return NULL;
ac5ea531 118 row = plane[(uv >> 8) & 0xff];
119 return row ? row[uv & 0xff] : NULL;
120}
121
fe067ad9 122static UV composite_uv(UV uv, UV uv2)
ac5ea531 123{
124 UNF_complist ***plane, **row, *cell, *i;
125
fe067ad9 126 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
8f118dcd 127 return 0;
ac5ea531 128
8f118dcd 129 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
fe067ad9 130 UV lindex = uv - Hangul_LBase;
131 UV vindex = uv2 - Hangul_VBase;
132 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
133 Hangul_TCount);
ac5ea531 134 }
8f118dcd 135 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
fe067ad9 136 UV tindex = uv2 - Hangul_TBase;
137 return(uv + tindex);
ac5ea531 138 }
139 plane = UNF_compos[uv >> 16];
8f118dcd 140 if (! plane)
141 return 0;
ac5ea531 142 row = plane[(uv >> 8) & 0xff];
8f118dcd 143 if (! row)
144 return 0;
ac5ea531 145 cell = row[uv & 0xff];
8f118dcd 146 if (! cell)
147 return 0;
148 for (i = cell; i->nextchar; i++) {
149 if (uv2 == i->nextchar)
150 return i->composite;
ac5ea531 151 }
152 return 0;
153}
154
fe067ad9 155static U8 getCombinClass(UV uv)
ac5ea531 156{
157 U8 **plane, *row;
8f118dcd 158 if (OVER_UTF_MAX(uv))
159 return 0;
ac5ea531 160 plane = (U8**)UNF_combin[uv >> 16];
8f118dcd 161 if (! plane)
162 return 0;
ac5ea531 163 row = plane[(uv >> 8) & 0xff];
164 return row ? row[uv & 0xff] : 0;
165}
166
fe067ad9 167static U8* pv_cat_decompHangul(U8* d, UV uv)
ac5ea531 168{
fe067ad9 169 UV sindex = uv - Hangul_SBase;
170 UV lindex = sindex / Hangul_NCount;
171 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
172 UV tindex = sindex % Hangul_TCount;
ac5ea531 173
8f118dcd 174 if (! Hangul_IsS(uv))
fe067ad9 175 return d;
ac5ea531 176
fe067ad9 177 d = uvuni_to_utf8(d, (lindex + Hangul_LBase));
178 d = uvuni_to_utf8(d, (vindex + Hangul_VBase));
8f118dcd 179 if (tindex)
fe067ad9 180 d = uvuni_to_utf8(d, (tindex + Hangul_TBase));
181 return d;
ac5ea531 182}
183
39f4556f 184static char* sv_2pvunicode(SV *sv, STRLEN *lp)
a092bcfd 185{
186 char *s;
187 STRLEN len;
39f4556f 188 s = SvPV(sv,len);
a092bcfd 189 if (!SvUTF8(sv)) {
39f4556f 190 SV* tmpsv = sv_2mortal(newSVpvn(s, len));
a092bcfd 191 if (!SvPOK(tmpsv))
39f4556f 192 s = SvPV_force(tmpsv,len);
a092bcfd 193 sv_utf8_upgrade(tmpsv);
39f4556f 194 s = SvPV(tmpsv,len);
a092bcfd 195 }
fe067ad9 196 if (lp)
197 *lp = len;
a092bcfd 198 return s;
199}
200
fe067ad9 201static
202U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
203{
204 U8* p = s;
205 U8* e = s + slen;
206 U8* dstart = *dp;
207 U8* d = dstart;
208
209 while (p < e) {
210 STRLEN retlen;
211 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 212 if (!retlen)
fe067ad9 213 croak(ErrRetlenIsZero, "decompose");
214 p += retlen;
215
216 if (Hangul_IsS(uv)) {
217 STRLEN cur = d - dstart;
82e740b6 218
fe067ad9 219 if (dlen < cur + UTF8_MAXLEN * 3) {
220 dlen += UTF8_MAXLEN * 3;
221 Renew(dstart, dlen+1, U8);
222 d = dstart + cur;
223 }
224 d = pv_cat_decompHangul(d, uv);
225 }
ac5ea531 226 else {
fe067ad9 227 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
228
229 if (r) {
230 STRLEN len = (STRLEN)strlen((char *)r);
231 STRLEN cur = d - dstart;
232 if (dlen < cur + len) {
233 dlen += len;
234 Renew(dstart, dlen+1, U8);
235 d = dstart + cur;
236 }
237 while (len--)
238 *d++ = *r++;
239 }
240 else {
241 STRLEN cur = d - dstart;
242
243 if (dlen < cur + UTF8_MAXLEN) {
244 dlen += UTF8_MAXLEN;
245 Renew(dstart, dlen+1, U8);
246 d = dstart + cur;
247 }
248 d = uvuni_to_utf8(d, uv);
249 }
ac5ea531 250 }
251 }
fe067ad9 252 *dp = dstart;
253 return d;
254}
ac5ea531 255
fe067ad9 256static
257U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
258{
259 U8* p = s;
260 U8* e = s + slen;
261 U8* dend = d + dlen;
262
263 UNF_cc seq_ary[CC_SEQ_SIZE];
264 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
265 UNF_cc* seq_ext = NULL; /* extend if need */
266 STRLEN seq_max = CC_SEQ_SIZE;
267 STRLEN cc_pos = 0;
268
269 if (dlen < slen || dlen < slen + UTF8_MAXLEN)
270 croak(ErrTargetNotEnough, "reorder");
271 dend -= UTF8_MAXLEN; /* safety */
272
273 while (p < e) {
274 U8 curCC;
275 STRLEN retlen;
276 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
277 if (!retlen)
278 croak(ErrRetlenIsZero, "reorder");
279 p += retlen;
ac5ea531 280
fe067ad9 281 curCC = getCombinClass(uv);
ac5ea531 282
fe067ad9 283 if (curCC != 0) {
284 if (seq_max < cc_pos + 1) { /* extend if need */
285 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
286 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
287 STRLEN i;
288 New(0, seq_ext, seq_max, UNF_cc);
289 for (i = 0; i < cc_pos; i++)
290 seq_ext[i] = seq_ary[i];
291 }
292 else {
293 Renew(seq_ext, seq_max, UNF_cc);
294 }
39f4556f 295 seq_ptr = seq_ext; /* use seq_ext from now */
fe067ad9 296 }
a092bcfd 297
fe067ad9 298 seq_ptr[cc_pos].cc = curCC;
299 seq_ptr[cc_pos].uv = uv;
300 seq_ptr[cc_pos].pos = cc_pos;
301 ++cc_pos;
ac5ea531 302
fe067ad9 303 if (p < e)
304 continue;
305 }
ac5ea531 306
fe067ad9 307 if (cc_pos) {
308 STRLEN i;
309
310 if (cc_pos > 1) /* reordered if there are two c.c.'s */
311 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
312
313 for (i = 0; i < cc_pos; i++) {
314 d = uvuni_to_utf8(d, seq_ptr[i].uv);
315 if (dend < d) /* real end is dend + UTF8_MAXLEN */
316 croak(ErrLongerThanSrc, "reorder");
317 }
318 cc_pos = 0;
319 }
2a204b45 320
e524f5b2 321 if (curCC == 0) {
322 d = uvuni_to_utf8(d, uv);
fe067ad9 323 if (dend < d) /* real end is dend + UTF8_MAXLEN */
324 croak(ErrLongerThanSrc, "reorder");
e524f5b2 325 }
fe067ad9 326 }
327 if (seq_ext)
328 Safefree(seq_ext);
329 return d;
330}
ac5ea531 331
fe067ad9 332static
333U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
334{
335 U8* p = s;
336 U8* e = s + slen;
337 U8* dend = d + dlen;
338
39f4556f 339 UV uvS = 0; /* code point of the starter */
fe067ad9 340 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
341 U8 preCC = 0;
342
343 UV seq_ary[CC_SEQ_SIZE];
344 UV* seq_ptr = seq_ary; /* use array at the beginning */
345 UV* seq_ext = NULL; /* extend if need */
346 STRLEN seq_max = CC_SEQ_SIZE;
347 STRLEN cc_pos = 0;
348
349 if (dlen < slen || dlen < slen + UTF8_MAXLEN)
350 croak(ErrTargetNotEnough, "compose");
351 dend -= UTF8_MAXLEN; /* safety */
352
353 while (p < e) {
354 U8 curCC;
355 STRLEN retlen;
356 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
357 if (!retlen)
358 croak(ErrRetlenIsZero, "compose");
359 p += retlen;
ac5ea531 360
fe067ad9 361 curCC = getCombinClass(uv);
82e740b6 362
fe067ad9 363 if (!valid_uvS) {
e524f5b2 364 if (curCC == 0) {
fe067ad9 365 uvS = uv; /* the first Starter is found */
366 valid_uvS = TRUE;
367 if (p < e)
368 continue;
e524f5b2 369 }
fe067ad9 370 else {
371 d = uvuni_to_utf8(d, uv);
372 if (dend < d) /* real end is dend + UTF8_MAXLEN */
373 croak(ErrLongerThanSrc, "compose");
374 continue;
375 }
376 }
377 else {
378 bool composed;
379
380 /* blocked */
381 if (iscontig && cc_pos || /* discontiguous combination */
382 curCC != 0 && preCC == curCC || /* blocked by same CC */
383 preCC > curCC) /* blocked by higher CC: revised D2 */
384 composed = FALSE;
385
386 /* not blocked:
387 iscontig && cc_pos == 0 -- contiguous combination
388 curCC == 0 && preCC == 0 -- starter + starter
389 curCC != 0 && preCC < curCC -- lower CC */
390 else {
391 /* try composition */
392 UV uvComp = composite_uv(uvS, uv);
393
394 if (uvComp && !isExclusion(uvComp)) {
395 uvS = uvComp;
396 composed = TRUE;
82e740b6 397
fe067ad9 398 /* preCC should not be changed to curCC */
399 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
400 if (p < e)
401 continue;
402 }
403 else
404 composed = FALSE;
405 }
406
407 if (!composed) {
408 preCC = curCC;
409 if (curCC != 0 || !(p < e)) {
410 if (seq_max < cc_pos + 1) { /* extend if need */
411 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
412 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
413 New(0, seq_ext, seq_max, UV);
414 Copy(seq_ary, seq_ext, cc_pos, UV);
415 }
416 else {
417 Renew(seq_ext, seq_max, UV);
418 }
2b8d773d 419 seq_ptr = seq_ext; /* use seq_ext from now */
fe067ad9 420 }
421 seq_ptr[cc_pos] = uv;
422 ++cc_pos;
423 }
424 if (curCC != 0 && p < e)
425 continue;
ac5ea531 426 }
ac5ea531 427 }
428
fe067ad9 429 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
430 if (dend < d) /* real end is dend + UTF8_MAXLEN */
431 croak(ErrLongerThanSrc, "compose");
432
e524f5b2 433 if (cc_pos) {
fe067ad9 434 STRLEN i;
ac5ea531 435
fe067ad9 436 for (i = 0; i < cc_pos; i++) {
437 d = uvuni_to_utf8(d, seq_ptr[i]);
438 if (dend < d) /* real end is dend + UTF8_MAXLEN */
439 croak(ErrLongerThanSrc, "compose");
440 }
441 cc_pos = 0;
ac5ea531 442 }
fe067ad9 443
444 uvS = uv;
ac5ea531 445 }
fe067ad9 446 if (seq_ext)
447 Safefree(seq_ext);
448 return d;
449}
450
MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize

# decompose(src [, compat]) returns a new UTF-8 flagged string holding the
# output of pv_utf8_decompose() for src; a true compat selects the
# compatibility tables.
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(src, &slen);
    dst = newSVpvn("", 0);

    /* start with a buffer as large as the source; the helper grows it */
    dlen = slen;
    New(0, d, dlen + 1, U8);
    dend = pv_utf8_decompose(s, slen, &d, dlen, (bool)SvTRUE(compat));

    /* copy the result into the SV, then drop the scratch buffer */
    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);

    RETVAL = dst;
  OUTPUT:
    RETVAL
474
# reorder(src) returns a new UTF-8 flagged string holding the output of
# pv_utf8_reorder() for src.
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(src, &slen);

    /* the helper writes straight into the SV's own buffer */
    dst = newSVpvn("", 0);
    dlen = slen + UTF8_MAXLEN;
    d = (U8*)SvGROW(dst, dlen + 1);
    SvUTF8_on(dst);

    dend = pv_utf8_reorder(s, slen, d, dlen);
    *dend = '\0';
    SvCUR_set(dst, dend - d);

    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 495
# compose(src) returns a new UTF-8 flagged string holding the output of
# pv_utf8_compose() for src.  Under the composeContiguous alias (ix == 1)
# the helper is called with iscontig set.
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(src, &slen);

    /* the helper writes straight into the SV's own buffer */
    dst = newSVpvn("", 0);
    dlen = slen + UTF8_MAXLEN;
    d = (U8*)SvGROW(dst, dlen + 1);
    SvUTF8_on(dst);

    dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix);
    *dend = '\0';
    SvCUR_set(dst, dend - d);

    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 518
# NFD(src) / NFKD(src): decompose into a scratch buffer, then reorder into
# the returned UTF-8 flagged string.  NFKD (ix == 1) selects the
# compatibility tables for the decomposition step.
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *d, *dend;
    STRLEN slen, tlen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(src, &slen);

    /* step 1: decompose into t (grown by the helper as needed) */
    tlen = slen;
    New(0, t, tlen + 1, U8);
    tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix);
    *tend = '\0';
    tlen = tend - t;    /* from here on tlen is the used length,
                           not the allocated size */

    /* step 2: reorder straight into the result SV */
    dst = newSVpvn("", 0);
    dlen = tlen + UTF8_MAXLEN;
    d = (U8*)SvGROW(dst, dlen + 1);
    SvUTF8_on(dst);
    dend = pv_utf8_reorder(t, tlen, d, dlen);
    *dend = '\0';
    SvCUR_set(dst, dend - d);

    Safefree(t);
    RETVAL = dst;
  OUTPUT:
    RETVAL
82e740b6 552
# NFC(src) / NFKC(src) / FCC(src): decompose, reorder, then compose into the
# returned UTF-8 flagged string.  NFKC (ix == 1) decomposes with the
# compatibility tables; FCC (ix == 2) composes with iscontig set.
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *u, *uend, *d, *dend;
    STRLEN slen, tlen, ulen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(src, &slen);

    /* step 1: decompose into t (grown by the helper as needed) */
    tlen = slen;
    New(0, t, tlen + 1, U8);
    tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix == 1));
    *tend = '\0';
    tlen = tend - t;    /* from here on tlen is the used length,
                           not the allocated size */

    /* step 2: reorder t into u */
    ulen = tlen + UTF8_MAXLEN;
    New(0, u, ulen + 1, U8);
    uend = pv_utf8_reorder(t, tlen, u, ulen);
    *uend = '\0';
    ulen = uend - u;

    /* step 3: compose u straight into the result SV */
    dst = newSVpvn("", 0);
    dlen = ulen + UTF8_MAXLEN;
    d = (U8*)SvGROW(dst, dlen + 1);
    SvUTF8_on(dst);
    dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix == 2));
    *dend = '\0';
    SvCUR_set(dst, dend - d);

    Safefree(t);
    Safefree(u);
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 595
# checkNFD(src) / checkNFKD(src): returns true unless src contains a
# character that is a Hangul syllable or has a decomposition mapping
# (compatibility mapping under checkNFKD), or a combining character that
# follows one of higher combining class.
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    s = (U8*)sv_2pvunicode(src, &srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);
        if ((curCC != 0 && preCC > curCC)   /* canonical ordering violated */
            || Hangul_IsS(uv)
            || (ix ? dec_compat(uv) : dec_canonical(uv)) != NULL)
        {
            result = FALSE;
            break;
        }

        preCC = curCC;
        p += retlen;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
8f118dcd 630
631
# checkNFC(src) / checkNFKC(src): quick check returning false (NO), undef
# (MAYBE) or true (YES).  A NO found anywhere in the string wins over a
# MAYBE seen earlier.
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(src, &srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) {  /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* get NFC/NFKC property;
           Hangul syllables are canonical composites: always YES */
        if (!Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }

            if (isComp2nd(uv)) {
                isMAYBE = TRUE;
            }
            else if (ix) {
                /* NFKC_NO when having compatibility mapping. */
                char *canon  = (char *) dec_canonical(uv);
                char *compat = (char *) dec_compat(uv);

                if (compat && !(canon && strEQ(canon, compat))) {
                    result = FALSE;
                    break;
                }
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
        p += retlen;
    }

    if (isMAYBE && result)      /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
8f118dcd 686
687
# checkFCD(src) / checkFCC(src): for every character, compares the combining
# class of the first character of its canonical decomposition with that of
# the last character of the previous one.  Returns false on an ordering
# violation; checkFCC (ix == 1) additionally returns false for exclusions,
# singletons and non-starter decompositions, and undef (MAYBE) when an
# isComp2nd() character was seen and nothing forced a NO.
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(src, &srclen);
    e = s + srclen;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
        U8 *sCan;
        UV uvLead;
        STRLEN canlen = 0;
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        sCan = (U8*) dec_canonical(uv);

        /* leading character: first of the decomposition, else uv itself */
        if (!sCan) {
            uvLead = uv;
        }
        else {
            STRLEN canret;
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) {  /* canonical ordering violated */
            result = FALSE;
            break;
        }

        if (ix && (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))) {
            result = FALSE;
            break;
        }
        if (ix && isComp2nd(uv))
            isMAYBE = TRUE;

        /* trailing character: last of the decomposition, else uv itself */
        if (!sCan) {
            preCC = curCC;
        }
        else {
            STRLEN canret;
            UV uvTrail;
            U8* eCan = sCan + canlen;
            U8* pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            preCC = getCombinClass(uvTrail);
        }
    }

    if (isMAYBE && result)      /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
82e740b6 761
762
# Perl-visible wrapper around the C helper getCombinClass(): returns the
# combining class of the code point uv.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
ac5ea531 767
# Perl-visible wrapper around the C function isExclusion(), which is not
# defined in this file (presumably in one of the generated headers).
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
772
# Perl-visible wrapper around the C function isSingleton(), which is not
# defined in this file (presumably in one of the generated headers).
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
777
# Perl-visible wrapper around the C function isNonStDecomp(), which is not
# defined in this file (presumably in one of the generated headers).
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
782
# Perl-visible wrapper around the C function isComp2nd(), which is not
# defined in this file (presumably in one of the generated headers).
# There is no CODE section, so the alias index is never consulted: the
# two alias names behave exactly like isComp2nd.
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE  = 1
    isNFKC_MAYBE = 2
790
791
792
# isNFD_NO(uv) / isNFKD_NO(uv): true when uv is a Hangul syllable or has an
# entry in the canonical (isNFD_NO) or compatibility (isNFKD_NO) table.
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    bool result = FALSE;
  CODE:
    if (Hangul_IsS(uv))
        result = TRUE;      /* NFD_NO or NFKD_NO */
    else if ((ix ? dec_compat(uv) : dec_canonical(uv)) != NULL)
        result = TRUE;      /* NFD_NO or NFKD_NO */
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
8f118dcd 807
808
# isComp_Ex(uv) / isNFC_NO(uv) / isNFKC_NO(uv): true when uv is an exclusion,
# a singleton or a non-starter decomposition.  isNFKC_NO (ix == 1) is also
# true when uv has a compatibility mapping that differs from its canonical
# mapping (or has no canonical mapping at all).
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO  = 0
    isNFKC_NO = 1
  PREINIT:
    bool result = FALSE;
  CODE:
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
        result = TRUE;      /* NFC_NO or NFKC_NO */
    }
    else if (ix) {
        char *canon  = (char *) dec_canonical(uv);
        char *compat = (char *) dec_compat(uv);

        if (compat && (!canon || strNE(canon, compat)))
            result = TRUE;  /* NFC_NO or NFKC_NO */
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
ac5ea531 831
# getComposite(uv, uv2): returns the code point composed from the pair as an
# unsigned integer, or undef when composite_uv() reports none (0).
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 844
8f118dcd 845
846
# getCanon(uv) / getCompat(uv): returns the decomposition of uv as a UTF-8
# flagged string, or undef when there is none.  Hangul syllables are
# decomposed arithmetically; other code points are looked up in the
# canonical (getCanon) or compatibility (getCompat) table.
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    if (!Hangul_IsS(uv)) {
        U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);

        if (rstr == NULL)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    else {
        /* room for three jamo plus one spare byte */
        U8 tmp[3 * UTF8_MAXLEN + 1];
        U8 *t = tmp;
        U8 *e = pv_cat_decompHangul(t, uv);

        RETVAL = newSVpvn((char *)t, e - t);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
868
82e740b6 869
# splitOnLastStarter(src): returns two UTF-8 flagged strings, the part of
# src before its last character of combining class 0 and the part from
# that character to the end.  When no such character exists the first
# string is empty and the second holds all of src.
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    s = (U8*)sv_2pvunicode(src, &srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    for (p = e; s < p; ) {
        UV uv;

        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
        if (getCombinClass(uv) == 0)    /* Last Starter found */
            break;
    }

    /* everything before the split point */
    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    /* the split point and everything after it */
    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
898