Remove a spurious \n in a perltie example,
[p5sagit/p5-mst-13.2.git] / ext / Unicode / Normalize / Normalize.xs
CommitLineData
ac5ea531 1
2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
5
6/* These 5 files are prepared by mkheader */
7#include "unfcmb.h"
8#include "unfcan.h"
9#include "unfcpt.h"
10#include "unfcmp.h"
11#include "unfexc.h"
12
/*
 * Compatibility shims: when the "uni"-named conversion routines are not
 * available, fall back to the older names (Perl 5.6.1 ?).
 */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8   uv_to_utf8
#endif /* uvuni_to_utf8 */

#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni  utf8_to_uv
#endif /* utf8n_to_uvuni */

/*
 * Flags handed to utf8n_to_uvuni() throughout this file.
 * UTF8_ALLOW_BOM is used before Perl 5.8.0, so include it only if defined.
 */
#ifdef UTF8_ALLOW_BOM
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
#else
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#endif

/* croak message: utf8n_to_uvuni() set retlen to 0 (?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"

/* croak message: utf8_hop() hopped back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX    (0x10ffff)
/* true when uv lies above the last code point covered by the tables */
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
39
/* HANGUL_H */
/* precomposed syllable block */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

/* syllables per leading consonant (VCount * TCount) */
#define Hangul_NCount    588

/* leading consonants (L) */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

/* vowels (V) */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

/* trailing consonants (T); index 0 (Hangul_TBase itself) means "no T" */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* only meaningful together with Hangul_IsS(); see Hangul_IsLV() */
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
/* syllable without a trailing consonant */
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* strict '<': Hangul_TBase itself is excluded */
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL_H */
66
67/* this is used for canonical ordering of combining characters (c.c.). */
68typedef struct {
69 U8 cc; /* combining class */
70 UV uv; /* codepoint */
71 STRLEN pos; /* position */
72} UNF_cc;
73
e524f5b2 74static int compare_cc (const void *a, const void *b)
ac5ea531 75{
76 int ret_cc;
6c941e0c 77 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
8f118dcd 78 if (ret_cc)
79 return ret_cc;
6c941e0c 80
81 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
82 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
ac5ea531 83}
84
e524f5b2 85static U8* dec_canonical (UV uv)
ac5ea531 86{
87 U8 ***plane, **row;
8f118dcd 88 if (OVER_UTF_MAX(uv))
89 return NULL;
ac5ea531 90 plane = (U8***)UNF_canon[uv >> 16];
8f118dcd 91 if (! plane)
92 return NULL;
ac5ea531 93 row = plane[(uv >> 8) & 0xff];
94 return row ? row[uv & 0xff] : NULL;
95}
96
e524f5b2 97static U8* dec_compat (UV uv)
ac5ea531 98{
99 U8 ***plane, **row;
8f118dcd 100 if (OVER_UTF_MAX(uv))
101 return NULL;
ac5ea531 102 plane = (U8***)UNF_compat[uv >> 16];
8f118dcd 103 if (! plane)
104 return NULL;
ac5ea531 105 row = plane[(uv >> 8) & 0xff];
106 return row ? row[uv & 0xff] : NULL;
107}
108
e524f5b2 109static UV composite_uv (UV uv, UV uv2)
ac5ea531 110{
111 UNF_complist ***plane, **row, *cell, *i;
112
8f118dcd 113 if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
114 return 0;
ac5ea531 115
8f118dcd 116 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
ac5ea531 117 uv -= Hangul_LBase; /* lindex */
118 uv2 -= Hangul_VBase; /* vindex */
119 return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
120 }
8f118dcd 121 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
ac5ea531 122 uv2 -= Hangul_TBase; /* tindex */
2a204b45 123 return(uv + uv2);
ac5ea531 124 }
125 plane = UNF_compos[uv >> 16];
8f118dcd 126 if (! plane)
127 return 0;
ac5ea531 128 row = plane[(uv >> 8) & 0xff];
8f118dcd 129 if (! row)
130 return 0;
ac5ea531 131 cell = row[uv & 0xff];
8f118dcd 132 if (! cell)
133 return 0;
134 for (i = cell; i->nextchar; i++) {
135 if (uv2 == i->nextchar)
136 return i->composite;
ac5ea531 137 }
138 return 0;
139}
140
e524f5b2 141static U8 getCombinClass (UV uv)
ac5ea531 142{
143 U8 **plane, *row;
8f118dcd 144 if (OVER_UTF_MAX(uv))
145 return 0;
ac5ea531 146 plane = (U8**)UNF_combin[uv >> 16];
8f118dcd 147 if (! plane)
148 return 0;
ac5ea531 149 row = plane[(uv >> 8) & 0xff];
150 return row ? row[uv & 0xff] : 0;
151}
152
e524f5b2 153static void sv_cat_decompHangul (SV* sv, UV uv)
ac5ea531 154{
155 UV sindex, lindex, vindex, tindex;
2a204b45 156 U8 *t, tmp[3 * UTF8_MAXLEN + 1];
ac5ea531 157
8f118dcd 158 if (! Hangul_IsS(uv))
159 return;
ac5ea531 160
161 sindex = uv - Hangul_SBase;
162 lindex = sindex / Hangul_NCount;
163 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
164 tindex = sindex % Hangul_TCount;
165
2a204b45 166 t = tmp;
ac5ea531 167 t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
168 t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
8f118dcd 169 if (tindex)
170 t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
ac5ea531 171 *t = '\0';
e524f5b2 172 sv_catpvn(sv, (char *)tmp, t - tmp);
173}
174
175static void sv_cat_uvuni (SV* sv, UV uv)
176{
177 U8 *t, tmp[UTF8_MAXLEN + 1];
178
179 t = tmp;
180 t = uvuni_to_utf8(t, uv);
181 *t = '\0';
182 sv_catpvn(sv, (char *)tmp, t - tmp);
ac5ea531 183}
184
MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize

SV*
decompose(arg, compat = &PL_sv_no)
    SV * arg
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    UV uv;
    SV *src, *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    bool iscompat;
  CODE:
    /*
     * Return a new UTF-8 string in which every character of arg is
     * replaced by its decomposition: Hangul syllables are decomposed
     * arithmetically, other characters via dec_compat() when compat is
     * true and dec_canonical() otherwise.  Characters without a
     * decomposition are copied through.  No reordering is done here.
     * Croaks with ErrRetlenIsZero if a character decodes to zero length.
     */
    if (SvUTF8(arg)) {
	src = arg;
    } else {
	/* upgrade a mortal copy so the caller's scalar is left untouched */
	src = sv_mortalcopy(arg);
	sv_utf8_upgrade(src);
    }
    iscompat = SvTRUE(compat);

    dst = newSV(1);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	if (Hangul_IsS(uv))
	    sv_cat_decompHangul(dst, uv);
	else {
	    r = iscompat ? dec_compat(uv) : dec_canonical(uv);
	    if (r)
		sv_catpv(dst, (char *)r);
	    else
		sv_cat_uvuni(dst, uv);
	}
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
231
232
233
SV*
reorder(arg)
    SV * arg
  PROTOTYPE: $
  PREINIT:
    SV *src, *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv, uvlast;
    UNF_cc * stk_cc;
    STRLEN i, cc_pos;
    bool valid_uvlast;
  CODE:
    /*
     * Return a new UTF-8 string in which each run of consecutive
     * characters with non-zero combining class is sorted by combining
     * class (ties keep their original order, see compare_cc()).
     * Characters with combining class 0 are copied through unchanged.
     * Croaks with ErrRetlenIsZero if a character decodes to zero length.
     */
    if (SvUTF8(arg)) {
	src = arg;
    } else {
	/* upgrade a mortal copy so the caller's scalar is left untouched */
	src = sv_mortalcopy(arg);
	sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    /* NOTE(review): output is written without bounds checks; this assumes
       re-encoding never yields more bytes than the source -- confirm */
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    stk_cc_max = 10; /* enough as an initial value? */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    for (p = s; p < e;) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);
	p += retlen;

	curCC = getCombinClass(uv);
	if (curCC == 0) {
	    d = uvuni_to_utf8(d, uv);
	    continue;
	}

	/* first combining character of a run */
	cc_pos = 0;
	stk_cc[cc_pos].cc  = curCC;
	stk_cc[cc_pos].uv  = uv;
	stk_cc[cc_pos].pos = cc_pos;

	/* collect the rest of the run, up to the next cc == 0 character */
	valid_uvlast = FALSE;
	while (p < e) {
	    uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen)
		croak(ErrRetlenIsZero);
	    p += retlen;

	    curCC = getCombinClass(uv);
	    if (curCC == 0) {
		/* remember the terminating character; emitted after the run */
		uvlast = uv;
		valid_uvlast = TRUE;
		break;
	    }

	    cc_pos++;
	    if (stk_cc_max <= cc_pos) { /* extend if need */
		/* grow geometrically rather than by one element, so a
		   long run does not trigger a Renew per character */
		stk_cc_max = 2 * (cc_pos + 1);
		Renew(stk_cc, stk_cc_max, UNF_cc);
	    }
	    stk_cc[cc_pos].cc  = curCC;
	    stk_cc[cc_pos].uv  = uv;
	    stk_cc[cc_pos].pos = cc_pos;
	}

	/* reordered if there are two c.c.'s */
	if (cc_pos) {
	    qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
	}

	for (i = 0; i <= cc_pos; i++) {
	    d = uvuni_to_utf8(d, stk_cc[i].uv);
	}
	if (valid_uvlast)
	{
	    d = uvuni_to_utf8(d, uvlast);
	}
    }
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL
325
326
327
SV*
compose(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV  *src, *dst, *tmp;
    U8  *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    /* uv starts at 0 so that "uvS = uv" at the bottom of the outer loop
       never reads an indeterminate value when the inner loop did not run */
    UV uv = 0, uvS, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    /*
     * Return a new UTF-8 string in which each starter is combined, via
     * composite_uv(), with the following characters it composes with
     * (composition exclusions are skipped).  Characters that do not
     * compose are kept, after the starter, in their original order.
     *
     * ix == 0 (compose): a character may compose unless the previous
     * uncomposed character has a higher combining class.
     * ix == 1 (composeContiguous): a character may compose only while no
     * uncomposed character has been kept since the starter.
     *
     * Croaks with ErrRetlenIsZero if a character decodes to zero length.
     */
    if (SvUTF8(arg)) {
	src = arg;
    } else {
	/* upgrade a mortal copy so the caller's scalar is left untouched */
	src = sv_mortalcopy(arg);
	sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src, srclen);
    e = s + srclen;
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

  /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
	if (beginning) {
	    uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen)
		croak(ErrRetlenIsZero);
	    p += retlen;

	    if (getCombinClass(uvS)) { /* no Starter found yet */
		d = uvuni_to_utf8(d, uvS);
		continue;
	    }
	    beginning = FALSE;
	}

    /* Starter */
	t = tmp_start = (U8*)SvPVX(tmp);
	preCC = 0;

    /* to the next Starter */
	while (p < e) {
	    uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen)
		croak(ErrRetlenIsZero);
	    p += retlen;

	    curCC = getCombinClass(uv);

	    if (preCC && preCC == curCC) {
		/* same class as the previous kept character: keep as is */
		preCC = curCC;
		t = uvuni_to_utf8(t, uv);
	    } else {
		uvComp = composite_uv(uvS, uv);

		if (uvComp && ! isExclusion(uvComp) &&
			(ix ? (t == tmp_start) : (preCC <= curCC))) {
		    STRLEN leftcur, rightcur, dstcur;
		    leftcur  = UNISKIP(uvComp);
		    rightcur = UNISKIP(uvS) + UNISKIP(uv);

		    /* the composite needs more bytes than its parts:
		       enlarge dst and re-derive d from the new buffer */
		    if (leftcur > rightcur) {
			dstcur = d - (U8*)SvPVX(dst);
			dstlen += leftcur - rightcur;
			d = (U8*)SvGROW(dst,dstlen) + dstcur;
		    }
		    /* preCC not changed to curCC */
		    uvS = uvComp;
		} else if (! curCC && p < e) { /* blocked */
		    break;
		} else {
		    preCC = curCC;
		    t = uvuni_to_utf8(t, uv);
		}
	    }
	}
	d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
	tmplen = t - tmp_start;
	if (tmplen) { /* uncomposed combining char */
	    t = (U8*)SvPVX(tmp);
	    while (tmplen--)
		*d++ = *t++;
	}
	/* after a "blocked" break, uv is the next starter */
	uvS = uv;
    } /* for */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
ac5ea531 429
430
void
checkNFD(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
  CODE:
    /*
     * Quick check for NFD (ix == 0) or NFKD (ix == 1).
     * Returns NO as soon as a character is out of canonical order, is a
     * Hangul syllable, or has a decomposition (dec_compat() for NFKD,
     * dec_canonical() for NFD); returns YES otherwise.
     * Croaks with ErrRetlenIsZero if a character decodes to zero length.
     */
    if (SvUTF8(arg)) {
	src = arg;
    } else {
	/* upgrade a mortal copy so the caller's scalar is left untouched */
	src = sv_mortalcopy(arg);
	sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	curCC = getCombinClass(uv);
	if (preCC > curCC && curCC != 0) /* canonical ordering violated */
	    XSRETURN_NO;
	if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
	    XSRETURN_NO;
	preCC = curCC;
    }
    XSRETURN_YES;
467
468
469
void
checkNFC(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    UV uv;
    SV *src;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool isMAYBE;
  CODE:
    /*
     * Quick check for NFC (ix == 0) or NFKC (ix == 1).
     * Returns NO on the first character that is out of canonical order
     * or has the NO property; otherwise undef (MAYBE) if any character
     * satisfied isComp2nd(), and YES if none did.
     * Croaks with ErrRetlenIsZero if a character decodes to zero length.
     */
    if (SvUTF8(arg)) {
	src = arg;
    } else {
	/* upgrade a mortal copy so the caller's scalar is left untouched */
	src = sv_mortalcopy(arg);
	sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	curCC = getCombinClass(uv);

	if (preCC > curCC && curCC != 0) /* canonical ordering violated */
	    XSRETURN_NO;

	/* get NFC/NFKC property */
	if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
	    ; /* YES */
	else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
	    XSRETURN_NO;
	else if (isComp2nd(uv))
	    isMAYBE = TRUE;
	else if (ix) {
	    char *canon, *compat;
	  /* NFKC_NO when having compatibility mapping. */
	    canon  = (char *) dec_canonical(uv);
	    compat = (char *) dec_compat(uv);
	    /* a compat mapping identical to the canonical one does not count */
	    if (compat && !(canon && strEQ(canon, compat)))
		XSRETURN_NO;
	} /* end of get NFC/NFKC property */

	preCC = curCC;
    }
    if (isMAYBE)
	XSRETURN_UNDEF;
    else
	XSRETURN_YES;
527
528
529
void
checkFCD(arg)
    SV * arg
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    UV uv, uvLead, uvTrail;
    SV *src;
    STRLEN srclen, retlen, canlen, canret;
    U8 *s, *e, *p, curCC, preCC;
    U8 *sCan, *pCan, *eCan;
    bool isMAYBE;
  CODE:
    /*
     * Check for FCD (ix == 0) or FCC (ix == 1).
     * For each character, the combining class of the first character of
     * its canonical decomposition (or of the character itself if it has
     * none) is compared with the combining class of the last character
     * of the previous character's decomposition; a non-zero class lower
     * than its predecessor returns NO.
     * For FCC, characters with the NFC_NO property also return NO, and
     * any character satisfying isComp2nd() turns the result into undef
     * (MAYBE).  Otherwise returns YES.
     * Croaks with ErrRetlenIsZero or ErrHopBeforeStart on decoding trouble.
     */
    if (SvUTF8(arg)) {
	src = arg;
    } else {
	/* upgrade a mortal copy so the caller's scalar is left untouched */
	src = sv_mortalcopy(arg);
	sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	sCan = (U8*) dec_canonical(uv);

	if (sCan) {
	    /* leading character of the canonical decomposition */
	    canlen = (STRLEN)strlen((char *) sCan);
	    uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
	}
	else {
	    uvLead = uv;
	}

	curCC = getCombinClass(uvLead);

	if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
	    XSRETURN_NO;

	if (ix) {
	    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
		XSRETURN_NO;
	    else if (isComp2nd(uv))
		isMAYBE = TRUE;
	}

	if (sCan) {
	    /* trailing character of the canonical decomposition */
	    eCan = sCan + canlen;
	    pCan = utf8_hop(eCan, -1);
	    if (pCan < sCan)
		croak(ErrHopBeforeStart);
	    uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
	    preCC = getCombinClass(uvTrail);
	}
	else {
	    preCC = curCC;
	}
    }
    if (isMAYBE)
	XSRETURN_UNDEF;
    else
	XSRETURN_YES;
599
600
601
# Declaration-only XSUBs: each one passes uv straight to the C routine
# of the same name and returns its result.  getCombinClass() is the
# static function defined earlier in this file; the other routines are
# not defined in this file (presumably they come from the headers
# prepared by mkheader -- confirm).
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $

bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
629
630
631
void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /*
     * YES when uv is a Hangul syllable or has a decomposition
     * (dec_compat() for isNFKD_NO, dec_canonical() for isNFD_NO);
     * NO otherwise.
     */
    if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
	XSRETURN_YES; /* NFD_NO or NFKD_NO */
    else
	XSRETURN_NO;
643
644
645
void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  CODE:
    /*
     * YES when uv is a composition exclusion, a singleton or a
     * non-starter decomposition.  For isNFKC_NO (ix == 1) it is also YES
     * when uv has a compatibility mapping that differs from its
     * canonical mapping (or has no canonical mapping at all).
     * NO otherwise.
     */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
	XSRETURN_YES; /* NFC_NO or NFKC_NO */
    else if (ix) {
	char *canon, *compat;
	canon  = (char *) dec_canonical(uv);
	compat = (char *) dec_compat(uv);
	if (compat && (!canon || strNE(canon, compat)))
	    XSRETURN_YES; /* NFC_NO or NFKC_NO */
	else
	    XSRETURN_NO;
    }
    else
	XSRETURN_NO;
667
668
ac5ea531 669
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /*
     * Return the code point that uv followed by uv2 composes to
     * (see composite_uv()), or undef when the pair does not compose.
     */
    composite = composite_uv(uv, uv2);
    RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
  OUTPUT:
    RETVAL
ac5ea531 682
8f118dcd 683
684
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /*
     * Return the decomposition of uv as a UTF-8 flagged string:
     * the jamo sequence for a Hangul syllable, otherwise the table entry
     * from dec_compat() (getCompat) or dec_canonical() (getCanon).
     * Returns undef when uv has no decomposition.
     */
    if (Hangul_IsS(uv)) {
	SV * dst;
	dst = newSV(1);
	(void)SvPOK_only(dst);
	sv_cat_decompHangul(dst, uv);
	RETVAL = dst;
    } else {
	rstr = ix ? dec_compat(uv) : dec_canonical(uv);
	if (!rstr)
	    XSRETURN_UNDEF;
	RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
709
82e740b6 710
void
splitOnLastStarter(arg)
    SV * arg
  PREINIT:
    UV uv;
    SV *src, *svp;
    STRLEN srclen, retlen;
    U8 *s, *e, *p;
  PPCODE:
    /*
     * Split arg in front of its last starter (the last character whose
     * combining class is 0) and push two UTF-8 flagged mortal strings:
     * everything before that starter, and the starter with whatever
     * follows it.  If no starter is found the first string is empty and
     * the second holds the whole input.
     * Croaks with ErrHopBeforeStart if utf8_hop() steps before the start.
     */
    if (SvUTF8(arg)) {
	src = arg;
    } else {
	/* upgrade a mortal copy so the caller's scalar is left untouched */
	src = sv_mortalcopy(arg);
	sv_utf8_upgrade(src);
    }

    s = (U8*)SvPV(src,srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    for (p = e; s < p; ) {
	p = utf8_hop(p, -1);
	if (p < s)
	    croak(ErrHopBeforeStart);
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (getCombinClass(uv) == 0) /* Last Starter found */
	    break;
    }

    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
746