6 /* These 5 files are prepared by mkheader */
15 #define uvuni_to_utf8 uv_to_utf8
16 #endif /* uvuni_to_utf8 */
19 #ifndef utf8n_to_uvuni /* compatibility alias, only supplied when the name is not already defined */
20 #define utf8n_to_uvuni utf8_to_uv /* calls written as utf8n_to_uvuni() then expand to utf8_to_uv() */
21 #endif /* utf8n_to_uvuni */
23 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
25 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
27 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
30 /* croaked when utf8n_to_uvuni() sets retlen to 0 (?); %s receives the name of the calling operation */
31 #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
33 /* utf8_hop() hopped back to before the start of the string; the UTF-8 is probably malformed */
34 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
36 /* This should never happen, as there is no such instance in UTF-8 or UTF-EBCDIC;
37 according to "Versioning and Stability" in UAX #15, no new composition
38 should be added in the future. */
39 #define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source"
41 /* the target buffer is too small: uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */
42 #define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough"
44 /* At present, code points above 0x10ffff are left unaffected, without complaint, right? */
45 #define VALID_UTF_MAX (0x10ffff) /* upper limit tested by OVER_UTF_MAX() */
46 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) /* true when uv is greater than VALID_UTF_MAX */
48 /* number of elements in the local array that holds a run of combining characters */
49 /* is this enough as an initial value? */
50 #define CC_SEQ_SIZE (10) /* also the initial value of seq_max in reorder and compose */
51 #define CC_SEQ_STEP (5) /* when the sequence buffer is full, the new size is cc_pos + CC_SEQ_STEP */
54 #define Hangul_SBase 0xAC00 /* first code point of the S (syllable) range */
55 #define Hangul_SFinal 0xD7A3 /* last code point of the S range */
56 #define Hangul_SCount 11172 /* SFinal - SBase + 1 */
58 #define Hangul_NCount 588 /* VCount * TCount */
60 #define Hangul_LBase 0x1100 /* first code point of the L range */
61 #define Hangul_LFinal 0x1112 /* last code point of the L range */
62 #define Hangul_LCount 19 /* LFinal - LBase + 1 */
64 #define Hangul_VBase 0x1161 /* first code point of the V range */
65 #define Hangul_VFinal 0x1175 /* last code point of the V range */
66 #define Hangul_VCount 21 /* VFinal - VBase + 1 */
68 #define Hangul_TBase 0x11A7 /* one below the first T; Hangul_IsT() excludes TBase itself */
69 #define Hangul_TFinal 0x11C2 /* last code point of the T range */
70 #define Hangul_TCount 28 /* TFinal - TBase + 1, i.e. TBase itself is counted */
72 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) /* u lies in [SBase, SFinal] */
73 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) /* offset from SBase is a multiple of TCount; does not range-check u itself */
74 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) /* in the S range and the T index (offset % TCount) is 0 */
75 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) /* u lies in [LBase, LFinal] */
76 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) /* u lies in [VBase, VFinal] */
77 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) /* strict '<': u lies in (TBase, TFinal] */
80 /* this is used for canonical ordering of combining characters (c.c.). */
82 U8 cc; /* combining class */
83 UV uv; /* codepoint */
84 STRLEN pos; /* position */
87 static int compare_cc(const void *a, const void *b)
90 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
94 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
95 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
98 static U8* dec_canonical(UV uv)
101 if (OVER_UTF_MAX(uv))
103 plane = (U8***)UNF_canon[uv >> 16];
106 row = plane[(uv >> 8) & 0xff];
107 return row ? row[uv & 0xff] : NULL;
110 static U8* dec_compat(UV uv)
113 if (OVER_UTF_MAX(uv))
115 plane = (U8***)UNF_compat[uv >> 16];
118 row = plane[(uv >> 8) & 0xff];
119 return row ? row[uv & 0xff] : NULL;
122 static UV composite_uv(UV uv, UV uv2)
124 UNF_complist ***plane, **row, *cell, *i;
126 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
129 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
130 UV lindex = uv - Hangul_LBase;
131 UV vindex = uv2 - Hangul_VBase;
132 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
135 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
136 UV tindex = uv2 - Hangul_TBase;
139 plane = UNF_compos[uv >> 16];
142 row = plane[(uv >> 8) & 0xff];
145 cell = row[uv & 0xff];
148 for (i = cell; i->nextchar; i++) {
149 if (uv2 == i->nextchar)
155 static U8 getCombinClass(UV uv)
158 if (OVER_UTF_MAX(uv))
160 plane = (U8**)UNF_combin[uv >> 16];
163 row = plane[(uv >> 8) & 0xff];
164 return row ? row[uv & 0xff] : 0;
167 static U8* pv_cat_decompHangul(U8* d, UV uv)
169 UV sindex = uv - Hangul_SBase;
170 UV lindex = sindex / Hangul_NCount;
171 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
172 UV tindex = sindex % Hangul_TCount;
174 if (! Hangul_IsS(uv))
177 d = uvuni_to_utf8(d, (lindex + Hangul_LBase));
178 d = uvuni_to_utf8(d, (vindex + Hangul_VBase));
180 d = uvuni_to_utf8(d, (tindex + Hangul_TBase));
184 static char * sv_2pvunicode(SV *sv, STRLEN *lp)
188 s = (char*)SvPV(sv,len);
190 SV* tmpsv = sv_mortalcopy(sv);
192 (void)sv_pvn_force(tmpsv,&len);
193 sv_utf8_upgrade(tmpsv);
194 s = (char*)SvPV(tmpsv,len);
202 U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
211 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
213 croak(ErrRetlenIsZero, "decompose");
216 if (Hangul_IsS(uv)) {
217 STRLEN cur = d - dstart;
219 if (dlen < cur + UTF8_MAXLEN * 3) {
220 dlen += UTF8_MAXLEN * 3;
221 Renew(dstart, dlen+1, U8);
224 d = pv_cat_decompHangul(d, uv);
227 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
230 STRLEN len = (STRLEN)strlen((char *)r);
231 STRLEN cur = d - dstart;
232 if (dlen < cur + len) {
234 Renew(dstart, dlen+1, U8);
241 STRLEN cur = d - dstart;
243 if (dlen < cur + UTF8_MAXLEN) {
245 Renew(dstart, dlen+1, U8);
248 d = uvuni_to_utf8(d, uv);
257 U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
263 UNF_cc seq_ary[CC_SEQ_SIZE];
264 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
265 UNF_cc* seq_ext = NULL; /* extend if need */
266 STRLEN seq_max = CC_SEQ_SIZE;
269 if (dlen < slen || dlen < slen + UTF8_MAXLEN)
270 croak(ErrTargetNotEnough, "reorder");
271 dend -= UTF8_MAXLEN; /* safety */
276 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
278 croak(ErrRetlenIsZero, "reorder");
281 curCC = getCombinClass(uv);
284 if (seq_max < cc_pos + 1) { /* extend if need */
285 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
286 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
288 New(0, seq_ext, seq_max, UNF_cc);
289 for (i = 0; i < cc_pos; i++)
290 seq_ext[i] = seq_ary[i];
293 Renew(seq_ext, seq_max, UNF_cc);
295 seq_ptr = seq_ext; /* till now use seq_ext */
298 seq_ptr[cc_pos].cc = curCC;
299 seq_ptr[cc_pos].uv = uv;
300 seq_ptr[cc_pos].pos = cc_pos;
310 if (cc_pos > 1) /* reordered if there are two c.c.'s */
311 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
313 for (i = 0; i < cc_pos; i++) {
314 d = uvuni_to_utf8(d, seq_ptr[i].uv);
315 if (dend < d) /* real end is dend + UTF8_MAXLEN */
316 croak(ErrLongerThanSrc, "reorder");
322 d = uvuni_to_utf8(d, uv);
323 if (dend < d) /* real end is dend + UTF8_MAXLEN */
324 croak(ErrLongerThanSrc, "reorder");
333 U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
339 UV uvS; /* code point of the starter */
340 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
343 UV seq_ary[CC_SEQ_SIZE];
344 UV* seq_ptr = seq_ary; /* use array at the beginning */
345 UV* seq_ext = NULL; /* extend if need */
346 STRLEN seq_max = CC_SEQ_SIZE;
349 if (dlen < slen || dlen < slen + UTF8_MAXLEN)
350 croak(ErrTargetNotEnough, "compose");
351 dend -= UTF8_MAXLEN; /* safety */
356 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
358 croak(ErrRetlenIsZero, "compose");
361 curCC = getCombinClass(uv);
365 uvS = uv; /* the first Starter is found */
371 d = uvuni_to_utf8(d, uv);
372 if (dend < d) /* real end is dend + UTF8_MAXLEN */
373 croak(ErrLongerThanSrc, "compose");
381 if (iscontig && cc_pos || /* discontiguous combination */
382 curCC != 0 && preCC == curCC || /* blocked by same CC */
383 preCC > curCC) /* blocked by higher CC: revised D2 */
387 iscontig && cc_pos == 0 -- contiguous combination
388 curCC == 0 && preCC == 0 -- starter + starter
389 curCC != 0 && preCC < curCC -- lower CC */
391 /* try composition */
392 UV uvComp = composite_uv(uvS, uv);
394 if (uvComp && !isExclusion(uvComp)) {
398 /* preCC should not be changed to curCC */
399 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
409 if (curCC != 0 || !(p < e)) {
410 if (seq_max < cc_pos + 1) { /* extend if need */
411 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
412 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
413 New(0, seq_ext, seq_max, UV);
414 Copy(seq_ary, seq_ext, cc_pos, UV);
417 Renew(seq_ext, seq_max, UV);
419 seq_ptr = seq_ext; /* till now use seq_ext */
421 seq_ptr[cc_pos] = uv;
424 if (curCC != 0 && p < e)
429 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
430 if (dend < d) /* real end is dend + UTF8_MAXLEN */
431 croak(ErrLongerThanSrc, "compose");
436 for (i = 0; i < cc_pos; i++) {
437 d = uvuni_to_utf8(d, seq_ptr[i]);
438 if (dend < d) /* real end is dend + UTF8_MAXLEN */
439 croak(ErrLongerThanSrc, "compose");
451 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
454 decompose(src, compat = &PL_sv_no)
463 s = (U8*)sv_2pvunicode(src,&slen);
464 dst = newSVpvn("", 0);
466 New(0, d, dlen+1, U8);
467 dend = pv_utf8_decompose(s, slen, &d, dlen, SvTRUE(compat));
468 sv_setpvn(dst, d, dend - d);
484 s = (U8*)sv_2pvunicode(src,&slen);
485 dst = newSVpvn("", 0);
486 dlen = slen + UTF8_MAXLEN;
487 d = (U8*)SvGROW(dst,dlen+1);
489 dend = pv_utf8_reorder(s, slen, d, dlen);
491 SvCUR_set(dst, dend - d);
501 composeContiguous = 1
507 s = (U8*)sv_2pvunicode(src,&slen);
508 dst = newSVpvn("", 0);
509 dlen = slen + UTF8_MAXLEN;
510 d = (U8*)SvGROW(dst,dlen+1);
512 dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix);
514 SvCUR_set(dst, dend - d);
527 U8 *s, *t, *tend, *d, *dend;
528 STRLEN slen, tlen, dlen;
531 s = (U8*)sv_2pvunicode(src,&slen);
533 New(0, t, tlen+1, U8);
534 tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix);
536 tlen = tend - t; /* no longer know real tlen */
539 dst = newSVpvn("", 0);
540 dlen = tlen + UTF8_MAXLEN;
541 d = (U8*)SvGROW(dst,dlen+1);
543 dend = pv_utf8_reorder(t, tlen, d, dlen);
545 SvCUR_set(dst, dend - d);
562 U8 *s, *t, *tend, *u, *uend, *d, *dend;
563 STRLEN slen, tlen, ulen, dlen;
566 s = (U8*)sv_2pvunicode(src,&slen);
568 New(0, t, tlen+1, U8);
569 tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1));
571 tlen = tend - t; /* no longer know real tlen */
574 ulen = tlen + UTF8_MAXLEN;
575 New(0, u, ulen+1, U8);
576 uend = pv_utf8_reorder(t, tlen, u, ulen);
581 dst = newSVpvn("", 0);
582 dlen = ulen + UTF8_MAXLEN;
583 d = (U8*)SvGROW(dst,dlen+1);
585 dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2));
587 SvCUR_set(dst, dend - d);
603 STRLEN srclen, retlen;
604 U8 *s, *e, *p, curCC, preCC;
606 s = (U8*)sv_2pvunicode(src,&srclen);
610 for (p = s; p < e; p += retlen) {
611 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
613 croak(ErrRetlenIsZero, "checkNFD or -NFKD");
615 curCC = getCombinClass(uv);
616 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
618 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
633 STRLEN srclen, retlen;
634 U8 *s, *e, *p, curCC, preCC;
637 s = (U8*)sv_2pvunicode(src,&srclen);
642 for (p = s; p < e; p += retlen) {
643 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
645 croak(ErrRetlenIsZero, "checkNFC or -NFKC");
647 curCC = getCombinClass(uv);
648 if (preCC > curCC && curCC != 0) /* canonical ordering violated */
651 /* get NFC/NFKC property */
652 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
654 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
656 else if (isComp2nd(uv))
659 char *canon, *compat;
660 /* NFKC_NO when having compatibility mapping. */
661 canon = (char *) dec_canonical(uv);
662 compat = (char *) dec_compat(uv);
663 if (compat && !(canon && strEQ(canon, compat)))
665 } /* end of get NFC/NFKC property */
683 STRLEN srclen, retlen;
684 U8 *s, *e, *p, curCC, preCC;
687 s = (U8*)sv_2pvunicode(src,&srclen);
691 for (p = s; p < e; p += retlen) {
694 STRLEN canlen, canret;
695 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
697 croak(ErrRetlenIsZero, "checkFCD or -FCC");
699 sCan = (U8*) dec_canonical(uv);
702 canlen = (STRLEN)strlen((char *) sCan);
703 uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
705 croak(ErrRetlenIsZero, "checkFCD or -FCC");
711 curCC = getCombinClass(uvLead);
713 if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
717 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
719 else if (isComp2nd(uv))
725 U8* eCan = sCan + canlen;
726 U8* pCan = utf8_hop(eCan, -1);
728 croak(ErrHopBeforeStart);
729 uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
731 croak(ErrRetlenIsZero, "checkFCD or -FCC");
732 preCC = getCombinClass(uvTrail);
782 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
783 XSRETURN_YES; /* NFD_NO or NFKD_NO */
797 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
798 XSRETURN_YES; /* NFC_NO or NFKC_NO */
800 char *canon, *compat;
801 canon = (char *) dec_canonical(uv);
802 compat = (char *) dec_compat(uv);
803 if (compat && (!canon || strNE(canon, compat)))
804 XSRETURN_YES; /* NFC_NO or NFKC_NO */
814 getComposite(uv, uv2)
821 composite = composite_uv(uv, uv2);
822 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
835 if (Hangul_IsS(uv)) {
836 U8 tmp[3 * UTF8_MAXLEN + 1];
838 U8 *e = pv_cat_decompHangul(t, uv);
839 RETVAL = newSVpvn((char *)t, e - t);
841 U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
844 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
852 splitOnLastStarter(src)
859 s = (U8*)sv_2pvunicode(src,&srclen);
866 croak(ErrHopBeforeStart);
867 uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
868 if (getCombinClass(uv) == 0) /* Last Starter found */
872 svp = sv_2mortal(newSVpvn((char*)s, p - s));
876 svp = sv_2mortal(newSVpvn((char*)p, e - p));