6 /* These 5 files are prepared by mkheader */
15 #define uvuni_to_utf8 uv_to_utf8
16 #endif /* uvuni_to_utf8 */
19 #ifndef utf8n_to_uvuni
20 #define utf8n_to_uvuni utf8_to_uv
21 #endif /* utf8n_to_uvuni */
23 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
25 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
27 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
30 /* if utf8n_to_uvuni() sets retlen to 0 (?) */
31 #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
33 /* utf8_hop() hops back before start. Maybe broken UTF-8 */
34 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
36 /* This should never happen, as there is no such case in UTF-8 or UTF-EBCDIC.
37 If Unicode were to add a new composition of A + B to C
38 where bytes::length(A) + bytes::length(B) < bytes::length(C),
39 this code would have to be fixed.
40 In that case, mkheader will prevent Unicode::Normalize from building. */
41 #define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source"
43 /* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */
44 #define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough"
46 /* At present, characters above 0x10ffff are left unaffected without complaint, right? */
/* VALID_UTF_MAX is the upper bound of the Unicode code space; OVER_UTF_MAX(uv)
   is tested before each table lookup in this file. */
47 #define VALID_UTF_MAX (0x10ffff)
48 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
50 /* Size of the fixed array (seq_ary) used to collect combining characters. */
51 /* Is this enough as an initial value? */
/* When more entries are needed than the current capacity, pv_utf8_reorder()
   and pv_utf8_compose() switch to a heap array whose new capacity is
   cc_pos + CC_SEQ_STEP. */
52 #define CC_SEQ_SIZE (10)
53 #define CC_SEQ_STEP (5)
/* Constants and predicates for arithmetic Hangul syllable composition and
   decomposition (conjoining jamo: L = leading consonant, V = vowel,
   T = trailing consonant, S = precomposed syllable). */
56 #define Hangul_SBase 0xAC00 /* first precomposed syllable */
57 #define Hangul_SFinal 0xD7A3 /* last precomposed syllable: SBase + SCount - 1 */
58 #define Hangul_SCount 11172 /* LCount * VCount * TCount */
60 #define Hangul_NCount 588 /* VCount * TCount: syllables per leading consonant */
62 #define Hangul_LBase 0x1100 /* first leading consonant */
63 #define Hangul_LFinal 0x1112 /* LBase + LCount - 1 */
64 #define Hangul_LCount 19
66 #define Hangul_VBase 0x1161 /* first vowel */
67 #define Hangul_VFinal 0x1175 /* VBase + VCount - 1 */
68 #define Hangul_VCount 21
70 #define Hangul_TBase 0x11A7 /* one below the first trailing consonant; */
71 #define Hangul_TFinal 0x11C2 /* TBase + TCount - 1 */
72 #define Hangul_TCount 28 /* includes index 0, i.e. "no trailing consonant" */
/* Note: each macro below evaluates its argument more than once. */
74 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* True when the offset from SBase is a multiple of TCount (trailing index 0).
   Only meaningful together with Hangul_IsS(), as in Hangul_IsLV(). */
75 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
76 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
77 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
78 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* The lower bound is strict on purpose: TBase itself is not a trailing consonant. */
79 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
82 /* this is used for canonical ordering of combining characters (c.c.). */
84 U8 cc; /* combining class */
85 UV uv; /* codepoint */
86 STRLEN pos; /* position */
/* qsort() comparator for UNF_cc entries; it is the comparator passed to
   qsort() in pv_utf8_reorder(). It computes the difference of the two
   combining classes and also compares the recorded positions.
   NOTE(review): the lines between these statements are not visible here;
   presumably a nonzero class difference is returned first and the position
   only breaks ties, which would keep equal-class characters in their
   original order -- confirm. */
89 static int compare_cc(const void *a, const void *b)
92 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
/* yields 1, 0 or -1 according to the order of the recorded positions */
96 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
97 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
/* Looks up the canonical decomposition of uv in the three-level table
   UNF_canon, indexed by plane (uv >> 16), row ((uv >> 8) & 0xff) and cell
   (uv & 0xff). Returns NULL when the row is absent. Callers treat a non-NULL
   result as a NUL-terminated string (they apply strlen() to it).
   NOTE(review): the handling of out-of-range code points and of a missing
   plane is on lines not visible here. */
100 static U8* dec_canonical(UV uv)
103 if (OVER_UTF_MAX(uv))
105 plane = (U8***)UNF_canon[uv >> 16];
108 row = plane[(uv >> 8) & 0xff];
109 return row ? row[uv & 0xff] : NULL;
/* Looks up the compatibility decomposition of uv in the three-level table
   UNF_compat, indexed by plane (uv >> 16), row ((uv >> 8) & 0xff) and cell
   (uv & 0xff). Returns NULL when the row is absent. Callers treat a non-NULL
   result as a NUL-terminated string (they apply strlen() to it).
   NOTE(review): the handling of out-of-range code points and of a missing
   plane is on lines not visible here. */
112 static U8* dec_compat(UV uv)
115 if (OVER_UTF_MAX(uv))
117 plane = (U8***)UNF_compat[uv >> 16];
120 row = plane[(uv >> 8) & 0xff];
121 return row ? row[uv & 0xff] : NULL;
/* Tries to compose the pair (uv, uv2) into a single code point.
   Hangul pairs are handled arithmetically (L + V, then LV + T); any other
   pair is looked up in the three-level table UNF_compos, whose cells are
   lists of UNF_complist entries ending at an entry with nextchar == 0.
   Callers treat a result of 0 as "no composite".
   NOTE(review): the return statements for failed checks and lookups are on
   lines not visible here. */
124 static UV composite_uv(UV uv, UV uv2)
126 UNF_complist ***plane, **row, *cell, *i;
/* a zero second character or an out-of-range code point is checked first */
128 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
/* Hangul: leading consonant + vowel */
131 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
132 UV lindex = uv - Hangul_LBase;
133 UV vindex = uv2 - Hangul_VBase;
134 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
/* Hangul: LV syllable + trailing consonant */
137 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
138 UV tindex = uv2 - Hangul_TBase;
/* table lookup keyed by the first character */
141 plane = UNF_compos[uv >> 16];
144 row = plane[(uv >> 8) & 0xff];
147 cell = row[uv & 0xff];
/* scan the list for an entry whose second character is uv2 */
150 for (i = cell; i->nextchar; i++) {
151 if (uv2 == i->nextchar)
/* Returns the combining class of uv from the three-level table UNF_combin,
   indexed by plane (uv >> 16), row ((uv >> 8) & 0xff) and cell (uv & 0xff).
   A missing row yields 0.
   NOTE(review): the results for out-of-range code points and for a missing
   plane are decided on lines not visible here. */
157 static U8 getCombinClass(UV uv)
160 if (OVER_UTF_MAX(uv))
162 plane = (U8**)UNF_combin[uv >> 16];
165 row = plane[(uv >> 8) & 0xff];
166 return row ? row[uv & 0xff] : 0;
/* Writes the jamo decomposition of the Hangul syllable uv at d as UTF-8:
   leading consonant, vowel, then trailing consonant. Callers assign the
   result back to their write pointer and reserve 3 * UTF8_MAXLEN bytes
   before calling.
   NOTE(review): the statement taken when uv is not a Hangul syllable, the
   guard (if any) on the trailing-consonant write, and the return statement
   are on lines not visible here; presumably the trailing consonant is only
   written when tindex != 0 -- confirm. */
169 static U8* pv_cat_decompHangul(U8* d, UV uv)
171 UV sindex = uv - Hangul_SBase;
172 UV lindex = sindex / Hangul_NCount;
173 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
174 UV tindex = sindex % Hangul_TCount;
176 if (! Hangul_IsS(uv))
179 d = uvuni_to_utf8(d, (lindex + Hangul_LBase));
180 d = uvuni_to_utf8(d, (vindex + Hangul_VBase));
182 d = uvuni_to_utf8(d, (tindex + Hangul_TBase));
/* Returns a string buffer for sv; callers cast the result to U8* and use it
   as UTF-8 input, passing the address of a length variable as lp.
   The visible path works on a mortal copy (tmpsv) of the bytes, which is
   upgraded to UTF-8, rather than on sv itself.
   NOTE(review): the condition selecting this path, the final fetch of the
   buffer and the store through lp are on lines not visible here. */
186 static char* sv_2pvunicode(SV *sv, STRLEN *lp)
192 SV* tmpsv = sv_2mortal(newSVpvn(s, len));
194 s = SvPV_force(tmpsv,len);
195 sv_utf8_upgrade(tmpsv);
/* Decomposes the string s into a growable output buffer.
   Each character is decoded with utf8n_to_uvuni(); a Hangul syllable is
   expanded arithmetically, a character with a table mapping (compatibility
   mapping when iscompat is true, canonical otherwise) is replaced by that
   mapping, and any other character is re-encoded as is.
   The buffer (dstart) is enlarged with Renew() whenever the remaining room is
   too small; it is always allocated with one byte more than dlen.
   Callers pass the address of a buffer obtained from New() as dp, and after
   the call use that pointer as the start and the return value as the end of
   the output.
   NOTE(review): setup, the copy of the mapping, the re-derivation of d after
   Renew(), the update of *dp and the return are on lines not visible here. */
204 U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
213 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
215 croak(ErrRetlenIsZero, "decompose");
218 if (Hangul_IsS(uv)) {
/* offset of the write position, taken before the buffer may be reallocated */
219 STRLEN cur = d - dstart;
/* a syllable expands to at most three jamo */
221 if (dlen < cur + UTF8_MAXLEN * 3) {
222 dlen += UTF8_MAXLEN * 3;
223 Renew(dstart, dlen+1, U8);
226 d = pv_cat_decompHangul(d, uv);
229 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
/* the mapping is a NUL-terminated string */
232 STRLEN len = (STRLEN)strlen((char *)r);
233 STRLEN cur = d - dstart;
234 if (dlen < cur + len) {
236 Renew(dstart, dlen+1, U8);
243 STRLEN cur = d - dstart;
245 if (dlen < cur + UTF8_MAXLEN) {
247 Renew(dstart, dlen+1, U8);
250 d = uvuni_to_utf8(d, uv);
/* Copies the string s to d, sorting collected combining characters with
   compare_cc().
   Characters are decoded one by one and their combining class is looked up.
   Entries are collected in seq_ptr (first the fixed array seq_ary, later the
   heap array seq_ext) together with their class and their index in the run;
   the collected entries are sorted with qsort() when there are at least two
   and then written to d.
   Croaks when dlen is smaller than slen + UTF8_MAXLEN, when a decoded
   character has zero length, or when the output passes dend.
   Callers use the return value as the end of the output.
   NOTE(review): the declarations, the loop structure, the conditions that
   start and end a run, the release of seq_ext and the return are on lines
   not visible here. */
259 U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
265 UNF_cc seq_ary[CC_SEQ_SIZE];
266 UNF_cc* seq_ptr = seq_ary; /* use the fixed array at the beginning */
267 UNF_cc* seq_ext = NULL; /* heap extension, allocated if needed */
268 STRLEN seq_max = CC_SEQ_SIZE;
271 if (dlen < slen || dlen < slen + UTF8_MAXLEN)
272 croak(ErrTargetNotEnough, "reorder");
273 dend -= UTF8_MAXLEN; /* safety margin */
278 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
280 croak(ErrRetlenIsZero, "reorder");
283 curCC = getCombinClass(uv);
286 if (seq_max < cc_pos + 1) { /* extend if needed */
287 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
288 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
290 New(0, seq_ext, seq_max, UNF_cc);
291 for (i = 0; i < cc_pos; i++)
292 seq_ext[i] = seq_ary[i];
295 Renew(seq_ext, seq_max, UNF_cc);
297 seq_ptr = seq_ext; /* use seq_ext from now on */
300 seq_ptr[cc_pos].cc = curCC;
301 seq_ptr[cc_pos].uv = uv;
/* index within the run, compared by compare_cc() */
302 seq_ptr[cc_pos].pos = cc_pos;
312 if (cc_pos > 1) /* reorder only if there are at least two c.c.'s */
313 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
315 for (i = 0; i < cc_pos; i++) {
316 d = uvuni_to_utf8(d, seq_ptr[i].uv);
317 if (dend < d) /* real end is dend + UTF8_MAXLEN */
318 croak(ErrLongerThanSrc, "reorder");
324 d = uvuni_to_utf8(d, uv);
325 if (dend < d) /* real end is dend + UTF8_MAXLEN */
326 croak(ErrLongerThanSrc, "reorder");
/* Composes the string s into d.
   uvS holds the current starter; each following character is tried against
   it with composite_uv() unless it is blocked, and a composite that is not
   excluded (isExclusion()) is accepted. Characters that are not composed are
   collected in seq_ptr (first the fixed array seq_ary, later the heap array
   seq_ext) and written out after the starter.
   iscontig restricts the blocking test as shown in the condition below.
   Croaks when dlen is smaller than slen + UTF8_MAXLEN, when a decoded
   character has zero length, or when the output passes dend.
   Callers use the return value as the end of the output.
   NOTE(review): the declarations, the loop structure, several branches, the
   release of seq_ext and the return are on lines not visible here. */
335 U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
341 UV uvS = 0; /* code point of the starter */
342 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
345 UV seq_ary[CC_SEQ_SIZE];
346 UV* seq_ptr = seq_ary; /* use the fixed array at the beginning */
347 UV* seq_ext = NULL; /* heap extension, allocated if needed */
348 STRLEN seq_max = CC_SEQ_SIZE;
351 if (dlen < slen || dlen < slen + UTF8_MAXLEN)
352 croak(ErrTargetNotEnough, "compose");
353 dend -= UTF8_MAXLEN; /* safety margin */
358 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
360 croak(ErrRetlenIsZero, "compose");
363 curCC = getCombinClass(uv);
367 uvS = uv; /* the first Starter is found */
373 d = uvuni_to_utf8(d, uv);
374 if (dend < d) /* real end is dend + UTF8_MAXLEN */
375 croak(ErrLongerThanSrc, "compose");
383 if (iscontig && cc_pos || /* discontiguous combination */
384 curCC != 0 && preCC == curCC || /* blocked by same CC */
385 preCC > curCC) /* blocked by higher CC: revised D2 */
389 iscontig && cc_pos == 0 -- contiguous combination
390 curCC == 0 && preCC == 0 -- starter + starter
391 curCC != 0 && preCC < curCC -- lower CC */
393 /* try composition */
394 UV uvComp = composite_uv(uvS, uv);
396 if (uvComp && !isExclusion(uvComp)) {
400 /* preCC should not be changed to curCC */
401 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
411 if (curCC != 0 || !(p < e)) {
412 if (seq_max < cc_pos + 1) { /* extend if needed */
413 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
414 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
415 New(0, seq_ext, seq_max, UV);
416 Copy(seq_ary, seq_ext, cc_pos, UV);
419 Renew(seq_ext, seq_max, UV);
421 seq_ptr = seq_ext; /* use seq_ext from now on */
423 seq_ptr[cc_pos] = uv;
426 if (curCC != 0 && p < e)
431 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
432 if (dend < d) /* real end is dend + UTF8_MAXLEN */
433 croak(ErrLongerThanSrc, "compose");
/* write out the collected, uncomposed characters */
438 for (i = 0; i < cc_pos; i++) {
439 d = uvuni_to_utf8(d, seq_ptr[i]);
440 if (dend < d) /* real end is dend + UTF8_MAXLEN */
441 croak(ErrLongerThanSrc, "compose");
453 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
456 decompose(src, compat = &PL_sv_no)
465 s = (U8*)sv_2pvunicode(src,&slen);
466 dst = newSVpvn("", 0);
468 New(0, d, dlen+1, U8);
469 dend = pv_utf8_decompose(s, slen, &d, dlen, (bool)SvTRUE(compat));
470 sv_setpvn(dst, (char *)d, dend - d);
486 s = (U8*)sv_2pvunicode(src,&slen);
487 dst = newSVpvn("", 0);
488 dlen = slen + UTF8_MAXLEN;
489 d = (U8*)SvGROW(dst,dlen+1);
491 dend = pv_utf8_reorder(s, slen, d, dlen);
493 SvCUR_set(dst, dend - d);
503 composeContiguous = 1
509 s = (U8*)sv_2pvunicode(src,&slen);
510 dst = newSVpvn("", 0);
511 dlen = slen + UTF8_MAXLEN;
512 d = (U8*)SvGROW(dst,dlen+1);
514 dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix);
516 SvCUR_set(dst, dend - d);
529 U8 *s, *t, *tend, *d, *dend;
530 STRLEN slen, tlen, dlen;
533 s = (U8*)sv_2pvunicode(src,&slen);
535 New(0, t, tlen+1, U8);
536 tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix);
538 tlen = tend - t; /* no longer know real tlen */
541 dst = newSVpvn("", 0);
542 dlen = tlen + UTF8_MAXLEN;
543 d = (U8*)SvGROW(dst,dlen+1);
545 dend = pv_utf8_reorder(t, tlen, d, dlen);
547 SvCUR_set(dst, dend - d);
564 U8 *s, *t, *tend, *u, *uend, *d, *dend;
565 STRLEN slen, tlen, ulen, dlen;
568 s = (U8*)sv_2pvunicode(src,&slen);
570 New(0, t, tlen+1, U8);
571 tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1));
573 tlen = tend - t; /* no longer know real tlen */
576 ulen = tlen + UTF8_MAXLEN;
577 New(0, u, ulen+1, U8);
578 uend = pv_utf8_reorder(t, tlen, u, ulen);
583 dst = newSVpvn("", 0);
584 dlen = ulen + UTF8_MAXLEN;
585 d = (U8*)SvGROW(dst,dlen+1);
587 dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2));
589 SvCUR_set(dst, dend - d);
605 STRLEN srclen, retlen;
606 U8 *s, *e, *p, curCC, preCC;
609 s = (U8*)sv_2pvunicode(src,&srclen);
613 for (p = s; p < e; p += retlen) {
614 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
616 croak(ErrRetlenIsZero, "checkNFD or -NFKD");
618 curCC = getCombinClass(uv);
619 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
623 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
629 RETVAL = boolSV(result);
641 STRLEN srclen, retlen;
642 U8 *s, *e, *p, curCC, preCC;
644 bool isMAYBE = FALSE;
646 s = (U8*)sv_2pvunicode(src,&srclen);
650 for (p = s; p < e; p += retlen) {
651 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
653 croak(ErrRetlenIsZero, "checkNFC or -NFKC");
655 curCC = getCombinClass(uv);
656 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
661 /* get NFC/NFKC property */
662 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
664 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
668 else if (isComp2nd(uv))
671 char *canon, *compat;
672 /* NFKC_NO when having compatibility mapping. */
673 canon = (char *) dec_canonical(uv);
674 compat = (char *) dec_compat(uv);
675 if (compat && !(canon && strEQ(canon, compat))) {
679 } /* end of get NFC/NFKC property */
683 if (isMAYBE && result) /* NO precedes MAYBE */
685 RETVAL = boolSV(result);
697 STRLEN srclen, retlen;
698 U8 *s, *e, *p, curCC, preCC;
700 bool isMAYBE = FALSE;
702 s = (U8*)sv_2pvunicode(src,&srclen);
705 for (p = s; p < e; p += retlen) {
709 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
711 croak(ErrRetlenIsZero, "checkFCD or -FCC");
713 sCan = (U8*) dec_canonical(uv);
717 canlen = (STRLEN)strlen((char *) sCan);
718 uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
720 croak(ErrRetlenIsZero, "checkFCD or -FCC");
726 curCC = getCombinClass(uvLead);
728 if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
734 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
738 else if (isComp2nd(uv))
745 U8* eCan = sCan + canlen;
746 U8* pCan = utf8_hop(eCan, -1);
748 croak(ErrHopBeforeStart);
749 uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
751 croak(ErrRetlenIsZero, "checkFCD or -FCC");
752 preCC = getCombinClass(uvTrail);
758 if (isMAYBE && result) /* NO precedes MAYBE */
760 RETVAL = boolSV(result);
804 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
805 result = TRUE; /* NFD_NO or NFKD_NO */
806 RETVAL = boolSV(result);
821 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
822 result = TRUE; /* NFC_NO or NFKC_NO */
824 char *canon, *compat;
825 canon = (char *) dec_canonical(uv);
826 compat = (char *) dec_compat(uv);
827 if (compat && (!canon || strNE(canon, compat)))
828 result = TRUE; /* NFC_NO or NFKC_NO */
830 RETVAL = boolSV(result);
835 getComposite(uv, uv2)
842 composite = composite_uv(uv, uv2);
843 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
856 if (Hangul_IsS(uv)) {
857 U8 tmp[3 * UTF8_MAXLEN + 1];
859 U8 *e = pv_cat_decompHangul(t, uv);
860 RETVAL = newSVpvn((char *)t, e - t);
862 U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
865 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
873 splitOnLastStarter(src)
880 s = (U8*)sv_2pvunicode(src,&srclen);
887 croak(ErrHopBeforeStart);
888 uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
889 if (getCombinClass(uv) == 0) /* Last Starter found */
893 svp = sv_2mortal(newSVpvn((char*)s, p - s));
897 svp = sv_2mortal(newSVpvn((char*)p, e - p));