X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=d7b078486d44c3d7419865c8bee366ec059276f9;hb=701a277b5182d929c4baa83d419c46c6d08d2101;hp=ac90a38283fcfe8633982ecb38e024c2c10e4a90;hpb=6d47b937228c87804679e481f9585c659c524592;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index ac90a38..d7b0784 100644 --- a/utf8.c +++ b/utf8.c @@ -1180,6 +1180,30 @@ Perl_is_utf8_mark(pTHX_ U8 *p) return swash_fetch(PL_utf8_mark, p, TRUE); } +/* +=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special + +The "p" contains the pointer to the UTF-8 string encoding +the character that is being converted. + +The "ustrp" is a pointer to the character buffer to put the +conversion result to. The "lenp" is a pointer to the length +of the result. + +The "swash" is a pointer to the swash to use. + +The "normal" is a string like "ToLower" which means the swash +$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl, +and loaded by SWASHGET, using lib/utf8_heavy.pl. + +The "special" is a string like "utf8::ToSpecLower", which means +the hash %utf8::ToSpecLower, which is stored in the same file, +lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access +to the hash is by Perl_to_utf8_case(). + +=cut + */ + UV Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special) { @@ -1196,6 +1220,8 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal HE *he; uv = utf8_to_uvchr(p, 0); + if (uv <= 0xff) + uv = NATIVE_TO_UTF(uv); if ((hv = get_hv(special, FALSE)) && (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) && @@ -1206,6 +1232,7 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal if (*lenp > 1 || UNI_IS_INVARIANT(c)) Copy(s, ustrp, *lenp, U8); else { + c = UTF_TO_NATIVE(c); /* something in the 0x80..0xFF range */ ustrp[0] = UTF8_EIGHT_BIT_HI(c); ustrp[1] = UTF8_EIGHT_BIT_LO(c); @@ -1223,21 +1250,28 @@ UV Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) { return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, - &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper"); + &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper"); } UV Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) { return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, - &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle"); + &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle"); } UV Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) { return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, - &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower"); + &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower"); +} + +UV +Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp) +{ + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, + &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold"); } /* a "swash" is a swatch hash */ @@ -1313,7 +1347,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) U32 off; STRLEN slen; STRLEN needents; - U8 *tmps; + U8 *tmps = NULL; U32 bit; SV *retval; U8 tmputf8[2]; @@ -1484,4 +1518,76 @@ Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) return UNI_TO_NATIVE(uv); } +char * +Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags) +{ + int truncated = 0; + char *s, *e; + + sv_setpvn(dsv, "", 0); + for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { + UV u; + if (pvlim && SvCUR(dsv) >= pvlim) { + truncated++; + break; + } + u = utf8_to_uvchr((U8*)s, 0); + Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); + } + if (truncated) + sv_catpvn(dsv, "...", 3); + + return SvPVX(dsv); +} + +char * +Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) +{ + return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv), + pvlim, flags); +} + +I32 +Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len) +{ + register U8 *a = (U8*)s1; + register U8 *b = (U8*)s2; + STRLEN la, lb; + UV ca, cb; + STRLEN ulen1, ulen2; + U8 tmpbuf1[UTF8_MAXLEN*3+1]; + U8 tmpbuf2[UTF8_MAXLEN*3+1]; + + while (len) { + if (u1) + ca = utf8_to_uvchr((U8*)a, &la); + else { + ca = *a; + la = 1; + } + if (u2) + cb = utf8_to_uvchr((U8*)b, &lb); + else { + cb = *b; + lb = 1; + } + if (ca != cb) { + if (u1) + to_uni_lower(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1); + else + ulen1 = 1; + if (u2) + to_uni_lower(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2); + else + ulen2 = 1; + if (ulen1 != ulen2 + || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb]) + || memNE(tmpbuf1, tmpbuf2, ulen1)) + return 1; + } + a += la; + b += lb; + } + return 0; +}