From: Jarkko Hietaniemi Date: Sun, 17 Feb 2002 20:44:59 +0000 (+0000) Subject: Clearing up to_utf8_case() continues: this time use X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2a37f04dcbe9376a1280272d62501209825ac83c;p=p5sagit%2Fp5-mst-13.2.git Clearing up to_utf8_case() continues: this time use a single return, and EBCDICification for all paths. p4raw-id: //depot/perl@14734 --- diff --git a/utf8.c b/utf8.c index 314bbff..b33e3db 100644 --- a/utf8.c +++ b/utf8.c @@ -1288,8 +1288,10 @@ UV Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special) { UV uv0, uv1, uv2; - U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; + U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *d; + char *s = NULL; STRLEN len; + bool has_utf8 = FALSE; if (!*swashp) *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); @@ -1301,76 +1303,76 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma uvuni_to_utf8(tmpbuf, uv1); uv2 = swash_fetch(*swashp, tmpbuf, TRUE); if (uv2) { - /* It was "normal" (single character mapping). */ - UV uv3 = UNI_TO_NATIVE(uv2); - - len = uvchr_to_utf8(ustrp, uv3) - ustrp; - if (lenp) - *lenp = len; - - return uv3; + /* It was "normal" (a single character mapping). */ + d = uvuni_to_utf8(ustrp, uv2); + has_utf8 = !UNI_IS_INVARIANT(uv2); } else { + /* It might be "special" (sometimes, but not always, + * a multicharacter mapping) */ HV *hv; SV *keysv; HE *he; + SV *val; if ((hv = get_hv(special, FALSE)) && (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) && - (he = hv_fetch_ent(hv, keysv, FALSE, 0))) { - SV *val = HeVAL(he); - char *s = SvPV(val, len); + (he = hv_fetch_ent(hv, keysv, FALSE, 0)) && + (val = HeVAL(he))) { - if (len > 1) { + s = SvPV(val, len); + if (len == 1) + d = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)); + else { Copy(s, ustrp, len, U8); -#ifdef EBCDIC - { - /* If we have EBCDIC we need to remap the - * characters coming in from the "special" - * (usually, but not always multicharacter) - * mapping, since any characters in the low 256 - * are in Unicode code points, not EBCDIC. - * --jhi */ - U8 *d = tmpbuf; - U8 *t, *tend; - - if (SvUTF8(val)) { - STRLEN tlen = 0; - - for (t = ustrp, tend = t + len; - t < tend; t += tlen) { - UV c = utf8_to_uvchr(t, &tlen); - - if (tlen > 0) - d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); - else - break; - } - } else { - for (t = ustrp, tend = t + len; - t < tend; t++) - d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t)); - } - len = d - tmpbuf; - Copy(tmpbuf, ustrp, len, U8); - } -#endif + d = ustrp + len; } - else - len = uvchr_to_utf8(ustrp, UNI_TO_NATIVE(*(U8*)s)) - ustrp; - if (lenp) - *lenp = len; - - return utf8_to_uvchr(ustrp, 0); + if (SvUTF8(val)) + has_utf8 = TRUE; } + else { + /* It was not "special", either. */ + d = uvuni_to_utf8(ustrp, uv1); + has_utf8 = !UNI_IS_INVARIANT(uv1); + } + } - /* So it was not "special": just copy it. */ - len = uvchr_to_utf8(ustrp, uv0) - ustrp; - if (lenp) - *lenp = len; + len = d - ustrp; - return uv0; +#ifdef EBCDIC + { + /* If we have EBCDIC we need to remap the characters since + * any characters in the low 256 are in Unicode code points, + * not EBCDIC. */ + U8 *t, *tend; + + d = tmpbuf; + if (has_utf8) { + STRLEN tlen = 0; + + for (t = ustrp, tend = t + len; + t < tend; t += tlen) { + UV c = utf8_to_uvchr(t, &tlen); + + if (tlen > 0) + d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); + else + break; + } + } else { + for (t = ustrp, tend = t + len; + t < tend; t++) + d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t)); + } + len = d - tmpbuf; + Copy(tmpbuf, ustrp, len, U8); } +#endif + + if (lenp) + *lenp = len; + + return utf8_to_uvchr(ustrp, 0); } /*