From: Jarkko Hietaniemi Date: Wed, 13 Feb 2002 00:24:37 +0000 (+0000) Subject: Rewrite the "special mapping" part of to_utf8_case(), X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d2dcd0fb0e5b4b6b0e01e4cff08a37dff0d015ce;p=p5sagit%2Fp5-mst-13.2.git Rewrite the "special mapping" part of to_utf8_case(), this time with fewer bugs. (See: The Law of Cybernetic Entomology.) p4raw-id: //depot/perl@14664 --- diff --git a/utf8.c b/utf8.c index 6c9cebd..4913b44 100644 --- a/utf8.c +++ b/utf8.c @@ -1303,36 +1303,56 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) && (he = hv_fetch_ent(hv, keysv, FALSE, 0))) { SV *val = HeVAL(he); - char *s = SvPV(val, *lenp); - U8 c = *(U8*)s; + STRLEN len; + char *s = SvPV(val, len); - if (*lenp > 1 || UNI_IS_INVARIANT(c)) - Copy(s, ustrp, *lenp, U8); - else { - /* something in the 0x80..0xFF range */ - ustrp[0] = UTF8_EIGHT_BIT_HI(c); - ustrp[1] = UTF8_EIGHT_BIT_LO(c); - *lenp = 2; - } + if (len > 1) { + Copy(s, ustrp, len, U8); #ifdef EBCDIC - { - U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; - U8 *d = tmpbuf; - U8 *t, *tend; - STRLEN tlen; - - for (t = ustrp, tend = t + *lenp; t < tend; t += tlen) { - UV c = utf8_to_uvchr(t, &tlen); - - if (tlen > 0) - d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); - else - break; + { + /* If we have EBCDIC we need to remap the + * characters coming in from the "special" + * (usually, but not always multicharacter) + * mapping, since any characters in the low 256 + * are in Unicode code points, not EBCDIC. + * If we either had a bit in the "special" + * mappings indicating "contains lower 256", + * or if we on EBCDIC platforms regenerate the + * lib/unicore/To/Foo.pl, we could do without + * this, but for now, let's do it this way. 
+ * --jhi */ + + U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; + U8 *d = tmpbuf; + U8 *t, *tend; + STRLEN tlen; + + for (t = ustrp, tend = t + len; t < tend; t += tlen) { + UV c = utf8_to_uvchr(t, &tlen); + + if (tlen > 0) + d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); + else + break; + } + len = d - tmpbuf; + Copy(tmpbuf, ustrp, len, U8); } - *lenp = d - tmpbuf; - Copy(tmpbuf, ustrp, *lenp, U8); - } #endif + } + else { + U8 c = UNI_TO_NATIVE(*s); + + if (NATIVE_IS_INVARIANT(c)) + ustrp[0] = c; + else { + ustrp[0] = UTF8_EIGHT_BIT_HI(c); + ustrp[1] = UTF8_EIGHT_BIT_LO(c); + len = 2; + } + } + if (lenp) + *lenp = len; return utf8_to_uvchr(ustrp, 0); } uv = NATIVE_TO_UNI(uv);