From: Nicholas Clark Date: Sat, 29 Apr 2006 13:43:26 +0000 (+0000) Subject: lc plus an 8 bit locale could mangle UTF-8 values returned by X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ec9af7d430b6660eff7240fa20757fa5feb233a8;p=p5sagit%2Fp5-mst-13.2.git lc plus an 8 bit locale could mangle UTF-8 values returned by overloaded stringification. p4raw-id: //depot/perl@28011 --- diff --git a/pp.c b/pp.c index cd218cc..faf9c16 100644 --- a/pp.c +++ b/pp.c @@ -3570,107 +3570,115 @@ PP(pp_lc) { dVAR; dSP; - SV *sv = TOPs; + SV *source = TOPs; STRLEN len; + STRLEN min; + SV *dest; + const U8 *s; + U8 *d; - SvGETMAGIC(sv); - if (DO_UTF8(sv)) { + SvGETMAGIC(source); + + if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) + && !DO_UTF8(source)) { + /* We can convert in place. */ + + dest = source; + s = d = (U8*)SvPV_force_nomg(source, len); + min = len + 1; + } else { dTARGET; - const U8 *s; - STRLEN ulen; - register U8 *d; - const U8 *send; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - s = (const U8*)SvPV_nomg_const(sv,len); - if (!len) { - SvUTF8_off(TARG); /* decontaminate */ - sv_setpvn(TARG, "", 0); - sv = TARG; - SETs(sv); + dest = TARG; + + /* The old implementation would copy source into TARG at this point. + This had the side effect that if source was undef, TARG was now + an undefined SV with PADTMP set, and they don't warn inside + sv_2pv_flags(). However, we're now getting the PV direct from + source, which doesn't have PADTMP set, so it would warn. Hence the + little games. */ + + if (SvOK(source)) { + s = (const U8*)SvPV_nomg_const(source, len); + } else { + s = ""; + len = 0; } - else { - STRLEN min = len + 1; + min = len + 1; - SvUPGRADE(TARG, SVt_PV); - SvGROW(TARG, min); - (void)SvPOK_only(TARG); - d = (U8*)SvPVX(TARG); - send = s + len; - while (s < send) { - const STRLEN u = UTF8SKIP(s); - const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); + SvUPGRADE(dest, SVt_PV); + d = SvGROW(dest, min); + (void)SvPOK_only(dest); -#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */ - if (uv == GREEK_CAPITAL_LETTER_SIGMA) { - NOOP; - /* - * Now if the sigma is NOT followed by - * /$ignorable_sequence$cased_letter/; - * and it IS preceded by - * /$cased_letter$ignorable_sequence/; - * where $ignorable_sequence is - * [\x{2010}\x{AD}\p{Mn}]* - * and $cased_letter is - * [\p{Ll}\p{Lo}\p{Lt}] - * then it should be mapped to 0x03C2, - * (GREEK SMALL LETTER FINAL SIGMA), - * instead of staying 0x03A3. - * "should be": in other words, - * this is not implemented yet. - * See lib/unicore/SpecialCasing.txt. - */ - } - if (ulen > u && (SvLEN(TARG) < (min += ulen - u))) { - /* If the eventually required minimum size outgrows - * the available space, we need to grow. */ - const UV o = d - (U8*)SvPVX_const(TARG); + SETs(dest); + } - /* If someone lowercases one million U+0130s we - * SvGROW() one million times. Or we could try - * guessing how much to allocate without allocating. - * too much. Such is life. */ - SvGROW(TARG, min); - d = (U8*)SvPVX(TARG) + o; - } - Copy(tmpbuf, d, ulen, U8); - d += ulen; - s += u; + /* Overloaded values may have toggled the UTF-8 flag on source, so we need + to check DO_UTF8 again here. */ + + if (DO_UTF8(source)) { + const U8 *const send = s + len; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; + + while (s < send) { + const STRLEN u = UTF8SKIP(s); + STRLEN ulen; + const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); + +#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */ + if (uv == GREEK_CAPITAL_LETTER_SIGMA) { + NOOP; + /* + * Now if the sigma is NOT followed by + * /$ignorable_sequence$cased_letter/; + * and it IS preceded by /$cased_letter$ignorable_sequence/; + * where $ignorable_sequence is [\x{2010}\x{AD}\p{Mn}]* + * and $cased_letter is [\p{Ll}\p{Lo}\p{Lt}] + * then it should be mapped to 0x03C2, + * (GREEK SMALL LETTER FINAL SIGMA), + * instead of staying 0x03A3. + * "should be": in other words, this is not implemented yet. + * See lib/unicore/SpecialCasing.txt. + */ } - *d = '\0'; - SvUTF8_on(TARG); - SvCUR_set(TARG, d - (U8*)SvPVX_const(TARG)); - sv = TARG; - SETs(sv); - } - } - else { - U8 *s; - if (!SvPADTMP(sv) || SvREADONLY(sv)) { - dTARGET; - SvUTF8_off(TARG); /* decontaminate */ - sv_setsv_nomg(TARG, sv); - sv = TARG; - SETs(sv); + if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { + /* If the eventually required minimum size outgrows + * the available space, we need to grow. */ + const UV o = d - (U8*)SvPVX_const(dest); + + /* If someone lowercases one million U+0130s we SvGROW() one + * million times. Or we could try guessing how much to + allocate without allocating too much. Such is life. */ + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } + Copy(tmpbuf, d, ulen, U8); + d += ulen; + s += u; } - - s = (U8*)SvPV_force_nomg(sv, len); + SvUTF8_on(dest); + *d = '\0'; + SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + } else { if (len) { - register const U8 * const send = s + len; - + const U8 *const send = s + len; if (IN_LOCALE_RUNTIME) { TAINT; - SvTAINTED_on(sv); - for (; s < send; s++) - *s = toLOWER_LC(*s); + SvTAINTED_on(dest); + for (; s < send; d++, s++) + *d = toLOWER_LC(*s); } else { - for (; s < send; s++) - *s = toLOWER(*s); + for (; s < send; d++, s++) + *d = toLOWER(*s); } } + if (source != dest) { + *d = '\0'; + SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + } } - SvSETMAGIC(sv); + SvSETMAGIC(dest); RETURN; } diff --git a/t/uni/overload.t b/t/uni/overload.t index 9338f75..3ecfafb 100644 --- a/t/uni/overload.t +++ b/t/uni/overload.t @@ -7,9 +7,9 @@ BEGIN { } } -use Test::More tests => 8; +use Test::More tests => 12; -package UTF8Field; +package UTF8Toggle; use strict; use overload '""' => 'stringify'; @@ -36,9 +36,33 @@ package main; foreach my $t ("ASCII", "B\366se") { my $length = length $t; - my $u = UTF8Field->new($t); + my $u = UTF8Toggle->new($t); is (length $u, $length, "length of '$t'"); is (length $u, $length, "length of '$t'"); is (length $u, $length, "length of '$t'"); is (length $u, $length, "length of '$t'"); } + +my $have_setlocale = 0; +eval { + require POSIX; + import POSIX ':locale_h'; + $have_setlocale++; +}; + +SKIP: { + if (!$have_setlocale) { + skip "No setlocale", 4; + } elsif (!setlocale(&POSIX::LC_ALL, "en_GB.ISO8859-1")) { + skip "Could not setlocale to en_GB.ISO8859-1", 4; + } else { + use locale; + my $u = UTF8Toggle->new("\311"); + my $lc = lc $u; + is (length $lc, 1); + is ($lc, "\351", "E accute -> e accute"); + $lc = lc $u; + is (length $lc, 1); + is ($lc, "\351", "E accute -> e accute"); + } +}