From: Nicholas Clark <nick@ccl4.org>
Date: Sat, 29 Apr 2006 15:55:51 +0000 (+0000)
Subject: lcfirst/ucfist plus an 8 bit locale could mangle UTF-8 values
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d54190f6ca0aac8b08bb59370c53932771933c40;p=p5sagit%2Fp5-mst-13.2.git

lcfirst/ucfist plus an 8 bit locale could mangle UTF-8 values
returned by overloaded stringification.

p4raw-id: //depot/perl@28013
---

diff --git a/pp.c b/pp.c
index 86299ac..b937e0a 100644
--- a/pp.c
+++ b/pp.c
@@ -3413,28 +3413,64 @@ PP(pp_ucfirst)
 {
     dVAR;
     dSP;
-    SV *sv = TOPs;
-    const U8 *s;
+    SV *source = TOPs;
     STRLEN slen;
+    STRLEN need;
+    SV *dest;
+    bool inplace = TRUE;
+    bool doing_utf8;
     const int op_type = PL_op->op_type;
+    const U8 *s;
+    U8 *d;
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+    STRLEN ulen;
+    STRLEN tculen;
 
-    SvGETMAGIC(sv);
-    if (DO_UTF8(sv) &&
-	(s = (const U8*)SvPV_nomg_const(sv, slen)) && slen &&
-	UTF8_IS_START(*s)) {
-	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-	STRLEN ulen;
-	STRLEN tculen;
+    SvGETMAGIC(source);
+    if (SvOK(source)) {
+	s = (const U8*)SvPV_nomg_const(source, slen);
+    } else {
+	s = "";
+	slen = 0;
+    }
 
+    if (slen && DO_UTF8(source) && UTF8_IS_START(*s)) {
+	doing_utf8 = TRUE;
 	utf8_to_uvchr(s, &ulen);
 	if (op_type == OP_UCFIRST) {
 	    toTITLE_utf8(s, tmpbuf, &tculen);
 	} else {
 	    toLOWER_utf8(s, tmpbuf, &tculen);
 	}
+	/* If the two differ, we definately cannot do inplace.  */
+	inplace = ulen == tculen;
+	need = slen + 1 - ulen + tculen;
+    } else {
+	doing_utf8 = FALSE;
+	need = slen + 1;
+    }
+
+    if (SvPADTMP(source) && !SvREADONLY(source) && inplace) {
+	/* We can convert in place.  */
+
+	dest = source;
+	s = d = (U8*)SvPV_force_nomg(source, slen);
+    } else {
+	dTARGET;
 
-	if (!SvPADTMP(sv) || SvREADONLY(sv) || ulen != tculen) {
-	    dTARGET;
+	dest = TARG;
+
+	SvUPGRADE(dest, SVt_PV);
+	d = SvGROW(dest, need);
+	(void)SvPOK_only(dest);
+
+	SETs(dest);
+
+	inplace = FALSE;
+    }
+
+    if (doing_utf8) {
+	if(!inplace) {
 	    /* slen is the byte length of the whole SV.
 	     * ulen is the byte length of the original Unicode character
 	     * stored as UTF-8 at s.
@@ -3442,40 +3478,41 @@ PP(pp_ucfirst)
 	     * lowercased) Unicode character stored as UTF-8 at tmpbuf.
 	     * We first set the result to be the titlecased (/lowercased)
 	     * character, and then append the rest of the SV data. */
-	    sv_setpvn(TARG, (char*)tmpbuf, tculen);
+	    sv_setpvn(dest, (char*)tmpbuf, tculen);
 	    if (slen > ulen)
-	        sv_catpvn(TARG, (char*)(s + ulen), slen - ulen);
-	    SvUTF8_on(TARG);
-	    sv = TARG;
-	    SETs(sv);
+	        sv_catpvn(dest, (char*)(s + ulen), slen - ulen);
+	    SvUTF8_on(dest);
 	}
 	else {
-	    s = (U8*)SvPV_force_nomg(sv, slen);
-	    Copy(tmpbuf, s, tculen, U8);
+	    Copy(tmpbuf, d, tculen, U8);
+	    SvCUR_set(dest, need - 1);
 	}
     }
     else {
-	U8 *s1;
-	if (!SvPADTMP(sv) || SvREADONLY(sv)) {
-	    dTARGET;
-	    SvUTF8_off(TARG);				/* decontaminate */
-	    sv_setsv_nomg(TARG, sv);
-	    sv = TARG;
-	    SETs(sv);
-	}
-	s1 = (U8*)SvPV_force_nomg(sv, slen);
-	if (*s1) {
+	if (*s) {
 	    if (IN_LOCALE_RUNTIME) {
 		TAINT;
-		SvTAINTED_on(sv);
-		*s1 = (op_type == OP_UCFIRST)
-		    ? toUPPER_LC(*s1) : toLOWER_LC(*s1);
+		SvTAINTED_on(dest);
+		*d = (op_type == OP_UCFIRST)
+		    ? toUPPER_LC(*s) : toLOWER_LC(*s);
 	    }
 	    else
-		*s1 = (op_type == OP_UCFIRST) ? toUPPER(*s1) : toLOWER(*s1);
+		*d = (op_type == OP_UCFIRST) ? toUPPER(*s) : toLOWER(*s);
+	} else {
+	    /* See bug #39028  */
+	    *d = *s;
+	}
+
+	if (SvUTF8(source))
+	    SvUTF8_on(dest);
+
+	if (!inplace) {
+	    /* This will copy the trailing NUL  */
+	    Copy(s + 1, d + 1, slen, U8);
+	    SvCUR_set(dest, need - 1);
 	}
     }
-    SvSETMAGIC(sv);
+    SvSETMAGIC(dest);
     RETURN;
 }
 
diff --git a/t/uni/overload.t b/t/uni/overload.t
index 407d4c6..38328f1 100644
--- a/t/uni/overload.t
+++ b/t/uni/overload.t
@@ -7,7 +7,7 @@ BEGIN {
     }
 }
 
-use Test::More tests => 16;
+use Test::More tests => 24;
 
 package UTF8Toggle;
 use strict;
@@ -72,5 +72,21 @@ SKIP: {
 	$uc = uc $u;
 	is (length $uc, 1);
 	is ($uc, "\311", "e accute -> E accute");
+
+	$u = UTF8Toggle->new("\311");
+	$lc = lcfirst $u;
+	is (length $lc, 1);
+	is ($lc, "\351", "E accute -> e accute");
+	$lc = lcfirst $u;
+	is (length $lc, 1);
+	is ($lc, "\351", "E accute -> e accute");
+
+	$u = UTF8Toggle->new("\351");
+	$uc = ucfirst $u;
+	is (length $uc, 1);
+	is ($uc, "\311", "e accute -> E accute");
+	$uc = ucfirst $u;
+	is (length $uc, 1);
+	is ($uc, "\311", "e accute -> E accute");
     }
 }