From: Nicholas Clark <nick@ccl4.org>
Date: Mon, 6 Feb 2006 22:28:41 +0000 (+0000)
Subject: Optimise index so that if the big string is ISO-8859-1 but the little
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2f040f7f3a7618c48a8d153deb2b7e4a59efefb0;p=p5sagit%2Fp5-mst-13.2.git

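Optimise index so that if the big string is ISO-8859-1 but the little
string is UTF-8 (and PL_encoding is not set), it tries to downgrade the
little string rather than upgrade the big string. If the little string
cannot be downgraded, it cannot occur anywhere in the big string, so
index reports "not found" without searching. For a half-megabyte big
string this is a fourfold speed gain.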

p4raw-id: //depot/perl@27113
---

diff --git a/pp.c b/pp.c
index 96d5ef6..e9f159b 100644
--- a/pp.c
+++ b/pp.c
@@ -3117,8 +3117,8 @@ PP(pp_index)
     const char *tmps2;
     STRLEN biglen;
     const I32 arybase = PL_curcop->cop_arybase;
-    int big_utf8;
-    int little_utf8;
+    bool big_utf8;
+    bool little_utf8;
 
     if (MAXARG < 3)
 	offset = 0;
@@ -3130,22 +3130,43 @@ PP(pp_index)
     little_utf8 = DO_UTF8(little);
     if (big_utf8 ^ little_utf8) {
 	/* One needs to be upgraded.  */
-	SV * const bytes = little_utf8 ? big : little;
-	STRLEN len;
-	const char * const p = SvPV_const(bytes, len);
-
-	temp = newSVpvn(p, len);
+	if (little_utf8 && !PL_encoding) {
+	    /* Well, maybe instead we might be able to downgrade the small
+	       string?  */
+	    STRLEN little_len;
+	    const U8 * const little_pv = (U8*) SvPV_const(little, little_len);
+	    char * const pv = (char*)bytes_from_utf8(little_pv, &little_len,
+						     &little_utf8);
+	    if (little_utf8) {
+		/* If the large string is ISO-8859-1, and it's not possible to
+		   convert the small string to ISO-8859-1, then there is no
+		   way that it could be found anywhere by index.  */
+		retval = -1;
+		goto fail;
+	    }
 
-	if (PL_encoding) {
-	    sv_recode_to_utf8(temp, PL_encoding);
-	} else {
-	    sv_utf8_upgrade(temp);
-	}
-	if (little_utf8) {
-	    big = temp;
-	    big_utf8 = TRUE;
+	    /* At this point, pv is a malloc()ed string. So donate it to temp
+	       to ensure it will get free()d  */
+	    little = temp = newSV(0);
+	    sv_usepvn(temp, pv, little_len);
 	} else {
-	    little = temp;
+	    SV * const bytes = little_utf8 ? big : little;
+	    STRLEN len;
+	    const char * const p = SvPV_const(bytes, len);
+
+	    temp = newSVpvn(p, len);
+
+	    if (PL_encoding) {
+		sv_recode_to_utf8(temp, PL_encoding);
+	    } else {
+		sv_utf8_upgrade(temp);
+	    }
+	    if (little_utf8) {
+		big = temp;
+		big_utf8 = TRUE;
+	    } else {
+		little = temp;
+	    }
 	}
     }
     if (big_utf8 && offset > 0)
@@ -3158,12 +3179,14 @@ PP(pp_index)
     if (!(tmps2 = fbm_instr((unsigned char*)tmps + offset,
       (unsigned char*)tmps + biglen, little, 0)))
 	retval = -1;
-    else
+    else {
 	retval = tmps2 - tmps;
-    if (retval > 0 && big_utf8)
-	sv_pos_b2u(big, &retval);
+	if (big_utf8)
+	    sv_pos_b2u(big, &retval);
+    }
     if (temp)
 	SvREFCNT_dec(temp);
+ fail:
     PUSHi(retval + arybase);
     RETURN;
 }