From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Tue, 6 Nov 2001 15:18:41 +0000 (+0000)
Subject: More UTF-8 EXACT tweaking, plus a forgotten UTF-8
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=5ff6fc6d3e84f8da3756f8b5246037f5e410021e;p=p5sagit%2Fp5-mst-13.2.git

More tweaking of UTF-8 EXACT matching, plus a previously forgotten
UTF-8 toggle-on for the encoding pragma.

p4raw-id: //depot/perl@12872
---

diff --git a/regcomp.c b/regcomp.c
index cd3857e..12e0395 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1764,7 +1764,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
     pm->op_pmflags = RExC_flags16;
     if (UTF)
-	r->reganch |= ROPT_UTF8;
+        r->reganch |= ROPT_UTF8;	/* Unicode in it? */
     r->regstclass = NULL;
     if (RExC_naughty >= 10)	/* Probably an expensive pattern. */
 	r->reganch |= ROPT_NAUGHTY;
@@ -3168,6 +3168,7 @@ tryagain:
 	      RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
 	 } else
 	      RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+	 RExC_utf8 = 1;
     }
 
     return(ret);
diff --git a/regexec.c b/regexec.c
index 60d93f7..712c4d9 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2204,43 +2204,40 @@ S_regmatch(pTHX_ regnode *prog)
 	    s = STRING(scan);
 	    ln = STR_LEN(scan);
 	    if (do_utf8 != (UTF!=0)) {
+		/* The target and the pattern have differing "utf8ness". */
 		char *l = locinput;
 		char *e = s + ln;
 		STRLEN len;
 
-		if (do_utf8)
+		if (do_utf8) {
+		    /* The target is utf8, the pattern is not utf8. */
 		    while (s < e) {
-			UV uv;
-
 			if (l >= PL_regeol)
-			    sayNO;
-			uv = NATIVE_TO_UNI(*(U8*)s);
-			if (UTF8_IS_START(uv)) {
-			     len = UTF8SKIP(s);
-			     if (memNE(s, l, len))
-				  sayNO;
-			     l += len;
-			     s += len;
-			} else {
-			     if (uv != utf8_to_uvchr((U8*)l, &len))
-				  sayNO;
-			     l += len;
-			     s ++;
-			}
+			     sayNO;
+			if (NATIVE_TO_UNI(*(U8*)s) !=
+			    utf8_to_uvchr((U8*)l, &len))
+			     sayNO;
+			l += len;
+			s ++;
 		    }
-		else
+		}
+		else {
+		    /* The target is not utf8, the pattern is utf8. */
 		    while (s < e) {
 			if (l >= PL_regeol)
 			    sayNO;
-			if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
+			if (NATIVE_TO_UNI(*((U8*)l)) !=
+			    utf8_to_uvchr((U8*)s, &len))
 			    sayNO;
 			s += len;
 			l ++;
 		    }
+		}
 		locinput = l;
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
+	    /* The target and the pattern have the same "utf8ness". */
 	    /* Inline the first character, for speed. */
 	    if (UCHARAT(s) != nextchr)
 		sayNO;