More UTF-8 EXACT tweaking, plus a forgotten UTF-8 flag setting (RExC_utf8 in regcomp.c)

diff --git a/regcomp.c b/regcomp.c

index cd3857e..12e0395 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1764,7 +1764,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
     pm->op_pmflags = RExC_flags16;
     if (UTF)
-       r->reganch |= ROPT_UTF8;
+        r->reganch |= ROPT_UTF8;       /* Unicode in it? */
     r->regstclass = NULL;
     if (RExC_naughty >= 10)    /* Probably an expensive pattern. */
        r->reganch |= ROPT_NAUGHTY;
@@ -3168,6 +3168,7 @@ tryagain:
              RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
         } else
              RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+        RExC_utf8 = 1;
     }
 
     return(ret);
diff --git a/regexec.c b/regexec.c

index 60d93f7..712c4d9 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -2204,43 +2204,40 @@ S_regmatch(pTHX_ regnode *prog)
            s = STRING(scan);
            ln = STR_LEN(scan);
            if (do_utf8 != (UTF!=0)) {
+               /* The target and the pattern have differing "utf8ness". */
                char *l = locinput;
                char *e = s + ln;
                STRLEN len;
 
-               if (do_utf8)
+               if (do_utf8) {
+                   /* The target is utf8, the pattern is not utf8. */
                    while (s < e) {
-                       UV uv;
-
                        if (l >= PL_regeol)
-                           sayNO;
-                       uv = NATIVE_TO_UNI(*(U8*)s);
-                       if (UTF8_IS_START(uv)) {
-                            len = UTF8SKIP(s);
-                            if (memNE(s, l, len))
-                                 sayNO;
-                            l += len;
-                            s += len;
-                       } else {
-                            if (uv != utf8_to_uvchr((U8*)l, &len))
-                                 sayNO;
-                            l += len;
-                            s ++;
-                       }
+                            sayNO;
+                       if (NATIVE_TO_UNI(*(U8*)s) !=
+                           utf8_to_uvchr((U8*)l, &len))
+                            sayNO;
+                       l += len;
+                       s ++;
                    }
-               else
+               }
+               else {
+                   /* The target is not utf8, the pattern is utf8. */
                    while (s < e) {
                        if (l >= PL_regeol)
                            sayNO;
-                       if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len))
+                       if (NATIVE_TO_UNI(*((U8*)l)) !=
+                           utf8_to_uvchr((U8*)s, &len))
                            sayNO;
                        s += len;
                        l ++;
                    }
+               }
                locinput = l;
                nextchr = UCHARAT(locinput);
                break;
            }
+           /* The target and the pattern have the same "utf8ness". */
            /* Inline the first character, for speed. */
            if (UCHARAT(s) != nextchr)
                sayNO;
regcomp.c		patch | blob | blame | history
regexec.c		patch | blob | blame | history