Unnecessary/Lingering UTF8 flag might mess up caseless matching
Jarkko Hietaniemi [Tue, 1 Jan 2002 03:35:01 +0000 (03:35 +0000)]
(found by Jeffrey Friedl)

p4raw-id: //depot/perl@13992

regexec.c
t/op/pat.t
utf8.c

diff --git a/regexec.c b/regexec.c
index 3aed549..0f738d1 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -995,7 +995,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                        c = utf8_to_uvchr((U8*)s, &len);
                        if ( c == c1
                             && (ln == len ||
-                                !ibcmp_utf8(s, do_utf8, strend - s,
+                                !ibcmp_utf8(s, do_utf8,
+                                            strend - s > ln ? ln : strend - s,
                                             m, UTF, ln))
                             && (norun || regtry(prog, s)) )
                            goto got_it;
@@ -1007,7 +1008,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                                  && (f == c1 || f == c2)
                                  && (ln == foldlen ||
                                      !ibcmp_utf8((char *)foldbuf,
-                                                 do_utf8, foldlen,
+                                                 do_utf8,
+                                                 foldlen > ln ? ln : foldlen,
                                                  m, UTF, ln))
                                  && (norun || regtry(prog, s)) )
                                  goto got_it;
@@ -1032,7 +1034,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 
                        if ( (c == c1 || c == c2)
                             && (ln == len ||
-                                !ibcmp_utf8(s, do_utf8, strend - s,
+                                !ibcmp_utf8(s, do_utf8,
+                                            strend - s > ln ? ln : strend - s,
                                             m, UTF, ln))
                             && (norun || regtry(prog, s)) )
                            goto got_it;
@@ -1044,7 +1047,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                                  && (f == c1 || f == c2)
                                  && (ln == foldlen ||
                                      !ibcmp_utf8((char *)foldbuf,
-                                                 do_utf8, foldlen,
+                                                 do_utf8,
+                                                 foldlen > ln ? ln : foldlen,
                                                  m, UTF, ln))
                                  && (norun || regtry(prog, s)) )
                                  goto got_it;
diff --git a/t/op/pat.t b/t/op/pat.t
index 0eda689..b797bdf 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..825\n";
+print "1..828\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -2520,3 +2520,41 @@ print "# some Unicode properties\n";
        $& eq "franc\N{COMBINING CEDILLA}ais" ?
        "ok 825\n" : "not ok 825\n";
 }
+
+{
+    print "# Does lingering (and useless) UTF8 flag mess up /i matching?\n";
+
+    {
+       my $regex  = "ABcde";
+       my $string = "abcDE\x{100}";
+       chop($string);
+       if ($string =~ m/$regex/i) {
+           print "ok 826\n";
+       } else {
+           print "not ok 826\n";
+       }
+    }
+
+    {
+       my $regex  = "ABcde\x{100}";
+       my $string = "abcDE";
+       chop($regex);
+       if ($string =~ m/$regex/i) {
+           print "ok 827\n";
+       } else {
+           print "not ok 827\n";
+       }
+    }
+
+    {
+       my $regex  = "ABcde\x{100}";
+       my $string = "abcDE\x{100}";
+       chop($regex);
+       chop($string);
+       if ($string =~ m/$regex/i) {
+           print "ok 828\n";
+       } else {
+           print "not ok 828\n";
+       }
+    }
+}
diff --git a/utf8.c b/utf8.c
index 54ab529..0051796 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1672,9 +1672,9 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2
      register U8 *be = b + len2;
      STRLEN la, lb;
      UV ca, cb;
-     STRLEN ulen1, ulen2;
-     U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
-     U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
+     STRLEN foldlen1, foldlen2;
+     U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
+     U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
      
      while (a < ae && b < be) {
          if (u1) {
@@ -1682,7 +1682,7 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2
                    break;
               ca = utf8_to_uvchr((U8*)a, &la);
          } else {
-              ca = *a;
+              ca = NATIVE_TO_UNI(*a);
               la = 1;
          }
          if (u2) {
@@ -1690,21 +1690,17 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2
                    break;
               cb = utf8_to_uvchr((U8*)b, &lb);
          } else {
-              cb = *b;
+              cb = NATIVE_TO_UNI(*b);
               lb = 1;
          }
          if (ca != cb) {
-              if (u1)
-                   to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
-              else
-                   ulen1 = 1;
-              if (u2)
-                   to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
-              else
-                   ulen2 = 1;
-              if (ulen1 != ulen2
-                  || (ca < 256 && cb < 256 && ca != PL_fold[cb])
-                  || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
+              to_uni_fold(ca, foldbuf1, &foldlen1);
+              ca = utf8_to_uvchr(foldbuf1, 0);
+              
+              to_uni_fold(cb, foldbuf2, &foldlen2);
+              cb = utf8_to_uvchr(foldbuf2, 0);
+
+              if (ca != cb || foldlen1 != foldlen2)
                    return 1; /* mismatch */
          }
          a += la;