From: Jarkko Hietaniemi Date: Tue, 1 Jan 2002 03:35:01 +0000 (+0000) Subject: Unnecessary/Lingering UTF8 flag might mess up caseless matching X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ffce6cc29631cbe46068af7601780981e4969f2a;p=p5sagit%2Fp5-mst-13.2.git Unnecessary/Lingering UTF8 flag might mess up caseless matching (found by Jeffrey Friedl) p4raw-id: //depot/perl@13992 --- diff --git a/regexec.c b/regexec.c index 3aed549..0f738d1 100644 --- a/regexec.c +++ b/regexec.c @@ -995,7 +995,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta c = utf8_to_uvchr((U8*)s, &len); if ( c == c1 && (ln == len || - !ibcmp_utf8(s, do_utf8, strend - s, + !ibcmp_utf8(s, do_utf8, + strend - s > ln ? ln : strend - s, m, UTF, ln)) && (norun || regtry(prog, s)) ) goto got_it; @@ -1007,7 +1008,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta && (f == c1 || f == c2) && (ln == foldlen || !ibcmp_utf8((char *)foldbuf, - do_utf8, foldlen, + do_utf8, + foldlen > ln ? ln : foldlen, m, UTF, ln)) && (norun || regtry(prog, s)) ) goto got_it; @@ -1032,7 +1034,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if ( (c == c1 || c == c2) && (ln == len || - !ibcmp_utf8(s, do_utf8, strend - s, + !ibcmp_utf8(s, do_utf8, + strend - s > ln ? ln : strend - s, m, UTF, ln)) && (norun || regtry(prog, s)) ) goto got_it; @@ -1044,7 +1047,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta && (f == c1 || f == c2) && (ln == foldlen || !ibcmp_utf8((char *)foldbuf, - do_utf8, foldlen, + do_utf8, + foldlen > ln ? ln : foldlen, m, UTF, ln)) && (norun || regtry(prog, s)) ) goto got_it; diff --git a/t/op/pat.t b/t/op/pat.t index 0eda689..b797bdf 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..825\n"; +print "1..828\n"; BEGIN { chdir 't' if -d 't'; @@ -2520,3 +2520,41 @@ print "# some Unicode properties\n"; $& eq "franc\N{COMBINING CEDILLA}ais" ? "ok 825\n" : "not ok 825\n"; } + +{ + print "# Does lingering (and useless) UTF8 flag mess up /i matching?\n"; + + { + my $regex = "ABcde"; + my $string = "abcDE\x{100}"; + chop($string); + if ($string =~ m/$regex/i) { + print "ok 826\n"; + } else { + print "not ok 826\n"; + } + } + + { + my $regex = "ABcde\x{100}"; + my $string = "abcDE"; + chop($regex); + if ($string =~ m/$regex/i) { + print "ok 827\n"; + } else { + print "not ok 827\n"; + } + } + + { + my $regex = "ABcde\x{100}"; + my $string = "abcDE\x{100}"; + chop($regex); + chop($string); + if ($string =~ m/$regex/i) { + print "ok 828\n"; + } else { + print "not ok 828\n"; + } + } +} diff --git a/utf8.c b/utf8.c index 54ab529..0051796 100644 --- a/utf8.c +++ b/utf8.c @@ -1672,9 +1672,9 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2 register U8 *be = b + len2; STRLEN la, lb; UV ca, cb; - STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXLEN_FOLD+1]; - U8 tmpbuf2[UTF8_MAXLEN_FOLD+1]; + STRLEN foldlen1, foldlen2; + U8 foldbuf1[UTF8_MAXLEN_FOLD+1]; + U8 foldbuf2[UTF8_MAXLEN_FOLD+1]; while (a < ae && b < be) { if (u1) { @@ -1682,7 +1682,7 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2 break; ca = utf8_to_uvchr((U8*)a, &la); } else { - ca = *a; + ca = NATIVE_TO_UNI(*a); la = 1; } if (u2) { @@ -1690,21 +1690,17 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2 break; cb = utf8_to_uvchr((U8*)b, &lb); } else { - cb = *b; + cb = NATIVE_TO_UNI(*b); lb = 1; } if (ca != cb) { - if (u1) - to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1); - else - ulen1 = 1; - if (u2) - to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2); - else - ulen2 = 1; - if (ulen1 != ulen2 - || (ca < 256 && cb < 256 && ca != PL_fold[cb]) - || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) + to_uni_fold(ca, foldbuf1, &foldlen1); + ca = utf8_to_uvchr(foldbuf1, 0); + + to_uni_fold(cb, foldbuf2, &foldlen2); + cb = utf8_to_uvchr(foldbuf2, 0); + + if (ca != cb || foldlen1 != foldlen2) return 1; /* mismatch */ } a += la;