From: Slaven Rezic <slaven@rezic.de>
Date: Sun, 4 Jan 2009 16:28:33 +0000 (+0100)
Subject: Another regexp failure with utf8-flagged string and byte-flagged pattern (reminder)
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c012444fd89eef64e1d1687642cdb9f968e96739;p=p5sagit%2Fp5-mst-13.2.git

Another regexp failure with utf8-flagged string and byte-flagged pattern (reminder)

Date: 17 Nov 2007 16:29:29 +0100
Message-ID: <87r6iohova.fsf@biokovo-amd64.herceg.de>
---

diff --git a/regexec.c b/regexec.c
index 94d6761..bc8da6e 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1007,15 +1007,16 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 
 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
+    UV uvc_unfolded = 0;						    \
     switch (trie_type) {                                                    \
     case trie_utf8_fold:                                                    \
 	if ( foldlen>0 ) {                                                  \
-	    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
+	    uvc_unfolded = uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
 	    foldlen -= len;                                                 \
 	    uscan += len;                                                   \
 	    len=0;                                                          \
 	} else {                                                            \
-	    uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );   \
+	    uvc_unfolded = uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
 	    uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
 	    foldlen -= UNISKIP( uvc );                                      \
 	    uscan = foldbuf + UNISKIP( uvc );                               \
@@ -1054,6 +1055,9 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
 		charid = (U16)SvIV(*svpp);                                  \
 	}                                                                   \
     }                                                                       \
+    if (!charid && trie_type == trie_utf8_fold && !UTF && uvc_unfolded < 256) { \
+	charid = trie->charmap[uvc_unfolded]; /* charmap: 256 slots only */ \
+    }								    	    \
 } STMT_END
 
 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
diff --git a/t/op/pat.t b/t/op/pat.t
index aa275bd..586b317 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -13,7 +13,7 @@ sub run_tests;
 
 $| = 1;
 
-my $EXPECTED_TESTS = 3865;  # Update this when adding/deleting tests.
+my $EXPECTED_TESTS = 3961;  # Update this when adding/deleting tests.
 
 BEGIN {
     chdir 't' if -d 't';
@@ -3896,6 +3896,15 @@ sub run_tests {
         iseq $1, "\xd6", "Upgrade error";
     }
 
+    {
+# TRIE/AHOCORASICK: utf8-upgraded string vs. byte (latin-1) pattern under /i
+	for my $chr (160 .. 255) {
+	    my $chr_byte = chr($chr);
+	    my $chr_utf8 = chr($chr); utf8::upgrade($chr_utf8);
+	    my $rx = qr{$chr_byte|X}i;
+	    ok($chr_utf8 =~ $rx, "utf8/latin, codepoint $chr");
+	}
+    }
 
     {
         # Regardless of utf8ness any character matches itself when