Another regexp failure with utf8-flagged string and byte-flagged pattern (reminder)
Slaven Rezic [Sun, 4 Jan 2009 16:28:33 +0000 (17:28 +0100)]
Date: 17 Nov 2007 16:29:29 +0100
Message-ID: <87r6iohova.fsf@biokovo-amd64.herceg.de>

regexec.c
t/op/pat.t

index 94d6761..bc8da6e 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1007,15 +1007,16 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 
 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
+    UV uvc_unfolded = 0;                                                   \
     switch (trie_type) {                                                    \
     case trie_utf8_fold:                                                    \
        if ( foldlen>0 ) {                                                  \
-           uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
+           uvc_unfolded = uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
            foldlen -= len;                                                 \
            uscan += len;                                                   \
            len=0;                                                          \
        } else {                                                            \
-           uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );   \
+           uvc_unfolded = uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
            uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
            foldlen -= UNISKIP( uvc );                                      \
            uscan = foldbuf + UNISKIP( uvc );                               \
@@ -1054,6 +1055,9 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
                charid = (U16)SvIV(*svpp);                                  \
        }                                                                   \
     }                                                                       \
+    if (!charid && trie_type == trie_utf8_fold && !UTF && uvc_unfolded < 256) { \
+       charid = trie->charmap[uvc_unfolded]; /* charmap has only 256 byte slots */ \
+    }                                                                      \
 } STMT_END
 
 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
index aa275bd..586b317 100755 (executable)
@@ -13,7 +13,7 @@ sub run_tests;
 
 $| = 1;
 
-my $EXPECTED_TESTS = 3865;  # Update this when adding/deleting tests.
+my $EXPECTED_TESTS = 3961;  # Update this when adding/deleting tests.
 
 BEGIN {
     chdir 't' if -d 't';
@@ -3896,6 +3896,15 @@ sub run_tests {
         iseq $1, "\xd6", "Upgrade error";
     }
 
+    {
+       # More TRIE/AHOCORASICK problems with mixed utf8 / latin-1 and case
+       # folding: a utf8-upgraded char must match a byte-string pattern under /i.
+       for my $chr (160 .. 255) {
+           my $chr_byte = chr($chr);
+           my $chr_utf8 = chr($chr); utf8::upgrade($chr_utf8);
+           my $rx = qr{$chr_byte|X}i;
+           ok($chr_utf8 =~ $rx, "utf8/latin, codepoint $chr");
+       }
+    }
 
     {
         # Regardless of utf8ness any character matches itself when