From: Slaven Rezic Date: Sun, 4 Jan 2009 16:28:33 +0000 (+0100) Subject: Another regexp failure with utf8-flagged string and byte-flagged pattern (reminder) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c012444fd89eef64e1d1687642cdb9f968e96739;p=p5sagit%2Fp5-mst-13.2.git Another regexp failure with utf8-flagged string and byte-flagged pattern (reminder) Date: 17 Nov 2007 16:29:29 +0100 Message-ID: <87r6iohova.fsf@biokovo-amd64.herceg.de> --- diff --git a/regexec.c b/regexec.c index 94d6761..bc8da6e 100644 --- a/regexec.c +++ b/regexec.c @@ -1007,15 +1007,16 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ + UV uvc_unfolded = 0; \ switch (trie_type) { \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc_unfolded = uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc_unfolded = uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ foldlen -= UNISKIP( uvc ); \ uscan = foldbuf + UNISKIP( uvc ); \ @@ -1054,6 +1055,9 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ charid = (U16)SvIV(*svpp); \ } \ } \ + if (!charid && trie_type == trie_utf8_fold && !UTF) { \ + charid = trie->charmap[uvc_unfolded]; \ + } \ } STMT_END #define REXEC_FBC_EXACTISH_CHECK(CoNd) \ diff --git a/t/op/pat.t b/t/op/pat.t index aa275bd..586b317 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -13,7 +13,7 @@ sub run_tests; $| = 1; -my $EXPECTED_TESTS = 3865; # Update this when adding/deleting tests. +my $EXPECTED_TESTS = 3961; # Update this when adding/deleting tests. BEGIN { chdir 't' if -d 't'; @@ -3896,6 +3896,15 @@ sub run_tests { iseq $1, "\xd6", "Upgrade error"; } + { +# more TRIE/AHOCORASICK problems with mixed utf8 / latin-1 and case folding + for my $chr (160 .. 255) { + my $chr_byte = chr($chr); + my $chr_utf8 = chr($chr); utf8::upgrade($chr_utf8); + my $rx = qr{$chr_byte|X}i; + ok($chr_utf8 =~ $rx, "utf8/latin, codepoint $chr"); + } + } { # Regardless of utf8ness any character matches itself when