From: Yves Orton Date: Mon, 17 Dec 2007 15:21:46 +0000 (+0100) Subject: Fix various bugs in regex engine with mixed utf8/latin pattern and strings. Related... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a0a388a13daad79062b3c4b126f22d2f46fc82dd;p=p5sagit%2Fp5-mst-13.2.git Fix various bugs in regex engine with mixed utf8/latin pattern and strings. Related to [perl #36207] among others Message-ID: <9b18b3110712170621h41de2c76k331971e3660abcb0@mail.gmail.com> p4raw-id: //depot/perl@32628 --- diff --git a/MANIFEST b/MANIFEST index 810f749..490dc10 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3786,6 +3786,7 @@ t/op/read.t See if read() works t/op/recurse.t See if deep recursion works t/op/ref.t See if refs and objects work t/op/reg_email.t See if regex recursion works by parsing email addresses +t/op/reg_fold.t See if case folding works properly t/op/regexp_noamp.t See if regular expressions work with optimizations t/op/regexp_notrie.t See if regular expressions work without trie optimisation t/op/regexp_qr_embed.t See if regular expressions work with embedded qr// diff --git a/regcomp.c b/regcomp.c index 19d6f15..5a175ba 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2791,7 +2791,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, last = cur; } } else { - if ( last ) { +/* + Currently we assume that the trie can handle unicode and ascii + matches fold cased matches. If this proves true then the following + define will prevent tries in this situation. + + #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) +*/ +#define TRIE_TYPE_IS_SAFE 1 + if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, startbranch, first, cur, tail, count, optype, depth+1 ); @@ -2819,7 +2827,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last ) { + + if ( last && TRIE_TYPE_IS_SAFE ) { made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && @@ -6867,6 +6876,7 @@ tryagain: case 0xDF: case 0xC3: case 0xCE: + do_foldchar: if (!LOC && FOLD) { U32 len,cp; len=0; /* silence a spurious compiler warning */ @@ -6893,7 +6903,11 @@ tryagain: required, as the default for this switch is to jump to the literal text handling code. */ - switch (*++RExC_parse) { + switch ((U8)*++RExC_parse) { + case 0xDF: + case 0xC3: + case 0xCE: + goto do_foldchar; /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -7211,8 +7225,13 @@ tryagain: an unescaped equivalent literal. */ - switch (*++p) { + switch ((U8)*++p) { /* These are all the special escapes. */ + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case 'A': /* Start assertion */ case 'b': case 'B': /* Word-boundary assertion*/ case 'C': /* Single char !DANGEROUS! */ diff --git a/regexec.c b/regexec.c index 1b9fca1..6afcea4 100644 --- a/regexec.c +++ b/regexec.c @@ -989,7 +989,11 @@ Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos, return NULL; } - +#define DECL_TRIE_TYPE(scan) \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + trie_type = (scan->flags != EXACT) \ + ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \ + : (do_utf8 ? trie_utf8 : trie_plain) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ @@ -1007,6 +1011,19 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ uscan = foldbuf + UNISKIP( uvc ); \ } \ break; \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \ + foldlen -= UNISKIP( uvc ); \ + uscan = foldbuf + UNISKIP( uvc ); \ + } \ + break; \ case trie_utf8: \ uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ @@ -1029,12 +1046,14 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ } \ } STMT_END -#define REXEC_FBC_EXACTISH_CHECK(CoNd) \ +#define REXEC_FBC_EXACTISH_CHECK(CoNd) \ +{ \ + char *my_strend= (char *)strend; \ if ( (CoNd) \ && (ln == len || \ - ibcmp_utf8(s, NULL, 0, do_utf8, \ + !ibcmp_utf8(s, &my_strend, 0, do_utf8, \ m, NULL, ln, (bool)UTF)) \ - && (!reginfo || regtry(reginfo, &s)) ) \ + && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ else { \ U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \ @@ -1042,15 +1061,14 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \ if ( f != c \ && (f == c1 || f == c2) \ - && (ln == foldlen || \ - !ibcmp_utf8((char *) foldbuf, \ - NULL, foldlen, do_utf8, \ - m, \ - NULL, ln, (bool)UTF)) \ - && (!reginfo || regtry(reginfo, &s)) ) \ + && (ln == len || \ + !ibcmp_utf8(s, &my_strend, 0, do_utf8,\ + m, NULL, ln, (bool)UTF)) \ + && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ } \ - s += len +} \ +s += len #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ STMT_START { \ @@ -1210,14 +1228,26 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; const U32 uniflags = UTF8_ALLOW_DEFAULT; - - to_utf8_lower((U8*)m, tmpbuf1, &ulen1); - to_utf8_upper((U8*)m, tmpbuf2, &ulen2); - + + /* XXX: Since the node will be case folded at compile + time this logic is a little odd, although im not + sure that its actually wrong. --dmq */ + + c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1); + c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2); + + /* XXX: This is kinda strange. to_utf8_XYZ returns the + codepoint of the first character in the converted + form, yet originally we did the extra step. + No tests fail by commenting this code out however + so Ive left it out. -- dmq. + c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 0, uniflags); c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE, 0, uniflags); + */ + lnc = 0; while (sm < ((U8 *) m + ln)) { lnc++; @@ -1252,24 +1282,33 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * matching (called "loose matching" in Unicode). * ibcmp_utf8() will do just that. */ - if (do_utf8) { + if (do_utf8 || UTF) { UV c, f; U8 tmpbuf [UTF8_MAXBYTES+1]; - STRLEN len, foldlen; + STRLEN len = 1; + STRLEN foldlen; const U32 uniflags = UTF8_ALLOW_DEFAULT; if (c1 == c2) { /* Upper and lower of 1st char are equal - * probably not a "letter". */ while (s <= e) { - c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, + if (do_utf8) { + c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, uniflags); + } else { + c = *((U8*)s); + } REXEC_FBC_EXACTISH_CHECK(c == c1); } } else { while (s <= e) { - c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, + if (do_utf8) { + c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, uniflags); + } else { + c = *((U8*)s); + } /* Handle some of the three Greek sigmas cases. * Note that not all the possible combinations @@ -1287,6 +1326,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } } else { + /* Neither pattern nor string are UTF8 */ if (c1 == c2) REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); else @@ -1461,10 +1501,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case AHOCORASICKC: case AHOCORASICK: { - const enum { trie_plain, trie_utf8, trie_utf8_fold } - trie_type = do_utf8 ? - (c->flags == EXACT ? trie_utf8 : trie_utf8_fold) - : trie_plain; + DECL_TRIE_TYPE(c); /* what trie are we using right now */ reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; @@ -2872,10 +2909,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case TRIE: { /* what type of TRIE am I? (utf8 makes this contextual) */ - const enum { trie_plain, trie_utf8, trie_utf8_fold } - trie_type = do_utf8 ? - (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold) - : trie_plain; + DECL_TRIE_TYPE(scan); /* what trie are we using right now */ reg_trie_data * const trie diff --git a/t/op/pat.t b/t/op/pat.t index 2697157..7d03eb6 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3406,9 +3406,9 @@ if (!$ENV{PERL_SKIP_PSYCHO_TEST}){ ok($utf8 =~ /(abc|\xe9)/i, "utf8/latin trie"); ok($utf8 =~ /(abc|$latin1)/i, "utf8/latin trie runtime"); - ok("\xe9" =~ /$utf8/i, "# TODO latin/utf8"); + ok("\xe9" =~ /$utf8/i, "# latin/utf8"); ok("\xe9" =~ /(abc|$utf8)/i, "# latin/utf8 trie"); - ok($latin1 =~ /$utf8/i, "# TODO latin/utf8 runtime"); + ok($latin1 =~ /$utf8/i, "# latin/utf8 runtime"); ok($latin1 =~ /(abc|$utf8)/i, "# latin/utf8 trie runtime"); } @@ -4487,6 +4487,23 @@ sub kt iseq($1,"\xd6","#45605"); } +{ + # Regardless of utf8ness any character matches itself when + # doing a case insensitive match. See also [perl #36207] + for my $o (0..255) { + my @ch=(chr($o),chr($o)); + utf8::upgrade($ch[1]); + for my $u_str (0,1) { + for my $u_pat (0,1) { + ok( $ch[$u_str]=~/\Q$ch[$u_pat]\E/i, + "\$c=~/\$c/i : chr($o) : u_str=$u_str u_pat=$u_pat"); + ok( $ch[$u_str]=~/\Q$ch[$u_pat]\E|xyz/i, + "# \$c=~/\$c|xyz/i : chr($o) : u_str=$u_str u_pat=$u_pat"); + } + } + } +} + # Test counter is at bottom of file. Put new tests above here. #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4545,6 +4562,6 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/); iseq(0+$::test,$::TestCount,"Got the right number of tests!"); # Don't forget to update this! BEGIN { - $::TestCount = 1965; + $::TestCount = 4013; print "1..$::TestCount\n"; } diff --git a/t/op/reg_fold.t b/t/op/reg_fold.t new file mode 100644 index 0000000..6064ecf --- /dev/null +++ b/t/op/reg_fold.t @@ -0,0 +1,37 @@ +use strict; +use warnings; +use Test::More; +my $count=1; +my @tests; +use Cwd; + +my $file="../lib/unicore/CaseFolding.txt"; +open my $fh,"<",$file + or die "Failed to read '$file' from '".cwd()."': $!"; +while (<$fh>) { + chomp; + my ($line,$comment)= split/\s+#\s+/, $_; + my ($cp,$type,@fc)=split/[\s;]+/,$line||''; + next unless $type and ($type eq 'F' or $type eq 'C'); + $_="\\x{$_}" for @fc; + my $cpv=hex("0x$cp"); + my $chr="chr(0x$cp)"; + my @str; + push @str,$chr if $cpv<128 or $cpv>256; + if ($cpv<256) { + push @str,"do{my \$c=$chr; utf8::upgrade(\$c); \$c}" + } + + foreach my $str ( @str ) { + my $expr="$str=~/@fc/ix"; + my $t=($cpv > 256 || $str=~/^do/) ? "unicode" : "latin"; + push @tests, + qq[ok($expr,'$chr=~/@fc/ix - $comment ($t string)')]; + $tests[-1]="TODO: { local \$TODO='[13:41] cue *It is all Greek to me* joke.';\n$tests[-1] }" + if $cp eq '0390' or $cp eq '03B0'; + $count++; + } +} +eval join ";\n","plan tests=>".($count-1),@tests,"1" + or die $@; +__DATA__ diff --git a/utf8.c b/utf8.c index e21cb4f..7bc2b09 100644 --- a/utf8.c +++ b/utf8.c @@ -2254,13 +2254,18 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const if (pe1) e1 = *(U8**)pe1; + /* assert(e1 || l1); */ if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1))) f1 = (const U8*)s1 + l1; if (pe2) e2 = *(U8**)pe2; + /* assert(e2 || l2); */ if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2))) f2 = (const U8*)s2 + l2; + /* This shouldn't happen. However, putting an assert() there makes some + * tests fail. */ + /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */ if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)) return 1; /* mismatch; possible infinite loop or false positive */