From: Yves Orton <demerphq@gmail.com>
Date: Mon, 17 Dec 2007 15:21:46 +0000 (+0100)
Subject: Fix various bugs in regex engine with mixed utf8/latin pattern and strings. Related... 
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a0a388a13daad79062b3c4b126f22d2f46fc82dd;p=p5sagit%2Fp5-mst-13.2.git

Fix various bugs in regex engine with mixed utf8/latin pattern and strings. Related to [perl #36207] among others
Message-ID: <9b18b3110712170621h41de2c76k331971e3660abcb0@mail.gmail.com>

p4raw-id: //depot/perl@32628
---

diff --git a/MANIFEST b/MANIFEST
index 810f749..490dc10 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3786,6 +3786,7 @@ t/op/read.t			See if read() works
 t/op/recurse.t			See if deep recursion works
 t/op/ref.t			See if refs and objects work
 t/op/reg_email.t		See if regex recursion works by parsing email addresses
+t/op/reg_fold.t			See if case folding works properly
 t/op/regexp_noamp.t		See if regular expressions work with optimizations
 t/op/regexp_notrie.t		See if regular expressions work without trie optimisation
 t/op/regexp_qr_embed.t		See if regular expressions work with embedded qr//
diff --git a/regcomp.c b/regcomp.c
index 19d6f15..5a175ba 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2791,7 +2791,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                     last = cur;
                                 }
                             } else {
-                                if ( last ) {
+/* 
+    Currently we assume that the trie can handle unicode and ascii
+    matches fold cased matches. If this proves true then the following
+    define will prevent tries in this situation. 
+    
+    #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+*/
+#define TRIE_TYPE_IS_SAFE 1
+                                if ( last && TRIE_TYPE_IS_SAFE ) {
                                     make_trie( pRExC_state, 
                                             startbranch, first, cur, tail, count, 
                                             optype, depth+1 );
@@ -2819,7 +2827,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
 
                         });
-                        if ( last ) {
+                        
+                        if ( last && TRIE_TYPE_IS_SAFE ) {
                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
 #ifdef TRIE_STUDY_OPT	
                             if ( ((made == MADE_EXACT_TRIE && 
@@ -6867,6 +6876,7 @@ tryagain:
     case 0xDF:
     case 0xC3:
     case 0xCE:
+        do_foldchar:
         if (!LOC && FOLD) {
             U32 len,cp;
 	    len=0; /* silence a spurious compiler warning */
@@ -6893,7 +6903,11 @@ tryagain:
 	   required, as the default for this switch is to jump to the
 	   literal text handling code.
 	*/
-	switch (*++RExC_parse) {
+	switch ((U8)*++RExC_parse) {
+	case 0xDF:
+	case 0xC3:
+	case 0xCE:
+	           goto do_foldchar;	    
 	/* Special Escapes */
 	case 'A':
 	    RExC_seen_zerolen++;
@@ -7211,8 +7225,13 @@ tryagain:
 		       an unescaped equivalent literal.
 		    */
 
-		    switch (*++p) {
+		    switch ((U8)*++p) {
 		    /* These are all the special escapes. */
+    		    case 0xDF:
+    		    case 0xC3:
+    		    case 0xCE:
+    		           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
+    		                goto normal_default;		    
 		    case 'A':             /* Start assertion */
 		    case 'b': case 'B':   /* Word-boundary assertion*/
 		    case 'C':             /* Single char !DANGEROUS! */
diff --git a/regexec.c b/regexec.c
index 1b9fca1..6afcea4 100644
--- a/regexec.c
+++ b/regexec.c
@@ -989,7 +989,11 @@ Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
     return NULL;
 }
 
-
+#define DECL_TRIE_TYPE(scan) \
+    const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
+		    trie_type = (scan->flags != EXACT) \
+		              ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
+                              : (do_utf8 ? trie_utf8 : trie_plain)
 
 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
@@ -1007,6 +1011,19 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
 	    uscan = foldbuf + UNISKIP( uvc );                               \
 	}                                                                   \
 	break;                                                              \
+    case trie_latin_utf8_fold:                                              \
+	if ( foldlen>0 ) {                                                  \
+	    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
+	    foldlen -= len;                                                 \
+	    uscan += len;                                                   \
+	    len=0;                                                          \
+	} else {                                                            \
+	    len = 1;                                                        \
+	    uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
+	    foldlen -= UNISKIP( uvc );                                      \
+	    uscan = foldbuf + UNISKIP( uvc );                               \
+	}                                                                   \
+	break;                                                              \
     case trie_utf8:                                                         \
 	uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
 	break;                                                              \
@@ -1029,12 +1046,14 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
     }                                                                       \
 } STMT_END
 
-#define REXEC_FBC_EXACTISH_CHECK(CoNd)                  \
+#define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
+{                                                      \
+    char *my_strend= (char *)strend;                   \
     if ( (CoNd)                                        \
 	 && (ln == len ||                              \
-	     ibcmp_utf8(s, NULL, 0,  do_utf8,          \
+	     !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
 			m, NULL, ln, (bool)UTF))       \
-	 && (!reginfo || regtry(reginfo, &s)) )         \
+	 && (!reginfo || regtry(reginfo, &s)) )        \
 	goto got_it;                                   \
     else {                                             \
 	 U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
@@ -1042,15 +1061,14 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
 	 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
 	 if ( f != c                                   \
 	      && (f == c1 || f == c2)                  \
-	      && (ln == foldlen ||                     \
-		  !ibcmp_utf8((char *) foldbuf,        \
-			      NULL, foldlen, do_utf8,  \
-			      m,                       \
-			      NULL, ln, (bool)UTF))    \
-	      && (!reginfo || regtry(reginfo, &s)) )    \
+	      && (ln == len ||                         \
+	        !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
+			      m, NULL, ln, (bool)UTF)) \
+	      && (!reginfo || regtry(reginfo, &s)) )   \
 	      goto got_it;                             \
     }                                                  \
-    s += len
+}                                                      \
+s += len
 
 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
 STMT_START {                                              \
@@ -1210,14 +1228,26 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
 		U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
 		const U32 uniflags = UTF8_ALLOW_DEFAULT;
-
-		to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
-		to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
-
+		
+                /* XXX: Since the node will be case folded at compile
+                   time this logic is a little odd, although im not 
+                   sure that its actually wrong. --dmq */
+                   
+		c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
+		c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
+
+		/* XXX: This is kinda strange. to_utf8_XYZ returns the 
+                   codepoint of the first character in the converted
+                   form, yet originally we did the extra step. 
+                   No tests fail by commenting this code out however
+                   so Ive left it out. -- dmq.
+                   
 		c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 
 				    0, uniflags);
 		c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
 				    0, uniflags);
+                */
+                
 		lnc = 0;
 		while (sm < ((U8 *) m + ln)) {
 		    lnc++;
@@ -1252,24 +1282,33 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	     * matching (called "loose matching" in Unicode).
 	     * ibcmp_utf8() will do just that. */
 
-	    if (do_utf8) {
+	    if (do_utf8 || UTF) {
 	        UV c, f;
 	        U8 tmpbuf [UTF8_MAXBYTES+1];
-		STRLEN len, foldlen;
+		STRLEN len = 1;
+		STRLEN foldlen;
 		const U32 uniflags = UTF8_ALLOW_DEFAULT;
 		if (c1 == c2) {
 		    /* Upper and lower of 1st char are equal -
 		     * probably not a "letter". */
 		    while (s <= e) {
-		        c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
+		        if (do_utf8) {
+		            c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
 					   uniflags);
+                        } else {
+                            c = *((U8*)s);
+                        }					  
 			REXEC_FBC_EXACTISH_CHECK(c == c1);
 		    }
 		}
 		else {
 		    while (s <= e) {
-		      c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
+		        if (do_utf8) {
+		            c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
 					   uniflags);
+                        } else {
+                            c = *((U8*)s);
+                        }
 
 			/* Handle some of the three Greek sigmas cases.
 			 * Note that not all the possible combinations
@@ -1287,6 +1326,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		}
 	    }
 	    else {
+	        /* Neither pattern nor string are UTF8 */
 		if (c1 == c2)
 		    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
 		else
@@ -1461,10 +1501,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	case AHOCORASICKC:
 	case AHOCORASICK: 
 	    {
-	        const enum { trie_plain, trie_utf8, trie_utf8_fold }
-		    trie_type = do_utf8 ?
-			  (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
-			: trie_plain;
+	        DECL_TRIE_TYPE(c);
                 /* what trie are we using right now */
         	reg_ac_data *aho
         	    = (reg_ac_data*)progi->data->data[ ARG( c ) ];
@@ -2872,10 +2909,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 	case TRIE:
 	    {
                 /* what type of TRIE am I? (utf8 makes this contextual) */
-	        const enum { trie_plain, trie_utf8, trie_utf8_fold }
-		    trie_type = do_utf8 ?
-			  (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
-			: trie_plain;
+                DECL_TRIE_TYPE(scan);
 
                 /* what trie are we using right now */
 		reg_trie_data * const trie
diff --git a/t/op/pat.t b/t/op/pat.t
index 2697157..7d03eb6 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3406,9 +3406,9 @@ if (!$ENV{PERL_SKIP_PSYCHO_TEST}){
     ok($utf8 =~ /(abc|\xe9)/i, "utf8/latin trie");
     ok($utf8 =~ /(abc|$latin1)/i, "utf8/latin trie runtime");
 
-    ok("\xe9" =~ /$utf8/i, "# TODO latin/utf8");
+    ok("\xe9" =~ /$utf8/i, "# latin/utf8");
     ok("\xe9" =~ /(abc|$utf8)/i, "# latin/utf8 trie");
-    ok($latin1 =~ /$utf8/i, "# TODO latin/utf8 runtime");
+    ok($latin1 =~ /$utf8/i, "# latin/utf8 runtime");
     ok($latin1 =~ /(abc|$utf8)/i, "# latin/utf8 trie runtime");
 }
 
@@ -4487,6 +4487,23 @@ sub kt
     iseq($1,"\xd6","#45605");
 }
 
+{
+    # Regardless of utf8ness any character matches itself when 
+    # doing a case insensitive match. See also [perl #36207] 
+    for my $o (0..255) {
+        my @ch=(chr($o),chr($o));
+        utf8::upgrade($ch[1]);
+        for my $u_str (0,1) {
+            for my $u_pat (0,1) {
+                ok( $ch[$u_str]=~/\Q$ch[$u_pat]\E/i,
+                    "\$c=~/\$c/i : chr($o) : u_str=$u_str u_pat=$u_pat");
+                ok( $ch[$u_str]=~/\Q$ch[$u_pat]\E|xyz/i,
+                "# \$c=~/\$c|xyz/i : chr($o) : u_str=$u_str u_pat=$u_pat");
+            }
+        }
+    }
+}
+
 # Test counter is at bottom of file. Put new tests above here.
 #-------------------------------------------------------------------
 # Keep the following tests last -- they may crash perl
@@ -4545,6 +4562,6 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/);
 iseq(0+$::test,$::TestCount,"Got the right number of tests!");
 # Don't forget to update this!
 BEGIN {
-    $::TestCount = 1965;
+    $::TestCount = 4013;
     print "1..$::TestCount\n";
 }
diff --git a/t/op/reg_fold.t b/t/op/reg_fold.t
new file mode 100644
index 0000000..6064ecf
--- /dev/null
+++ b/t/op/reg_fold.t
@@ -0,0 +1,37 @@
+use strict;
+use warnings;
+use Test::More;
+my $count=1;
+my @tests;
+use Cwd;
+
+my $file="../lib/unicore/CaseFolding.txt";
+open my $fh,"<",$file 
+    or die "Failed to read '$file' from '".cwd()."': $!";
+while (<$fh>) {
+    chomp;
+    my ($line,$comment)= split/\s+#\s+/, $_;
+    my ($cp,$type,@fc)=split/[\s;]+/,$line||'';
+    next unless $type and ($type eq 'F' or $type eq 'C');
+    $_="\\x{$_}" for @fc;
+    my $cpv=hex("0x$cp");
+    my $chr="chr(0x$cp)";
+    my @str;
+    push @str,$chr if $cpv<128 or $cpv>256;
+    if ($cpv<256) {
+        push @str,"do{my \$c=$chr; utf8::upgrade(\$c); \$c}"
+    }
+    
+    foreach my $str ( @str ) {    
+        my $expr="$str=~/@fc/ix";
+        my $t=($cpv > 256 || $str=~/^do/) ? "unicode" : "latin";
+        push @tests,
+            qq[ok($expr,'$chr=~/@fc/ix - $comment ($t string)')];
+        $tests[-1]="TODO: { local \$TODO='[13:41] <BinGOs> cue *It is all Greek to me* joke.';\n$tests[-1] }"
+            if $cp eq '0390' or $cp eq '03B0';
+        $count++;
+    }
+}    
+eval join ";\n","plan tests=>".($count-1),@tests,"1"
+    or die $@;
+__DATA__
diff --git a/utf8.c b/utf8.c
index e21cb4f..7bc2b09 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2254,13 +2254,18 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
      
      if (pe1)
 	  e1 = *(U8**)pe1;
+     /* assert(e1 || l1); */
      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
 	  f1 = (const U8*)s1 + l1;
      if (pe2)
 	  e2 = *(U8**)pe2;
+     /* assert(e2 || l2); */
      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
 	  f2 = (const U8*)s2 + l2;
 
+     /* This shouldn't happen. However, putting an assert() there makes some
+      * tests fail. */
+     /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
 	  return 1; /* mismatch; possible infinite loop or false positive */