From: Yves Orton <demerphq@gmail.com>
Date: Fri, 9 Jun 2006 19:48:11 +0000 (+0200)
Subject: Re: [PATCH] Better version of the Aho-Corasick patch and lots of benchmarks.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f2278c82849286ea411021a8d3efc5ccb2dc1250;p=p5sagit%2Fp5-mst-13.2.git

Re: [PATCH] Better version of the Aho-Corasick patch and lots of benchmarks.
Message-ID: <9b18b3110606091048n4d1f3b17vc608943044d4532a@mail.gmail.com>

p4raw-id: //depot/perl@28379
---

diff --git a/ext/re/re.pm b/ext/re/re.pm
index 2da3a25..c44994e 100644
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -84,14 +84,15 @@ that these flags can be set directly via ${^RE_DEBUG_FLAGS} by using the
 following flag values:
 
 
-    RE_DEBUG_COMPILE       0x01
-    RE_DEBUG_EXECUTE       0x02
-    RE_DEBUG_TRIE_COMPILE  0x04
-    RE_DEBUG_TRIE_EXECUTE  0x08
-    RE_DEBUG_TRIE_MORE     0x10
-    RE_DEBUG_OPTIMISE      0x20
-    RE_DEBUG_OFFSETS       0x40
-    RE_DEBUG_PARSE         0x80
+    RE_DEBUG_COMPILE       0x001
+    RE_DEBUG_EXECUTE       0x002
+    RE_DEBUG_TRIE_COMPILE  0x004
+    RE_DEBUG_TRIE_EXECUTE  0x008
+    RE_DEBUG_TRIE_MORE     0x010
+    RE_DEBUG_OPTIMISE      0x020
+    RE_DEBUG_OFFSETS       0x040
+    RE_DEBUG_PARSE         0x080
+    RE_DEBUG_OFFSETS_DEBUG 0x100
 
 The directive C<use re 'debug'> and its equivalents are I<not> lexically
 scoped, as the other directives are.  They have both compile-time and run-time
@@ -123,18 +124,20 @@ sub setcolor {
 }
 
 my %flags = (
-    COMPILE      => 1,
-    EXECUTE      => 2,
-    TRIE_COMPILE => 4,
-    TRIE_EXECUTE => 8,
-    TRIE_MORE    => 16,
-    OPTIMISE     => 32,
-    OPTIMIZE     => 32, # alias
-    OFFSETS      => 64,
-    PARSE        => 128,
-    ALL          => 255,
-    All          => 15,
-    More         => 31,
+    COMPILE       => 1,
+    EXECUTE       => 2,
+    TRIE_COMPILE  => 4,
+    TRIE_EXECUTE  => 8,
+    TRIE_MORE     => 16,
+    OPTIMISE      => 32,
+    OPTIMIZE      => 32, # alias
+    OFFSETS       => 64,
+    PARSE         => 128,
+    OFFSETS_DEBUG => 256,
+    OFFSETS_OLD   => 576,
+    ALL           => 0xFFFF,
+    All           => 15,
+    More          => 31,
 );
 
 my $installed = 0;
diff --git a/regcomp.c b/regcomp.c
index c99a0f8..df5d890 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3639,6 +3639,7 @@ reStudy:
     Newxz(r->startp, RExC_npar, I32);
     Newxz(r->endp, RExC_npar, I32);
 
+    DEBUG_r( RX_DEBUG_on(r) );
     DEBUG_COMPILE_r({
         if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE)) 
             PerlIO_printf(Perl_debug_log,"Final program:\n");
diff --git a/regexec.c b/regexec.c
index ffe9888..5338e79 100644
--- a/regexec.c
+++ b/regexec.c
@@ -3020,6 +3020,17 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
 	    */
 
 		if ( st->u.trie.accepted == 1 ) {
+		    DEBUG_EXECUTE_r({
+			SV ** const tmp = RX_DEBUG(reginfo->prog) 
+			                ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
+			                : NULL;
+			PerlIO_printf( Perl_debug_log,
+			    "%*s  %sonly one match : #%d <%s>%s\n",
+			    REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
+        		    st->u.trie.accept_buff[ 0 ].wordnum,
+        		    tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
+        		    PL_colors[5] );
+		    });
 		    PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
 		    /* in this case we free tmps/leave before we call regmatch
 		       as we wont be using accept_buff again. */
@@ -3050,6 +3061,18 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
 				    st->u.trie.accept_buff[best].wordnum)
 				best = cur;
 			}
+			DEBUG_EXECUTE_r({
+			    reg_trie_data * const trie = (reg_trie_data*)
+					    rex->data->data[ARG(scan)];
+			    SV ** const tmp = RX_DEBUG(reginfo->prog) 
+			                ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
+			                : NULL;
+    			    PerlIO_printf( Perl_debug_log, "%*s  %strying alternation #%d <%s> at node #%d %s\n",
+    			        REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
+    			        st->u.trie.accept_buff[best].wordnum,
+        		        tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
+        		        PL_colors[5] );
+			});
 			if ( best<st->u.trie.accepted ) {
 			    reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
 			    st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
diff --git a/regexp.h b/regexp.h
index 0493267..777d82f 100644
--- a/regexp.h
+++ b/regexp.h
@@ -55,41 +55,47 @@ typedef struct regexp {
 
 #define ROPT_ANCH		(ROPT_ANCH_BOL|ROPT_ANCH_MBOL|ROPT_ANCH_GPOS|ROPT_ANCH_SBOL)
 #define ROPT_ANCH_SINGLE	(ROPT_ANCH_SBOL|ROPT_ANCH_GPOS)
-#define ROPT_ANCH_BOL	 	0x00001
-#define ROPT_ANCH_MBOL	 	0x00002
-#define ROPT_ANCH_SBOL	 	0x00004
-#define ROPT_ANCH_GPOS	 	0x00008
-#define ROPT_SKIP		0x00010
-#define ROPT_IMPLICIT		0x00020	/* Converted .* to ^.* */
-#define ROPT_NOSCAN		0x00040	/* Check-string always at start. */
-#define ROPT_GPOS_SEEN		0x00080
-#define ROPT_CHECK_ALL		0x00100
-#define ROPT_LOOKBEHIND_SEEN	0x00200
-#define ROPT_EVAL_SEEN		0x00400
-#define ROPT_CANY_SEEN		0x00800
+#define ROPT_ANCH_BOL	 	0x00000001
+#define ROPT_ANCH_MBOL	 	0x00000002
+#define ROPT_ANCH_SBOL	 	0x00000004
+#define ROPT_ANCH_GPOS	 	0x00000008
+#define ROPT_SKIP		0x00000010
+#define ROPT_IMPLICIT		0x00000020	/* Converted .* to ^.* */
+#define ROPT_NOSCAN		0x00000040	/* Check-string always at start. */
+#define ROPT_GPOS_SEEN		0x00000080
+#define ROPT_CHECK_ALL		0x00000100
+#define ROPT_LOOKBEHIND_SEEN	0x00000200
+#define ROPT_EVAL_SEEN		0x00000400
+#define ROPT_CANY_SEEN		0x00000800
 #define ROPT_SANY_SEEN		ROPT_CANY_SEEN /* src bckwrd cmpt */
 
 /* 0xf800 of reganch is used by PMf_COMPILETIME */
 
-#define ROPT_UTF8		0x10000
-#define ROPT_NAUGHTY		0x20000 /* how exponential is this pattern? */
-#define ROPT_COPY_DONE		0x40000	/* subbeg is a copy of the string */
-#define ROPT_TAINTED_SEEN	0x80000
+#define ROPT_UTF8		0x00010000
+#define ROPT_NAUGHTY		0x00020000 /* how exponential is this pattern? */
+#define ROPT_COPY_DONE		0x00040000	/* subbeg is a copy of the string */
+#define ROPT_TAINTED_SEEN	0x00080000
 #define ROPT_MATCH_UTF8		0x10000000 /* subbeg is utf-8 */
 
-#define RE_USE_INTUIT_NOML	0x0100000 /* Best to intuit before matching */
-#define RE_USE_INTUIT_ML	0x0200000
-#define REINT_AUTORITATIVE_NOML	0x0400000 /* Can trust a positive answer */
-#define REINT_AUTORITATIVE_ML	0x0800000 
-#define REINT_ONCE_NOML		0x1000000 /* Intuit can succed once only. */
-#define REINT_ONCE_ML		0x2000000
-#define RE_INTUIT_ONECHAR	0x4000000
-#define RE_INTUIT_TAIL		0x8000000
+#define RE_USE_INTUIT_NOML	0x00100000 /* Best to intuit before matching */
+#define RE_USE_INTUIT_ML	0x00200000
+#define REINT_AUTORITATIVE_NOML	0x00400000 /* Can trust a positive answer */
+#define REINT_AUTORITATIVE_ML	0x00800000 
+#define REINT_ONCE_NOML		0x01000000 /* Intuit can succeed once only. */
+#define REINT_ONCE_ML		0x02000000
+#define RE_INTUIT_ONECHAR	0x04000000
+#define RE_INTUIT_TAIL		0x08000000
+
+#define RE_DEBUG_BIT            0x20000000
 
 #define RE_USE_INTUIT		(RE_USE_INTUIT_NOML|RE_USE_INTUIT_ML)
 #define REINT_AUTORITATIVE	(REINT_AUTORITATIVE_NOML|REINT_AUTORITATIVE_ML)
 #define REINT_ONCE		(REINT_ONCE_NOML|REINT_ONCE_ML)
 
+#define RX_DEBUG(prog)	((prog)->reganch & RE_DEBUG_BIT)
+#define RX_DEBUG_on(prog) ((prog)->reganch |= RE_DEBUG_BIT)
+
+
 #define RX_MATCH_TAINTED(prog)	((prog)->reganch & ROPT_TAINTED_SEEN)
 #define RX_MATCH_TAINTED_on(prog) ((prog)->reganch |= ROPT_TAINTED_SEEN)
 #define RX_MATCH_TAINTED_off(prog) ((prog)->reganch &= ~ROPT_TAINTED_SEEN)