From: Yves Orton Date: Fri, 9 Jun 2006 19:48:11 +0000 (+0200) Subject: Re: [PATCH] Better version of the Aho-Corasick patch and lots of benchmarks. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f2278c82849286ea411021a8d3efc5ccb2dc1250;p=p5sagit%2Fp5-mst-13.2.git Re: [PATCH] Better version of the Aho-Corasick patch and lots of benchmarks. Message-ID: <9b18b3110606091048n4d1f3b17vc608943044d4532a@mail.gmail.com> p4raw-id: //depot/perl@28379 --- diff --git a/ext/re/re.pm b/ext/re/re.pm index 2da3a25..c44994e 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -84,14 +84,15 @@ that these flags can be set directly via ${^RE_DEBUG_FLAGS} by using the following flag values: - RE_DEBUG_COMPILE 0x01 - RE_DEBUG_EXECUTE 0x02 - RE_DEBUG_TRIE_COMPILE 0x04 - RE_DEBUG_TRIE_EXECUTE 0x08 - RE_DEBUG_TRIE_MORE 0x10 - RE_DEBUG_OPTIMISE 0x20 - RE_DEBUG_OFFSETS 0x40 - RE_DEBUG_PARSE 0x80 + RE_DEBUG_COMPILE 0x001 + RE_DEBUG_EXECUTE 0x002 + RE_DEBUG_TRIE_COMPILE 0x004 + RE_DEBUG_TRIE_EXECUTE 0x008 + RE_DEBUG_TRIE_MORE 0x010 + RE_DEBUG_OPTIMISE 0x020 + RE_DEBUG_OFFSETS 0x040 + RE_DEBUG_PARSE 0x080 + RE_DEBUG_OFFSETS_DEBUG 0x100 The directive C<use re 'debug'> and its equivalents are I<not> lexically scoped, as the other directives are. 
They have both compile-time and run-time @@ -123,18 +124,20 @@ sub setcolor { } my %flags = ( - COMPILE => 1, - EXECUTE => 2, - TRIE_COMPILE => 4, - TRIE_EXECUTE => 8, - TRIE_MORE => 16, - OPTIMISE => 32, - OPTIMIZE => 32, # alias - OFFSETS => 64, - PARSE => 128, - ALL => 255, - All => 15, - More => 31, + COMPILE => 1, + EXECUTE => 2, + TRIE_COMPILE => 4, + TRIE_EXECUTE => 8, + TRIE_MORE => 16, + OPTIMISE => 32, + OPTIMIZE => 32, # alias + OFFSETS => 64, + PARSE => 128, + OFFSETS_DEBUG => 256, + OFFSETS_OLD => 576, + ALL => 0xFFFF, + All => 15, + More => 31, ); my $installed = 0; diff --git a/regcomp.c b/regcomp.c index c99a0f8..df5d890 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3639,6 +3639,7 @@ reStudy: Newxz(r->startp, RExC_npar, I32); Newxz(r->endp, RExC_npar, I32); + DEBUG_r( RX_DEBUG_on(r) ); DEBUG_COMPILE_r({ if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE)) PerlIO_printf(Perl_debug_log,"Final program:\n"); diff --git a/regexec.c b/regexec.c index ffe9888..5338e79 100644 --- a/regexec.c +++ b/regexec.c @@ -3020,6 +3020,17 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) */ if ( st->u.trie.accepted == 1 ) { + DEBUG_EXECUTE_r({ + SV ** const tmp = RX_DEBUG(reginfo->prog) + ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 ) + : NULL; + PerlIO_printf( Perl_debug_log, + "%*s %sonly one match : #%d <%s>%s\n", + REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], + st->u.trie.accept_buff[ 0 ].wordnum, + tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", + PL_colors[5] ); + }); PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos; /* in this case we free tmps/leave before we call regmatch as we wont be using accept_buff again. */ @@ -3050,6 +3061,18 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) st->u.trie.accept_buff[best].wordnum) best = cur; } + DEBUG_EXECUTE_r({ + reg_trie_data * const trie = (reg_trie_data*) + rex->data->data[ARG(scan)]; + SV ** const tmp = RX_DEBUG(reginfo->prog) + ? 
av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 ) + : NULL; + PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n", + REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], + st->u.trie.accept_buff[best].wordnum, + tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan), + PL_colors[5] ); + }); if ( best<st->u.trie.accepted ) { reg_trie_accepted tmp = st->u.trie.accept_buff[ best ]; st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ]; diff --git a/regexp.h b/regexp.h index 0493267..777d82f 100644 --- a/regexp.h +++ b/regexp.h @@ -55,41 +55,47 @@ typedef struct regexp { #define ROPT_ANCH (ROPT_ANCH_BOL|ROPT_ANCH_MBOL|ROPT_ANCH_GPOS|ROPT_ANCH_SBOL) #define ROPT_ANCH_SINGLE (ROPT_ANCH_SBOL|ROPT_ANCH_GPOS) -#define ROPT_ANCH_BOL 0x00001 -#define ROPT_ANCH_MBOL 0x00002 -#define ROPT_ANCH_SBOL 0x00004 -#define ROPT_ANCH_GPOS 0x00008 -#define ROPT_SKIP 0x00010 -#define ROPT_IMPLICIT 0x00020 /* Converted .* to ^.* */ -#define ROPT_NOSCAN 0x00040 /* Check-string always at start. */ -#define ROPT_GPOS_SEEN 0x00080 -#define ROPT_CHECK_ALL 0x00100 -#define ROPT_LOOKBEHIND_SEEN 0x00200 -#define ROPT_EVAL_SEEN 0x00400 -#define ROPT_CANY_SEEN 0x00800 +#define ROPT_ANCH_BOL 0x00000001 +#define ROPT_ANCH_MBOL 0x00000002 +#define ROPT_ANCH_SBOL 0x00000004 +#define ROPT_ANCH_GPOS 0x00000008 +#define ROPT_SKIP 0x00000010 +#define ROPT_IMPLICIT 0x00000020 /* Converted .* to ^.* */ +#define ROPT_NOSCAN 0x00000040 /* Check-string always at start. */ +#define ROPT_GPOS_SEEN 0x00000080 +#define ROPT_CHECK_ALL 0x00000100 +#define ROPT_LOOKBEHIND_SEEN 0x00000200 +#define ROPT_EVAL_SEEN 0x00000400 +#define ROPT_CANY_SEEN 0x00000800 #define ROPT_SANY_SEEN ROPT_CANY_SEEN /* src bckwrd cmpt */ /* 0xf800 of reganch is used by PMf_COMPILETIME */ -#define ROPT_UTF8 0x10000 -#define ROPT_NAUGHTY 0x20000 /* how exponential is this pattern? 
*/ -#define ROPT_COPY_DONE 0x40000 /* subbeg is a copy of the string */ -#define ROPT_TAINTED_SEEN 0x80000 +#define ROPT_UTF8 0x00010000 +#define ROPT_NAUGHTY 0x00020000 /* how exponential is this pattern? */ +#define ROPT_COPY_DONE 0x00040000 /* subbeg is a copy of the string */ +#define ROPT_TAINTED_SEEN 0x00080000 #define ROPT_MATCH_UTF8 0x10000000 /* subbeg is utf-8 */ -#define RE_USE_INTUIT_NOML 0x0100000 /* Best to intuit before matching */ -#define RE_USE_INTUIT_ML 0x0200000 -#define REINT_AUTORITATIVE_NOML 0x0400000 /* Can trust a positive answer */ -#define REINT_AUTORITATIVE_ML 0x0800000 -#define REINT_ONCE_NOML 0x1000000 /* Intuit can succed once only. */ -#define REINT_ONCE_ML 0x2000000 -#define RE_INTUIT_ONECHAR 0x4000000 -#define RE_INTUIT_TAIL 0x8000000 +#define RE_USE_INTUIT_NOML 0x00100000 /* Best to intuit before matching */ +#define RE_USE_INTUIT_ML 0x00200000 +#define REINT_AUTORITATIVE_NOML 0x00400000 /* Can trust a positive answer */ +#define REINT_AUTORITATIVE_ML 0x00800000 +#define REINT_ONCE_NOML 0x01000000 /* Intuit can succed once only. */ +#define REINT_ONCE_ML 0x02000000 +#define RE_INTUIT_ONECHAR 0x04000000 +#define RE_INTUIT_TAIL 0x08000000 + +#define RE_DEBUG_BIT 0x20000000 #define RE_USE_INTUIT (RE_USE_INTUIT_NOML|RE_USE_INTUIT_ML) #define REINT_AUTORITATIVE (REINT_AUTORITATIVE_NOML|REINT_AUTORITATIVE_ML) #define REINT_ONCE (REINT_ONCE_NOML|REINT_ONCE_ML) +#define RX_DEBUG(prog) ((prog)->reganch & RE_DEBUG_BIT) +#define RX_DEBUG_on(prog) ((prog)->reganch |= RE_DEBUG_BIT) + + #define RX_MATCH_TAINTED(prog) ((prog)->reganch & ROPT_TAINTED_SEEN) #define RX_MATCH_TAINTED_on(prog) ((prog)->reganch |= ROPT_TAINTED_SEEN) #define RX_MATCH_TAINTED_off(prog) ((prog)->reganch &= ~ROPT_TAINTED_SEEN)