From: Yves Orton Date: Sat, 2 Sep 2006 16:40:12 +0000 (+0200) Subject: Re: [PATCH] Trie jumping X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=786e8c118e1218e4c348fecf469934e080881633;p=p5sagit%2Fp5-mst-13.2.git Re: [PATCH] Trie jumping Message-ID: <9b18b3110609020740y2eb9004cpab313c3353a437ca@mail.gmail.com> p4raw-id: //depot/perl@28785 --- diff --git a/embed.fnc b/embed.fnc index e9c5da4..a34cb00 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1331,13 +1331,15 @@ Es |I32 |regpposixcc |NN struct RExC_state_t* state|I32 value Es |void |checkposixcc |NN struct RExC_state_t* state Es |I32 |make_trie |NN struct RExC_state_t* state|NN regnode *startbranch \ |NN regnode *first|NN regnode *last|NN regnode *tail \ - |U32 flags|U32 depth + |U32 word_count|U32 flags|U32 depth Es |void |make_trie_failtable |NN struct RExC_state_t* state \ |NN regnode *source|NN regnode *node|U32 depth # ifdef DEBUGGING Es |const regnode*|dumpuntil|NN const regexp *r|NN const regnode *start \ |NN const regnode *node \ - |NULLOK const regnode *last|NN SV* sv|I32 l + |NULLOK const regnode *last \ + |NULLOK const regnode *plast \ + |NN SV* sv|I32 indent|U32 depth Es |void |put_byte |NN SV* sv|int c Es |void |dump_trie |NN const struct _reg_trie_data *trie|U32 depth Es |void |dump_trie_interim_list|NN const struct _reg_trie_data *trie|U32 next_alloc|U32 depth @@ -1360,7 +1362,8 @@ ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN con Es |void |to_utf8_substr |NN regexp * prog Es |void |to_byte_substr |NN regexp * prog # ifdef DEBUGGING -Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|const bool do_utf8 +Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|NN const char *loc_regeol\ + |NN const char *loc_bostr|NN const char *loc_reg_starttry|const bool do_utf8 Es |void |debug_start_match|NN const regexp *prog|const bool do_utf8|NN const char *start|NN const char *end|NN const char *blurb # endif #endif diff --git a/embed.h b/embed.h index 1536130..6959cb9 100644 --- a/embed.h +++ b/embed.h @@ -3519,12 +3519,12 @@ #if defined(PERL_CORE) || defined(PERL_EXT) #define regpposixcc(a,b) S_regpposixcc(aTHX_ a,b) #define checkposixcc(a) S_checkposixcc(aTHX_ a) -#define make_trie(a,b,c,d,e,f,g) S_make_trie(aTHX_ a,b,c,d,e,f,g) +#define make_trie(a,b,c,d,e,f,g,h) S_make_trie(aTHX_ a,b,c,d,e,f,g,h) #define make_trie_failtable(a,b,c,d) S_make_trie_failtable(aTHX_ a,b,c,d) #endif # ifdef DEBUGGING #if defined(PERL_CORE) || defined(PERL_EXT) -#define dumpuntil(a,b,c,d,e,f) S_dumpuntil(aTHX_ a,b,c,d,e,f) +#define dumpuntil(a,b,c,d,e,f,g,h) S_dumpuntil(aTHX_ a,b,c,d,e,f,g,h) #define put_byte(a,b) S_put_byte(aTHX_ a,b) #define dump_trie(a,b) S_dump_trie(aTHX_ a,b) #define dump_trie_interim_list(a,b,c) S_dump_trie_interim_list(aTHX_ a,b,c) @@ -3549,7 +3549,7 @@ #endif # ifdef DEBUGGING #if defined(PERL_CORE) || defined(PERL_EXT) -#define dump_exec_pos(a,b,c) S_dump_exec_pos(aTHX_ a,b,c) +#define dump_exec_pos(a,b,c,d,e,f) S_dump_exec_pos(aTHX_ a,b,c,d,e,f) #define debug_start_match(a,b,c,d,e) S_debug_start_match(aTHX_ a,b,c,d,e) #endif # endif diff --git a/ext/re/re.pm b/ext/re/re.pm index a9bff82..ee262c6 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -235,10 +235,11 @@ my %flags = ( OFFSETS_DEBUG => 0x020000, STATE => 0x040000, ); -$flags{ALL} = $flags{COMPILE} | $flags{EXECUTE} | $flags{STATE}; +$flags{ALL} = -1; $flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; -$flags{More} = $flags{MORE} = $flags{ALL} | $flags{TRIE_MORE}; +$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIE_MORE} | $flags{STATE}; $flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; +$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIE_COMPILE}; my $installed = 0; @@ -259,7 +260,12 @@ sub bits { foreach my $idx (0..$#_){ my $s=$_[$idx]; if ($s eq 'Debug' or $s eq 'Debugcolor') { - setcolor() if $s eq 'Debugcolor'; + if ($s eq 'Debugcolor') { + setcolor(); + } else { + # $ENV{PERL_RE_COLORS}||=qq'\t\t> <\t> <\t\t' + } + ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS}; for my $idx ($idx+1..$#_) { if ($flags{$_[$idx]}) { diff --git a/opcode.h b/opcode.h index 4b32c85..6dd31c3 100644 --- a/opcode.h +++ b/opcode.h @@ -1535,9 +1535,9 @@ EXT Perl_check_t PL_check[] /* or perlvars.h */ #ifndef PERL_GLOBAL_STRUCT_INIT #ifndef DOINIT -EXTCONST U32 PL_opargs[]; +EXT const U32 PL_opargs[]; #else -EXTCONST U32 PL_opargs[] = { +EXT const U32 PL_opargs[] = { 0x00000000, /* null */ 0x00000000, /* stub */ 0x00003604, /* scalar */ diff --git a/proto.h b/proto.h index 4d8e91b..ae714fb 100644 --- a/proto.h +++ b/proto.h @@ -3638,7 +3638,7 @@ STATIC I32 S_regpposixcc(pTHX_ struct RExC_state_t* state, I32 value) STATIC void S_checkposixcc(pTHX_ struct RExC_state_t* state) __attribute__nonnull__(pTHX_1); -STATIC I32 S_make_trie(pTHX_ struct RExC_state_t* state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth) +STATIC I32 S_make_trie(pTHX_ struct RExC_state_t* state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3) @@ -3651,11 +3651,11 @@ STATIC void S_make_trie_failtable(pTHX_ struct RExC_state_t* state, regnode *sou __attribute__nonnull__(pTHX_3); # ifdef DEBUGGING -STATIC const regnode* S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const regnode *last, SV* sv, I32 l) +STATIC const regnode* S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const regnode *last, const regnode *plast, SV* sv, I32 indent, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3) - __attribute__nonnull__(pTHX_5); + __attribute__nonnull__(pTHX_6); STATIC void S_put_byte(pTHX_ SV* sv, int c) __attribute__nonnull__(pTHX_1); @@ -3726,9 +3726,12 @@ STATIC void S_to_byte_substr(pTHX_ regexp * prog) __attribute__nonnull__(pTHX_1); # ifdef DEBUGGING -STATIC void S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8) +STATIC void S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const char *loc_regeol, const char *loc_bostr, const char *loc_reg_starttry, const bool do_utf8) __attribute__nonnull__(pTHX_1) - __attribute__nonnull__(pTHX_2); + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3) + __attribute__nonnull__(pTHX_4) + __attribute__nonnull__(pTHX_5); STATIC void S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8, const char *start, const char *end, const char *blurb) __attribute__nonnull__(pTHX_1) diff --git a/regcomp.c b/regcomp.c index c26677a..8feb297 100644 --- a/regcomp.c +++ b/regcomp.c @@ -171,6 +171,7 @@ typedef struct RExC_state_t { /* whether trie related optimizations are enabled */ #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION #define TRIE_STUDY_OPT +#define FULL_TRIE_STUDY #define TRIE_STCLASS #endif /* Length of a variant. */ @@ -232,7 +233,8 @@ static const scan_data_t zero_scan_data = #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) #define SCF_WHILEM_VISITED_POS 0x2000 -#define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */ +#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ + #define UTF (RExC_utf8 != 0) #define LOC ((RExC_flags & PMf_LOCALE) != 0) @@ -444,7 +446,7 @@ static const scan_data_t zero_scan_data = static void clear_re(pTHX_ void *r); /* Mark that we cannot extend a found fixed substring at this point. - Updata the longest found anchored substring and the longest found + Update the longest found anchored substring and the longest found floating substrings if needed. */ STATIC void @@ -633,199 +635,11 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con } } -/* - - make_trie(startbranch,first,last,tail,flags,depth) - startbranch: the first branch in the whole branch sequence - first : start branch of sequence of branch-exact nodes. - May be the same as startbranch - last : Thing following the last branch. - May be the same as tail. - tail : item following the branch sequence - flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ - depth : indent depth - -Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. - -A trie is an N'ary tree where the branches are determined by digital -decomposition of the key. IE, at the root node you look up the 1st character and -follow that branch repeat until you find the end of the branches. Nodes can be -marked as "accepting" meaning they represent a complete word. Eg: - - /he|she|his|hers/ - -would convert into the following structure. Numbers represent states, letters -following numbers represent valid transitions on the letter from that state, if -the number is in square brackets it represents an accepting state, otherwise it -will be in parenthesis. - - +-h->+-e->[3]-+-r->(8)-+-s->[9] - | | - | (2) - | | - (1) +-i->(6)-+-s->[7] - | - +-s->(3)-+-h->(4)-+-e->[5] - - Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) - -This shows that when matching against the string 'hers' we will begin at state 1 -read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, -then read 'r' and go to state 8 followed by 's' which takes us to state 9 which -is also accepting. Thus we know that we can match both 'he' and 'hers' with a -single traverse. We store a mapping from accepting to state to which word was -matched, and then when we have multiple possibilities we try to complete the -rest of the regex in the order in which they occured in the alternation. - -The only prior NFA like behaviour that would be changed by the TRIE support is -the silent ignoring of duplicate alternations which are of the form: - - / (DUPE|DUPE) X? (?{ ... }) Y /x - -Thus EVAL blocks follwing a trie may be called a different number of times with -and without the optimisation. With the optimisations dupes will be silently -ignored. This inconsistant behaviour of EVAL type nodes is well established as -the following demonstrates: - - 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ - -which prints out 'word' three times, but - - 'words'=~/(word|word|word)(?{ print $1 })S/ - -which doesnt print it out at all. This is due to other optimisations kicking in. - -Example of what happens on a structural level: - -The regexp /(ac|ad|ab)+/ will produce the folowing debug output: - - 1: CURLYM[1] {1,32767}(18) - 5: BRANCH(8) - 6: EXACT (16) - 8: BRANCH(11) - 9: EXACT (16) - 11: BRANCH(14) - 12: EXACT (16) - 16: SUCCEED(0) - 17: NOTHING(18) - 18: END(0) - -This would be optimizable with startbranch=5, first=5, last=16, tail=16 -and should turn into: - - 1: CURLYM[1] {1,32767}(18) - 5: TRIE(16) - [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] - - - - 16: SUCCEED(0) - 17: NOTHING(18) - 18: END(0) - -Cases where tail != last would be like /(?foo|bar)baz/: - - 1: BRANCH(4) - 2: EXACT (8) - 4: BRANCH(7) - 5: EXACT (8) - 7: TAIL(8) - 8: EXACT (10) - 10: END(0) - -which would be optimizable with startbranch=1, first=1, last=7, tail=8 -and would end up looking like: - - 1: TRIE(8) - [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] - - - 7: TAIL(8) - 8: EXACT (10) - 10: END(0) - - d = uvuni_to_utf8_flags(d, uv, 0); - -is the recommended Unicode-aware way of saying - - *(d++) = uv; -*/ - -#define TRIE_STORE_REVCHAR \ - STMT_START { \ - SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \ - if (UTF) SvUTF8_on(tmp); \ - av_push( TRIE_REVCHARMAP(trie), tmp ); \ - } STMT_END - -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - if ( folder ) { \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ - uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ - foldlen -= UNISKIP( uvc ); \ - scan = foldbuf + UNISKIP( uvc ); \ - } \ - } else { \ - uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ - } \ - } else { \ - uvc = (U32)*uc; \ - len = 1; \ - } \ -} STMT_END - - #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) -#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ - if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ - TRIE_LIST_LEN( state ) *= 2; \ - Renew( trie->states[ state ].trans.list, \ - TRIE_LIST_LEN( state ), reg_trie_trans_le ); \ - } \ - TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ - TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ - TRIE_LIST_CUR( state )++; \ -} STMT_END - -#define TRIE_LIST_NEW(state) STMT_START { \ - Newxz( trie->states[ state ].trans.list, \ - 4, reg_trie_trans_le ); \ - TRIE_LIST_CUR( state ) = 1; \ - TRIE_LIST_LEN( state ) = 4; \ -} STMT_END - -#define TRIE_HANDLE_WORD(state) STMT_START { \ - if ( !trie->states[ state ].wordnum ) { \ - /* we haven't inserted this word into the structure yet. */ \ - if (trie->wordlen) \ - trie->wordlen[ curword ] = wordlen; \ - trie->states[ state ].wordnum = ++curword; \ - DEBUG_r({ \ - /* store the word for dumping */ \ - SV* tmp; \ - if (OP(noper) != NOTHING) \ - tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \ - else \ - tmp = newSVpvn( "", 0 ); \ - if ( UTF ) SvUTF8_on( tmp ); \ - av_push( trie->words, tmp ); \ - }); \ - } else { \ - NOOP; /* It's a dupe. So ignore it. */ \ - } \ -} STMT_END #ifdef DEBUGGING /* @@ -882,7 +696,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth) PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); PerlIO_printf( Perl_debug_log, "\n"); - for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) { + for( state = 1 ; state < trie->laststate ; state++ ) { const U32 base = trie->states[ state ].trans.base; PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); @@ -1043,114 +857,235 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_allo #endif -#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ - ( ( base + charid >= ucharcount \ - && base + charid < ubound \ - && state == trie->trans[ base - ucharcount + charid ].check \ - && trie->trans[ base - ucharcount + charid ].next ) \ - ? trie->trans[ base - ucharcount + charid ].next \ - : ( state==1 ? special : 0 ) \ - ) +/* make_trie(startbranch,first,last,tail,word_count,flags,depth) + startbranch: the first branch in the whole branch sequence + first : start branch of sequence of branch-exact nodes. + May be the same as startbranch + last : Thing following the last branch. + May be the same as tail. + tail : item following the branch sequence + count : words in the sequence + flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ + depth : indent depth -STATIC void -S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) -{ -/* The Trie is constructed and compressed now so we can build a fail array now if its needed +Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. - This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 - ISBN 0-201-10088-6 +A trie is an N'ary tree where the branches are determined by digital +decomposition of the key. IE, at the root node you look up the 1st character and +follow that branch repeat until you find the end of the branches. Nodes can be +marked as "accepting" meaning they represent a complete word. Eg: - We find the fail state for each state in the trie, this state is the longest proper - suffix of the current states 'word' that is also a proper prefix of another word in our - trie. State 1 represents the word '' and is the thus the default fail state. This allows - the DFA not to have to restart after its tried and failed a word at a given point, it - simply continues as though it had been matching the other word in the first place. - Consider - 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter 'g' which would - fail, which would bring use to the state representing 'd' in the second word where we would - try 'g' and succeed, prodceding to match 'cdgu'. - */ - /* add a fail transition */ - reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)]; - U32 *q; - const U32 ucharcount = trie->uniquecharcount; - const U32 numstates = trie->laststate; - const U32 ubound = trie->lasttrans + ucharcount; - U32 q_read = 0; - U32 q_write = 0; - U32 charid; - U32 base = trie->states[ 1 ].trans.base; - U32 *fail; - reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, 1, "T" ); - GET_RE_DEBUG_FLAGS_DECL; -#ifndef DEBUGGING - PERL_UNUSED_ARG(depth); -#endif + /he|she|his|hers/ +would convert into the following structure. Numbers represent states, letters +following numbers represent valid transitions on the letter from that state, if +the number is in square brackets it represents an accepting state, otherwise it +will be in parenthesis. - ARG_SET( stclass, data_slot ); - Newxz( aho, 1, reg_ac_data ); - RExC_rx->data->data[ data_slot ] = (void*)aho; - aho->trie=trie; - aho->states=(reg_trie_state *)savepvn((const char*)trie->states, - (trie->laststate+1)*sizeof(reg_trie_state)); - Newxz( q, numstates, U32); - Newxz( aho->fail, numstates, U32 ); - aho->refcount = 1; - fail = aho->fail; - fail[ 0 ] = fail[ 1 ] = 1; + +-h->+-e->[3]-+-r->(8)-+-s->[9] + | | + | (2) + | | + (1) +-i->(6)-+-s->[7] + | + +-s->(3)-+-h->(4)-+-e->[5] - for ( charid = 0; charid < ucharcount ; charid++ ) { - const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); - if ( newstate ) { - q[ q_write ] = newstate; - /* set to point at the root */ - fail[ q[ q_write++ ] ]=1; - } - } - while ( q_read < q_write) { - const U32 cur = q[ q_read++ % numstates ]; - base = trie->states[ cur ].trans.base; + Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) + +This shows that when matching against the string 'hers' we will begin at state 1 +read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, +then read 'r' and go to state 8 followed by 's' which takes us to state 9 which +is also accepting. Thus we know that we can match both 'he' and 'hers' with a +single traverse. We store a mapping from accepting to state to which word was +matched, and then when we have multiple possibilities we try to complete the +rest of the regex in the order in which they occured in the alternation. + +The only prior NFA like behaviour that would be changed by the TRIE support is +the silent ignoring of duplicate alternations which are of the form: + + / (DUPE|DUPE) X? (?{ ... }) Y /x + +Thus EVAL blocks follwing a trie may be called a different number of times with +and without the optimisation. With the optimisations dupes will be silently +ignored. This inconsistant behaviour of EVAL type nodes is well established as +the following demonstrates: + + 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ + +which prints out 'word' three times, but + + 'words'=~/(word|word|word)(?{ print $1 })S/ + +which doesnt print it out at all. This is due to other optimisations kicking in. + +Example of what happens on a structural level: + +The regexp /(ac|ad|ab)+/ will produce the folowing debug output: + + 1: CURLYM[1] {1,32767}(18) + 5: BRANCH(8) + 6: EXACT (16) + 8: BRANCH(11) + 9: EXACT (16) + 11: BRANCH(14) + 12: EXACT (16) + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +This would be optimizable with startbranch=5, first=5, last=16, tail=16 +and should turn into: + + 1: CURLYM[1] {1,32767}(18) + 5: TRIE(16) + [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] + + + + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +Cases where tail != last would be like /(?foo|bar)baz/: + + 1: BRANCH(4) + 2: EXACT (8) + 4: BRANCH(7) + 5: EXACT (8) + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + +which would be optimizable with startbranch=1, first=1, last=7, tail=8 +and would end up looking like: + + 1: TRIE(8) + [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] + + + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + + d = uvuni_to_utf8_flags(d, uv, 0); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; +*/ + +#define TRIE_STORE_REVCHAR \ + STMT_START { \ + SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \ + if (UTF) SvUTF8_on(tmp); \ + av_push( TRIE_REVCHARMAP(trie), tmp ); \ + } STMT_END + +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + if ( folder ) { \ + if ( foldlen > 0 ) { \ + uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + scan += len; \ + len = 0; \ + } else { \ + uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ + uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ + foldlen -= UNISKIP( uvc ); \ + scan = foldbuf + UNISKIP( uvc ); \ + } \ + } else { \ + uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ + } \ + } else { \ + uvc = (U32)*uc; \ + len = 1; \ + } \ +} STMT_END + + + +#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ + if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ + TRIE_LIST_LEN( state ) *= 2; \ + Renew( trie->states[ state ].trans.list, \ + TRIE_LIST_LEN( state ), reg_trie_trans_le ); \ + } \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ + TRIE_LIST_CUR( state )++; \ +} STMT_END - for ( charid = 0 ; charid < ucharcount ; charid++ ) { - const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); - if (ch_state) { - U32 fail_state = cur; - U32 fail_base; - do { - fail_state = fail[ fail_state ]; - fail_base = aho->states[ fail_state ].trans.base; - } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); +#define TRIE_LIST_NEW(state) STMT_START { \ + Newxz( trie->states[ state ].trans.list, \ + 4, reg_trie_trans_le ); \ + TRIE_LIST_CUR( state ) = 1; \ + TRIE_LIST_LEN( state ) = 4; \ +} STMT_END - fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); - fail[ ch_state ] = fail_state; - if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) - { - aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; - } - q[ q_write++ % numstates] = ch_state; - } - } - } +#define TRIE_HANDLE_WORD(state) STMT_START { \ + U16 dupe= trie->states[ state ].wordnum; \ + regnode * const noper_next = regnext( noper ); \ + \ + if (trie->wordlen) \ + trie->wordlen[ curword ] = wordlen; \ + DEBUG_r({ \ + /* store the word for dumping */ \ + SV* tmp; \ + if (OP(noper) != NOTHING) \ + tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \ + else \ + tmp = newSVpvn( "", 0 ); \ + if ( UTF ) SvUTF8_on( tmp ); \ + av_push( trie->words, tmp ); \ + }); \ + \ + curword++; \ + \ + if ( noper_next < tail ) { \ + if (!trie->jump) \ + Newxz( trie->jump, word_count + 1, U16); \ + trie->jump[curword] = (U16)(tail - noper_next); \ + if (!jumper) \ + jumper = noper_next; \ + if (!nextbranch) \ + nextbranch= regnext(cur); \ + } \ + \ + if ( dupe ) { \ + /* So it's a dupe. This means we need to maintain a */\ + /* linked-list from the first to the next. */\ + /* we only allocate the nextword buffer when there */\ + /* a dupe, so first time we have to do the allocation */\ + if (!trie->nextword) \ + Newxz( trie->nextword, word_count + 1, U16); \ + while ( trie->nextword[dupe] ) \ + dupe= trie->nextword[dupe]; \ + trie->nextword[dupe]= curword; \ + } else { \ + /* we haven't inserted this word yet. */ \ + trie->states[ state ].wordnum = curword; \ + } \ +} STMT_END - DEBUG_TRIE_COMPILE_MORE_r({ - PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), ""); - for( q_read=2; q_read= ucharcount \ + && base + charid < ubound \ + && state == trie->trans[ base - ucharcount + charid ].check \ + && trie->trans[ base - ucharcount + charid ].next ) \ + ? trie->trans[ base - ucharcount + charid ].next \ + : ( state==1 ? special : 0 ) \ + ) +#define MADE_TRIE 1 +#define MADE_JUMP_TRIE 2 +#define MADE_EXACT_TRIE 4 STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth) +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) { dVAR; /* first pass, loop through and scan words */ @@ -1161,6 +1096,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs UV uvc = 0; U16 curword = 0; U32 next_alloc = 0; + regnode *jumper = NULL; + regnode *nextbranch = NULL; /* we just use folder as a flag in utf8 */ const U8 * const folder = ( flags == EXACTF ? PL_fold @@ -1175,14 +1112,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #ifndef DEBUGGING /* these are only used during construction but are useful during * debugging so we store them in the struct when debugging. - * Wordcount is actually superfluous in debugging as we have - * (AV*)trie->words to use for it, but that's not available when - * not debugging... We could make the macro use the AV during - * debugging though... */ - U16 trie_wordcount=0; STRLEN trie_charcount=0; - /*U32 trie_laststate=0;*/ AV *trie_revcharmap; #endif GET_RE_DEBUG_FLAGS_DECL; @@ -1193,6 +1124,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Newxz( trie, 1, reg_trie_data ); trie->refcount = 1; trie->startstate = 1; + trie->wordcount = word_count; RExC_rx->data->data[ data_slot ] = (void*)trie; Newxz( trie->charmap, 256, U16 ); if (!(UTF && folder)) @@ -1208,10 +1140,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } DEBUG_OPTIMISE_r({ PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n", + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", (int)depth * 2 + 2, "", REG_NODE_NUM(startbranch),REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail)); + REG_NODE_NUM(last), REG_NODE_NUM(tail), + depth); }); /* -- First loop and Setup -- @@ -1246,7 +1179,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U32 wordlen = 0; /* required init */ STRLEN chars=0; - TRIE_WORDCOUNT(trie)++; if (OP(noper) == NOTHING) { trie->minlen= 0; continue; @@ -1295,11 +1227,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs DEBUG_TRIE_COMPILE_r( PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", (int)depth * 2 + 2,"", - ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie), + ( trie->widecharmap ? "UTF8" : "NATIVE" ), word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, (int)trie->minlen, (int)trie->maxlen ) ); - Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 ); + Newxz( trie->wordlen, word_count, U32 ); /* We now know what we are dealing with in terms of unique chars and @@ -1355,52 +1287,52 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; if (OP(noper) != NOTHING) { - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { - TRIE_READ_CHAR; + TRIE_READ_CHAR; - if ( uvc < 256 ) { - charid = trie->charmap[ uvc ]; - } else { - SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0); - if ( !svpp ) { - charid = 0; + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; } else { - charid=(U16)SvIV( *svpp ); + SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0); + if ( !svpp ) { + charid = 0; + } else { + charid=(U16)SvIV( *svpp ); + } } - } - if ( charid ) { + /* charid is now 0 if we dont know the char read, or nonzero if we do */ + if ( charid ) { - U16 check; - U32 newstate = 0; + U16 check; + U32 newstate = 0; - charid--; - if ( !trie->states[ state ].trans.list ) { - TRIE_LIST_NEW( state ); - } - for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { - if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { - newstate = TRIE_LIST_ITEM( state, check ).newstate; - break; + charid--; + if ( !trie->states[ state ].trans.list ) { + TRIE_LIST_NEW( state ); } + for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { + if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { + newstate = TRIE_LIST_ITEM( state, check ).newstate; + break; + } + } + if ( ! newstate ) { + newstate = next_alloc++; + TRIE_LIST_PUSH( state, charid, newstate ); + transcount++; + } + state = newstate; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); } - if ( ! newstate ) { - newstate = next_alloc++; - TRIE_LIST_PUSH( state, charid, newstate ); - transcount++; - } - state = newstate; - } else { - Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ - } } TRIE_HANDLE_WORD(state); } /* end second pass */ - TRIE_LASTSTATE(trie) = next_alloc; + trie->laststate = next_alloc; Renew( trie->states, next_alloc, reg_trie_state ); /* and now dump it out before we compress it */ @@ -1540,29 +1472,29 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; if ( OP(noper) != NOTHING ) { - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { - TRIE_READ_CHAR; + TRIE_READ_CHAR; - if ( uvc < 256 ) { - charid = trie->charmap[ uvc ]; - } else { - SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0); - charid = svpp ? (U16)SvIV(*svpp) : 0; - } - if ( charid ) { - charid--; - if ( !trie->trans[ state + charid ].next ) { - trie->trans[ state + charid ].next = next_alloc; - trie->trans[ state ].check++; - next_alloc += trie->uniquecharcount; + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0); + charid = svpp ? (U16)SvIV(*svpp) : 0; } - state = trie->trans[ state + charid ].next; - } else { - Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + if ( charid ) { + charid--; + if ( !trie->trans[ state + charid ].next ) { + trie->trans[ state + charid ].next = next_alloc; + trie->trans[ state ].check++; + next_alloc += trie->uniquecharcount; + } + state = trie->trans[ state + charid ].next; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + /* charid is now 0 if we dont know the char read, or nonzero if we do */ } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ - } } accept_state = TRIE_NODENUM( state ); TRIE_HANDLE_WORD(accept_state); @@ -1600,6 +1532,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs even earlier), but the .check field determines if the transition is valid. + XXX - wrong maybe? The following process inplace converts the table to the compressed table: We first do not compress the root node 1,and mark its all its .check pointers as 1 and set its .base pointer as 1 as well. This @@ -1625,7 +1558,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs This pointer is independent of the main pointer and scans forward looking for null transitions that are allocated to a state. When it finds one it writes the single transition into the "hole". If the - pointer doesnt find one the single transition is appeneded as normal. + pointer doesnt find one the single transition is appended as normal. - Once compressed we can Renew/realloc the structures to release the excess space. @@ -1638,7 +1571,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U32 laststate = TRIE_NODENUM( next_alloc ); U32 state, charid; U32 pos = 0, zp=0; - TRIE_LASTSTATE(trie) = laststate; + trie->laststate = laststate; for ( state = 1 ; state < laststate ; state++ ) { U8 flag = 0; @@ -1700,7 +1633,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regnode *convert; U8 nodetype =(U8)(flags & 0xFF); char *str=NULL; + #ifdef DEBUGGING + U32 mjd_offset; U32 mjd_nodelen; #endif @@ -1733,20 +1668,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", - mjd_offset,mjd_nodelen) + (UV)mjd_offset, (UV)mjd_nodelen) ); /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; - if ( trie->bitmap && !trie->widecharmap ) { + if ( trie->bitmap && !trie->widecharmap && !trie->jump ) { U32 state; DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n", (int)depth * 2 + 2, "", - TRIE_LASTSTATE(trie)) + (UV)trie->laststate) ); - for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) { + for ( state = 1 ; state < trie->laststate-1 ; state++ ) { U32 ofs = 0; I32 idx = -1; U32 count = 0; @@ -1770,7 +1705,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlIO_printf(Perl_debug_log, "%*sNew Start State=%"UVuf" Class: [", (int)depth * 2 + 2, "", - state)); + (UV)state)); if (idx >= 0) { SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0); const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); @@ -1798,7 +1733,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlIO_printf( Perl_debug_log, "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", (int)depth * 2 + 2, "", - state, idx, ch) + (UV)state, (UV)idx, ch) ); if ( state==1 ) { OP( convert ) = nodetype; @@ -1838,9 +1773,25 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } } if ( trie->maxlen ) { - OP( convert ) = TRIE; NEXT_OFF( convert ) = (U16)(tail - convert); ARG_SET( convert, data_slot ); + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. + We use this when dumping a trie and during optimisation. */ + if (trie->jump) + trie->jump[0] = (U16)(tail - nextbranch); + if (!jumper) + jumper= last; + /* XXXX */ + if ( !trie->states[trie->startstate].wordnum && trie->bitmap && + ( (char *)jumper - (char *)convert) >= sizeof(struct regnode_charclass) ) + { + OP( convert ) = TRIEC; + Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); + Safefree(trie->bitmap); + trie->bitmap= NULL; + } else + OP( convert ) = TRIE; /* store the type in the flags */ convert->flags = nodetype; @@ -1848,18 +1799,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } /* needed for dumping*/ DEBUG_r({ - regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ]; + regnode *optimize = convert + + NODE_STEP_REGNODE + + regarglen[ OP( convert ) ]; regnode *opt = convert; while (++optjump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE + : MADE_TRIE; +} + +STATIC void +S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) +{ +/* The Trie is constructed and compressed now so we can build a fail array now if its needed + + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 + ISBN 0-201-10088-6 + + We find the fail state for each state in the trie, this state is the longest proper + suffix of the current states 'word' that is also a proper prefix of another word in our + trie. State 1 represents the word '' and is the thus the default fail state. This allows + the DFA not to have to restart after its tried and failed a word at a given point, it + simply continues as though it had been matching the other word in the first place. + Consider + 'abcdgu'=~/abcdefg|cdgu/ + When we get to 'd' we are still matching the first word, we would encounter 'g' which would + fail, which would bring use to the state representing 'd' in the second word where we would + try 'g' and succeed, prodceding to match 'cdgu'. + */ + /* add a fail transition */ + reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)]; + U32 *q; + const U32 ucharcount = trie->uniquecharcount; + const U32 numstates = trie->laststate; + const U32 ubound = trie->lasttrans + ucharcount; + U32 q_read = 0; + U32 q_write = 0; + U32 charid; + U32 base = trie->states[ 1 ].trans.base; + U32 *fail; + reg_ac_data *aho; + const U32 data_slot = add_data( pRExC_state, 1, "T" ); + GET_RE_DEBUG_FLAGS_DECL; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + + ARG_SET( stclass, data_slot ); + Newxz( aho, 1, reg_ac_data ); + RExC_rx->data->data[ data_slot ] = (void*)aho; + aho->trie=trie; + aho->states=(reg_trie_state *)savepvn((const char*)trie->states, + (trie->laststate+1)*sizeof(reg_trie_state)); + Newxz( q, numstates, U32); + Newxz( aho->fail, numstates, U32 ); + aho->refcount = 1; + fail = aho->fail; + /* initialize fail[0..1] to be 1 so that we always have + a valid final fail state */ + fail[ 0 ] = fail[ 1 ] = 1; + + for ( charid = 0; charid < ucharcount ; charid++ ) { + const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); + if ( newstate ) { + q[ q_write ] = newstate; + /* set to point at the root */ + fail[ q[ q_write++ ] ]=1; + } + } + while ( q_read < q_write) { + const U32 cur = q[ q_read++ % numstates ]; + base = trie->states[ cur ].trans.base; + + for ( charid = 0 ; charid < ucharcount ; charid++ ) { + const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); + if (ch_state) { + U32 fail_state = cur; + U32 fail_base; + do { + fail_state = fail[ fail_state ]; + fail_base = aho->states[ fail_state ].trans.base; + } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); + + fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); + fail[ ch_state ] = fail_state; + if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) + { + aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; + } + q[ q_write++ % numstates] = ch_state; + } + } + } + /* restore fail[0..1] to 0 so that we "fall out" of the AC loop + when we fail in state 1, this allows us to use the + charclass scan to find a valid start char. This is based on the principle + that theres a good chance the string being searched contains lots of stuff + that cant be a start char. + */ + fail[ 0 ] = fail[ 1 ] = 0; + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), ""); + for( q_read=1; q_readEXACT=>X - BRANCH->EXACT=>X + BRANCH->EXACT=>x1 + BRANCH->EXACT=>x2 + tail which would be constructed from a pattern like /A|LIST|OF|WORDS/ @@ -2220,10 +2291,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, We have two cases - 1. patterns where the whole set of branch can be converted to a trie, + 1. patterns where the whole set of branch can be converted. - 2. patterns where only a subset of the alternations can be - converted to a trie. + 2. patterns where only a subset can be converted. In case 1 we can replace the whole set with a single regop for the trie. In case 2 we need to keep the start and end @@ -2232,19 +2302,24 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, 'BRANCH EXACT; BRANCH EXACT; BRANCH X' becomes BRANCH TRIE; BRANCH X; - Hypthetically when we know the regex isnt anchored we can - turn a case 1 into a DFA and let it rip... Every time it finds a match - it would just call its tail, no WHILEM/CURLY needed. + There is an additional case, that being where there is a + common prefix, which gets split out into an EXACT like node + preceding the TRIE node. + + If x(1..n)==tail then we can do a simple trie, if not we make + a "jump" trie, such that when we match the appropriate word + we "jump" to the appopriate tail node. Essentailly we turn + a nested if into a case structure of sorts. */ - if (PERL_ENABLE_TRIE_OPTIMISATION) { + int made=0; if (!re_trie_maxbuff) { re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); if (!SvIOK(re_trie_maxbuff)) sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } - if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) { + if ( SvIV(re_trie_maxbuff)>=0 ) { regnode *cur; regnode *first = (regnode *)NULL; regnode *last = (regnode *)NULL; @@ -2328,7 +2403,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, if ( (((first && optype!=NOTHING) ? OP( noper ) == optype : PL_regkind[ OP( noper ) ] == EXACT ) || OP(noper) == NOTHING ) - && noper_next == tail && countdata->data[ ARG(scan) ]; - min += trie->minlen; - delta += (trie->maxlen - trie->minlen); - flags &= ~SCF_DO_STCLASS; /* xxx */ - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state,data); /* Cannot expect anything... */ - data->pos_min += trie->minlen; - data->pos_delta += (trie->maxlen - trie->minlen); - if (trie->maxlen != trie->minlen) - data->longest = &(data->longest_float); - } - } -#endif else if (strchr((const char*)PL_varies,OP(scan))) { I32 mincount, maxcount, minnext, deltanext, fl = 0; I32 f = flags, pos_before = 0; @@ -2563,7 +2625,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, next = regnext(scan); if (OP(scan) == CURLYX) { I32 lp = (data ? *(data->last_closep) : 0); - scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX); + scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); } scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; next_is_eval = (OP(scan) == EVAL); @@ -3107,7 +3169,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, if (deltanext) { vFAIL("Variable length lookbehind not implemented"); } - else if (minnext > U8_MAX) { + else if (minnext > (I32)U8_MAX) { vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = (U8)minnext; @@ -3154,6 +3216,138 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, cl_anything(pRExC_state, data->start_class); flags &= ~SCF_DO_STCLASS; } +#ifdef TRIE_STUDY_OPT +#ifdef FULL_TRIE_STUDY + else if (PL_regkind[OP(scan)] == TRIE) { + /* NOTE - There is similar code to this block above for handling + BRANCH nodes on the initial study. If you change stuff here + check there too. */ + regnode *tail= regnext(scan); + reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ]; + I32 max1 = 0, min1 = I32_MAX; + struct regnode_charclass_class accum; + + if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ + scan_commit(pRExC_state, data); /* Cannot merge strings after this. */ + if (flags & SCF_DO_STCLASS) + cl_init_zero(pRExC_state, &accum); + + if (!trie->jump) { + min1= trie->minlen; + max1= trie->maxlen; + } else { + const regnode *nextbranch= NULL; + U32 word; + + for ( word=1 ; word <= trie->wordcount ; word++) + { + I32 deltanext=0, minnext=0, f = 0, fake; + struct regnode_charclass_class this_class; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + + if (flags & SCF_DO_STCLASS) { + cl_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + if (trie->jump[word]) { + if (!nextbranch) + nextbranch = tail - trie->jump[0]; + scan= tail - trie->jump[word]; + /* We go from the jump point to the branch that follows + it. Note this means we need the vestigal unused branches + even though they arent otherwise used. + */ + minnext = study_chunk(pRExC_state, &scan, &deltanext, + (regnode *)nextbranch, &data_fake, f,depth+1); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode*)nextbranch); + + if (min1 > (I32)(minnext + trie->minlen)) + min1 = minnext + trie->minlen; + if (max1 < (I32)(minnext + deltanext + trie->maxlen)) + max1 = minnext + deltanext + trie->maxlen; + if (deltanext == I32_MAX) + is_inf = is_inf_internal = 1; + + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + cl_or(pRExC_state, &accum, &this_class); + } + } + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + cl_or(pRExC_state, data->start_class, &accum); + if (min1) { + cl_and(data->start_class, &and_with); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + cl_and(data->start_class, &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + StructCopy(data->start_class, &and_with, + struct regnode_charclass_class); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, + struct regnode_charclass_class); + flags |= SCF_DO_STCLASS_OR; + data->start_class->flags |= ANYOF_EOS; + } + } + scan= tail; + continue; + } +#else + else if (PL_regkind[OP(scan)] == TRIE) { + reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ]; + U8*bang=NULL; + + min += trie->minlen; + delta += (trie->maxlen - trie->minlen); + flags &= ~SCF_DO_STCLASS; /* xxx */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state,data); /* Cannot expect anything... */ + data->pos_min += trie->minlen; + data->pos_delta += (trie->maxlen - trie->minlen); + if (trie->maxlen != trie->minlen) + data->longest = &(data->longest_float); + } + if (trie->jump) /* no more substrings -- for now /grr*/ + flags &= ~SCF_DO_SUBSTR; + } +#endif /* old or new */ +#endif /* TRIE_STUDY_OPT */ /* Else: zero-length, ignore. */ scan = regnext(scan); } @@ -3163,7 +3357,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, *deltap = is_inf_internal ? I32_MAX : delta; if (flags & SCF_DO_SUBSTR && is_inf) data->pos_delta = I32_MAX - data->pos_min; - if (is_par > U8_MAX) + if (is_par > (I32)U8_MAX) is_par = 0; if (is_par && pars==1 && data) { data->flags |= SF_IN_PAR; @@ -3175,8 +3369,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, &and_with); - if (flags & SCF_EXACT_TRIE) - data->flags |= SCF_EXACT_TRIE; + if (flags & SCF_TRIE_RESTUDY) + data->flags |= SCF_TRIE_RESTUDY; return min; } @@ -3229,6 +3423,15 @@ Perl_reginitcolors(pTHX) #endif +#ifdef TRIE_STUDY_OPT +#define CHECK_RESTUDY_GOTO \ + if ( \ + (data.flags & SCF_TRIE_RESTUDY) \ + && ! restudied++ \ + ) goto reStudy +#else +#define CHECK_RESTUDY_GOTO +#endif /* - pregcomp - compile a regular expression into internal code * @@ -3368,7 +3571,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_emit_start = r->program; RExC_emit = r->program; /* Store the count of eval-groups for security checks: */ - RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals); + RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); r->data = 0; if (reg(pRExC_state, 0, &flags,1) == NULL) @@ -3378,6 +3581,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) Newx(r->substrs, 1, struct reg_substr_data); reStudy: + minlen=sawplus=sawopen=0; Zero(r->substrs, 1, struct reg_substr_data); StructCopy(&zero_scan_data, &data, scan_data_t); @@ -3425,7 +3629,7 @@ reStudy: /* An {n,m} with n>0 */ (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) { - DEBUG_PEEP("first:",first,0); + if (OP(first) == PLUS) sawplus = 1; else @@ -3439,6 +3643,7 @@ reStudy: /* Starting-point info. */ again: + DEBUG_PEEP("first:",first,0); /* Ignore EXACT as we deal with it later. */ if (PL_regkind[OP(first)] == EXACT) { if (OP(first) == EXACT) @@ -3447,15 +3652,24 @@ reStudy: r->regstclass = first; } #ifdef TRIE_STCLASS - else if (OP(first) == TRIE && + else if (PL_regkind[OP(first)] == TRIE && ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0) { + regnode *trie_op; /* this can happen only on restudy */ - struct regnode_1 *trie_op; - Newxz(trie_op,1,struct regnode_1); - StructCopy(first,trie_op,struct regnode_1); - make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0); - r->regstclass = (regnode *)trie_op; + if ( OP(first) == TRIE ) { + struct regnode_1 *trieop; + Newxz(trieop,1,struct regnode_1); + StructCopy(first,trieop,struct regnode_1); + trie_op=(regnode *)trieop; + } else { + struct regnode_charclass *trieop; + Newxz(trieop,1,struct regnode_charclass); + StructCopy(first,trieop,struct regnode_charclass); + trie_op=(regnode *)trieop; + } + make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); + r->regstclass = trie_op; } #endif else if (strchr((const char*)PL_simple,OP(first))) @@ -3539,12 +3753,10 @@ reStudy: minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */ &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); -#ifdef TRIE_STUDY_OPT - if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) { - goto reStudy; - } -#endif + CHECK_RESTUDY_GOTO; + + if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen @@ -3673,11 +3885,7 @@ reStudy: minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); -#ifdef TRIE_STUDY_OPT - if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) { - goto reStudy; - } -#endif + CHECK_RESTUDY_GOTO; r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; @@ -3726,7 +3934,7 @@ reStudy: for (i = 1; i <= len; i++) { if (r->offsets[i*2-1] || r->offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", - i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]); + (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]); } PerlIO_printf(Perl_debug_log, "\n"); }); @@ -6201,7 +6409,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) ? "Overwriting end of array!\n" : "OK", (UV)(place - RExC_emit_start), (UV)(RExC_parse - RExC_start), - RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Node_Offset(place, RExC_parse); Set_Node_Length(place, 1); } @@ -6377,7 +6585,7 @@ Perl_regdump(pTHX_ const regexp *r) SV * const sv = sv_newmortal(); SV *dsv= sv_newmortal(); - (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0); + (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0); /* Header fields of interest. */ if (r->anchored_substr) { @@ -6683,8 +6891,6 @@ Perl_pregfree(pTHX_ struct regexp *r) { dVAR; - - GET_RE_DEBUG_FLAGS_DECL; if (!r || (--r->refcnt > 0)) @@ -6794,6 +7000,10 @@ Perl_pregfree(pTHX_ struct regexp *r) Safefree(trie->bitmap); if (trie->wordlen) Safefree(trie->wordlen); + if (trie->jump) + Safefree(trie->jump); + if (trie->nextword) + Safefree(trie->nextword); #ifdef DEBUGGING if (RX_DEBUG(r)) { if (trie->words) @@ -6946,31 +7156,41 @@ S_put_byte(pTHX_ SV *sv, int c) Perl_sv_catpvf(aTHX_ sv, "%c", c); } + #define CLEAR_OPTSTART \ if (optstart) STMT_START { \ DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \ optstart=NULL; \ } STMT_END -#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f); +#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); STATIC const regnode * S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, - const regnode *last, SV* sv, I32 l) + const regnode *last, const regnode *plast, + SV* sv, I32 indent, U32 depth) { dVAR; - register U8 op = EXACT; /* Arbitrary non-END op. */ + register U8 op = PSEUDO; /* Arbitrary non-END op. */ register const regnode *next; const regnode *optstart= NULL; GET_RE_DEBUG_FLAGS_DECL; - while (op != END && (!last || node < last)) { +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, + last ? last-start : 0,plast ? plast-start : 0); +#endif + + if (plast && plast < last) + last= plast; + + while (PL_regkind[op] != END && (!last || node < last)) { /* While that wasn't END last time... */ NODE_ALIGN(node); op = OP(node); if (op == CLOSE) - l--; + indent--; next = regnext((regnode *)node); /* Where, what. */ @@ -6984,14 +7204,18 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, regprop(r, sv, node); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), - (int)(2*l + 1), "", SvPVX_const(sv)); + (int)(2*indent + 1), "", SvPVX_const(sv)); if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, "(0)"); + else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + PerlIO_printf(Perl_debug_log, "(FAIL)"); else PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start)); - (void)PerlIO_putc(Perl_debug_log, '\n'); + + if (PL_regkind[(U8)op] != TRIE) + (void)PerlIO_putc(Perl_debug_log, '\n'); } after_print: @@ -7003,36 +7227,39 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, : next); if (last && nnode > last) nnode = last; - DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1); + DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); } } else if (PL_regkind[(U8)op] == BRANCH) { assert(next); - DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1); + DUMPUNTIL(NEXTOPER(node), next); } else if ( PL_regkind[(U8)op] == TRIE ) { const I32 n = ARG(node); const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n]; - const I32 arry_len = av_len(trie->words)+1; + const regnode *nextbranch= NULL; I32 word_idx; - PerlIO_printf(Perl_debug_log, - "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d", - (int)(2*(l+3)), - "", - trie->startstate, - TRIE_WORDCOUNT(trie), - (int)TRIE_CHARCOUNT(trie), - trie->uniquecharcount, - (IV)TRIE_LASTSTATE(trie)-1, - (int)trie->minlen, - (int)trie->maxlen - ); - if (trie->bitmap) { + + DEBUG_TRIE_COMPILE_r( + PerlIO_printf(Perl_debug_log, + " S:%"UVuf"/%"IVdf" W:%d L:%d/%d C:%d/%d ", + (UV)trie->startstate, + (IV)trie->laststate-1, + trie->wordcount, + (int)trie->minlen, + (int)trie->maxlen, + (int)TRIE_CHARCOUNT(trie), + trie->uniquecharcount + ); + ); + if ( op==TRIEC || trie->bitmap ) { int i; int rangestart= -1; + U8* bitmap= op==TRIEC ? ANYOF_BITMAP(node) : TRIE_BITMAP(trie); + sv_setpvn(sv, "", 0); for (i = 0; i <= 256; i++) { - if (i < 256 && TRIE_BITMAP_TEST(trie,i)) { + if (i < 256 && BITMAP_TEST(bitmap,i)) { if (rangestart == -1) rangestart = i; } else if (rangestart != -1) { @@ -7047,39 +7274,54 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, rangestart = -1; } } - PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv)); + PerlIO_printf(Perl_debug_log, "[%s]\n", SvPVX_const(sv)); } else - PerlIO_printf(Perl_debug_log, " No-Stcls]\n"); + PerlIO_printf(Perl_debug_log, "\n"); - for (word_idx=0; word_idx < arry_len; word_idx++) { + + + for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie->words,word_idx,0); - if (elem_ptr) - PerlIO_printf(Perl_debug_log, "%*s%s\n", - (int)(2*(l+4)), "", - pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, + + PerlIO_printf(Perl_debug_log, "%*s%s ", + (int)(2*(indent+3)), "", + elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, PL_colors[0], PL_colors[1], (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | PERL_PV_PRETTY_ELIPSES | PERL_PV_PRETTY_LTGT - ) - ); + ) + : "???" + ); + if (trie->jump) { + U16 dist= trie->jump[word_idx+1]; + PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start); + if (dist) { + if (!nextbranch) + nextbranch= next - trie->jump[0]; + DUMPUNTIL(next - dist, nextbranch); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode *)nextbranch); + } else { + PerlIO_printf(Perl_debug_log, "\n"); } - - node = NEXTOPER(node); - node += regarglen[(U8)op]; - + } + if (last && next > last) + node= last; + else + node= next; } - else if ( op == CURLY) { /* "next" might be very big: optimizer */ - DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS, - NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1); + else if ( op == CURLY ) { /* "next" might be very big: optimizer */ + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, + NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); } else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { assert(next); - DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS, - next, sv, l + 1); + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); } else if ( op == PLUS || op == STAR) { - DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1); + DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); } else if (op == ANYOF) { /* arglen 1 + class block */ @@ -7097,12 +7339,15 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, node += regarglen[(U8)op]; } if (op == CURLYX || op == OPEN) - l++; + indent++; else if (op == WHILEM) - l--; + indent--; } CLEAR_OPTSTART; - return node; +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d\n",indent); +#endif + return last ? last : node; } #endif /* DEBUGGING */ diff --git a/regcomp.h b/regcomp.h index 5c35f63..b6f3617 100644 --- a/regcomp.h +++ b/regcomp.h @@ -103,6 +103,7 @@ struct regnode_2 { #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ #define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ +/* also used by trie */ struct regnode_charclass { U8 flags; U8 type; @@ -152,6 +153,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ARG(p) ARG_VALUE(ARG_LOC(p)) #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) + #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) @@ -187,6 +189,8 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) + + #define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ #define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) @@ -288,7 +292,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ANYOF_BITMAP_ZERO(ret) Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char) #define ANYOF_BITMAP(p) (((struct regnode_charclass*)(p))->bitmap) -#define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[((c) >> 3) & 31]) +#define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[(((U8)(c)) >> 3) & 31]) #define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c)) #define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c)) #define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) @@ -305,6 +309,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ANYOF_CLASS_SKIP ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode)) #define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) + /* * Utility definitions. */ @@ -454,24 +459,28 @@ typedef struct _reg_trie_trans reg_trie_trans; /* anything in here that needs to be freed later should be dealt with in pregfree */ struct _reg_trie_data { - U16 uniquecharcount; - U32 lasttrans; - U16 *charmap; - HV *widecharmap; - reg_trie_state *states; - reg_trie_trans *trans; - char *bitmap; - U32 refcount; - U32 startstate; - STRLEN minlen; - STRLEN maxlen; - U32 *wordlen; - U32 laststate; /* Build only */ + U16 uniquecharcount; /* unique chars in trie (width of trans table) */ + U32 lasttrans; /* last valid transition element */ + U16 *charmap; /* byte to charid lookup array */ + HV *widecharmap; /* code points > 255 to charid */ + reg_trie_state *states; /* state data */ + reg_trie_trans *trans; /* array of transition elements */ + char *bitmap; /* stclass bitmap */ + U32 refcount; /* number of times this trie is referenced */ + U32 startstate; /* initial state - used for common prefix optimisation */ + STRLEN minlen; /* minimum length of words in trie - build/opt only? */ + STRLEN maxlen; /* maximum length of words in trie - build/opt only? */ + U32 *wordlen; /* array of lengths of words */ + U16 *jump; /* optional 1 indexed array of offsets before tail + for the node following a given word. */ + U16 *nextword; /* optional 1 indexed array to support linked list + of duplicate wordnums */ + U32 laststate; /* Build only */ + U32 wordcount; /* Build only */ #ifdef DEBUGGING - U16 wordcount; /* Build only */ - STRLEN charcount; /* Build only */ - AV *words; - AV *revcharmap; + STRLEN charcount; /* Build only */ + AV *words; /* Array of words contained in trie, for dumping */ + AV *revcharmap; /* Map of each charid back to its character representation */ #endif }; typedef struct _reg_trie_data reg_trie_data; @@ -489,11 +498,13 @@ typedef struct _reg_ac_data reg_ac_data; three different sets... */ #define TRIE_BITMAP(p) (((reg_trie_data *)(p))->bitmap) -#define TRIE_BITMAP_BYTE(p, c) (TRIE_BITMAP(p)[(((U8)c) >> 3) & 31]) +#define TRIE_BITMAP_BYTE(p, c) (TRIE_BITMAP(p)[(((U8)(c)) >> 3) & 31]) #define TRIE_BITMAP_SET(p, c) (TRIE_BITMAP_BYTE(p, c) |= ANYOF_BIT((U8)c)) #define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c)) #define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) +#define BITMAP_BYTE(p, c) (((U8*)p)[(((U8)(c)) >> 3) & 31]) +#define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) /* these defines assume uniquecharcount is the correct variable, and state may be evaluated twice */ #define TRIE_NODENUM(state) (((state)-1)/(trie->uniquecharcount)+1) @@ -501,15 +512,10 @@ typedef struct _reg_ac_data reg_ac_data; #define TRIE_NODEIDX(state) ((state) ? (((state)-1)*(trie->uniquecharcount)+1) : (state)) #ifdef DEBUGGING -#define TRIE_WORDCOUNT(trie) ((trie)->wordcount) #define TRIE_CHARCOUNT(trie) ((trie)->charcount) -#define TRIE_LASTSTATE(trie) ((trie)->laststate) #define TRIE_REVCHARMAP(trie) ((trie)->revcharmap) #else -#define TRIE_WORDCOUNT(trie) (trie_wordcount) #define TRIE_CHARCOUNT(trie) (trie_charcount) -/*#define TRIE_LASTSTATE(trie) (trie_laststate)*/ -#define TRIE_LASTSTATE(trie) ((trie)->laststate) #define TRIE_REVCHARMAP(trie) (trie_revcharmap) #endif diff --git a/regcomp.sym b/regcomp.sym index 1a2bd31..f62b7bf 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -122,11 +122,13 @@ OPTIMIZED NOTHING,off Placeholder for dump. # Trie Related (behave the same as A|LIST|OF|WORDS would) TRIE TRIE, trie 1 Match many EXACT(FL?)? at once. flags==type -TRIEC TRIE, trie 1 Trie + charclass. (unused at present) +TRIEC TRIE, trie charclass Same as TRIE, but with embedded charclass data +# NEW STUFF HERE # Special opcode with the property that no opcode in a compiled program # will ever be of this type. Thus it can be used as a flag value that # no other opcode has been seen. END is used similarly, in that an END # node cant be optimized. So END implies "unoptimizable" and PSEUDO mean # "not seen anything to optimize yet". PSEUDO PSEUDO,off Pseudo opcode for internal use. +# NOTHING BELOW HERE \ No newline at end of file diff --git a/regexec.c b/regexec.c index 3731b60..75cc2f3 100644 --- a/regexec.c +++ b/regexec.c @@ -782,7 +782,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, /* Last resort... */ /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ - if (prog->regstclass && OP(prog->regstclass)!=TRIE) { + if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) { /* minlen == 0 is possible if regstclass is \b or \B, and the fixed substr is ''$. Since minlen is already taken into account, s+1 is before strend; @@ -895,10 +895,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, return NULL; } -/* We know what class REx starts with. Try to find this position... */ -/* if reginfo is NULL, its a dryrun */ -/* annoyingly all the vars in this routine have different names from their counterparts - in regmatch. /grrr */ + #define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \ foldlen, foldbuf, uniflags) STMT_START { \ @@ -1038,6 +1035,14 @@ if ((!reginfo || regtry(reginfo, s))) \ } \ break +#define DUMP_EXEC_POS(li,s,doutf8) \ + dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8) + +/* We know what class REx starts with. Try to find this position... */ +/* if reginfo is NULL, its a dryrun */ +/* annoyingly all the vars in this routine have different names from their counterparts + in regmatch. /grrr */ + STATIC char * S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, const regmatch_info *reginfo) @@ -1324,8 +1329,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, !isDIGIT_LC_utf8((U8*)s), !isDIGIT_LC(*s) ); + case TRIEC: case TRIE: - /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/ { const enum { trie_plain, trie_utf8, trie_utf8_fold } trie_type = do_utf8 ? @@ -1343,11 +1348,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, STRLEN maxlen = trie->maxlen; SV *sv_points; U8 **points; /* map of where we were in the input string - when reading a given string. For ASCII this + when reading a given char. For ASCII this is unnecessary overhead as the relationship is always 1:1, but for unicode, especially case folded unicode this is not true. */ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U8 *bitmap=NULL; + GET_RE_DEBUG_FLAGS_DECL; @@ -1362,13 +1369,29 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, SvPOK_on(sv_points); sv_2mortal(sv_points); points=(U8**)SvPV_nolen(sv_points ); - - if (trie->bitmap && trie_type != trie_utf8_fold) { - while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) { - s++; - } + if ( trie_type != trie_utf8_fold && (trie->bitmap || OP(c)==TRIEC) ) { + if (trie->bitmap) + bitmap=(U8*)trie->bitmap; + else + bitmap=(U8*)ANYOF_BITMAP(c); } - + /* this is the Aho-Corasick algorithm modified a touch + to include special handling for long "unknown char" + sequences. The basic idea being that we use AC as long + as we are dealing with a possible matching char, when + we encounter an unknown char (and we have not encountered + an accepting state) we scan forward until we find a legal + starting char. + AC matching is basically that of trie matching, except + that when we encounter a failing transition, we fall back + to the current states "fail state", and try the current char + again, a process we repeat until we reach the root state, + state 1, or a legal transition. If we fail on the root state + then we can either terminate if we have reached an accepting + state previously, or restart the entire process from the beginning + if we have not. + + */ while (s <= last_start) { const U32 uniflags = UTF8_ALLOW_DEFAULT; U8 *uc = (U8*)s; @@ -1380,39 +1403,66 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, STRLEN foldlen = 0; U8 *uscan = (U8*)NULL; U8 *leftmost = NULL; - +#ifdef DEBUGGING + U32 accepted_word= 0; +#endif U32 pointpos = 0; while ( state && uc <= (U8*)strend ) { int failed=0; - if (aho->states[ state ].wordnum) { - U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ]; - if (!leftmost || lpos < leftmost) + U32 word = aho->states[ state ].wordnum; + + if( state==1 && bitmap ) { + DEBUG_TRIE_EXECUTE_r( + if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + dump_exec_pos( (char *)uc, c, strend, real_start, + uc, do_utf8 ); + PerlIO_printf( Perl_debug_log, + " Scanning for legal start char...\n"); + } + ); + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + s= (char *)uc; + if (uc >(U8*)last_start) break; + } + + if ( word ) { + U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=word); leftmost= lpos; + } if (base==0) break; + } points[pointpos++ % maxlen]= uc; REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags); - DEBUG_TRIE_EXECUTE_r( + DEBUG_TRIE_EXECUTE_r({ + dump_exec_pos( (char *)uc, c, strend, real_start, + s, do_utf8 ); PerlIO_printf(Perl_debug_log, - "Pos: %d Charid:%3x CV:%4"UVxf" ", - (int)((const char*)uc - real_start), charid, uvc) - ); - uc += len; + " Charid:%3u CP:%4"UVxf" ", + charid, uvc); + }); do { #ifdef DEBUGGING - U32 word = aho->states[ state ].wordnum; + word = aho->states[ state ].wordnum; #endif base = aho->states[ state ].trans.base; - DEBUG_TRIE_EXECUTE_r( + DEBUG_TRIE_EXECUTE_r({ + if (failed) + dump_exec_pos( (char *)uc, c, strend, real_start, + s, do_utf8 ); PerlIO_printf( Perl_debug_log, - "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n", - failed ? "Fail transition to " : "", - (UV)state, (UV)base, (UV)uvc, (UV)word) - ); + "%sState: %4"UVxf", word=%"UVxf, + failed ? " Fail transition to " : "", + (UV)state, (UV)word); + }); if ( base ) { U32 tmp; if (charid && @@ -1424,53 +1474,60 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, && (tmp=trie->trans[base + charid - 1 - trie->uniquecharcount ].next)) { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - legal\n")); state = tmp; break; } else { - failed++; - if ( state == 1 ) - break; - else - state = aho->fail[state]; + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - fail\n")); + failed = 1; + state = aho->fail[state]; } } else { /* we must be accepting here */ - failed++; + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - accepting\n")); + failed = 1; break; } } while(state); + uc += len; if (failed) { if (leftmost) break; - else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) { - while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) { - uc++; - } - } + if (!state) state = 1; } } if ( aho->states[ state ].wordnum ) { U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ]; - if (!leftmost || lpos < leftmost) + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=aho->states[ state ].wordnum); leftmost = lpos; + } } - DEBUG_TRIE_EXECUTE_r( - PerlIO_printf( Perl_debug_log, - "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n", - "All done: ", - (UV)state, (UV)base, (UV)uvc) - ); if (leftmost) { s = (char*)leftmost; + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( + Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n", + (UV)accepted_word, s - real_start + ); + }); if (!reginfo || regtry(reginfo, s)) { FREETMPS; LEAVE; goto got_it; } s = HOPc(s,1); + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); + }); } else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log,"No match.\n")); break; } } @@ -1775,11 +1832,11 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * }); goto phooey; } - else if ((c = prog->regstclass)) { + else if ( (c = prog->regstclass) ) { if (minlen) { const OPCODE op = OP(prog->regstclass); /* don't bother with what can't match */ - if (PL_regkind[op] != EXACT && op != CANY && op != TRIE) + if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) strend = HOPc(strend, -(minlen - 1)); } DEBUG_EXECUTE_r({ @@ -2303,27 +2360,32 @@ S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8, } STATIC void -S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8) +S_dump_exec_pos(pTHX_ const char *locinput, + const regnode *scan, + const char *loc_regeol, + const char *loc_bostr, + const char *loc_reg_starttry, + const bool do_utf8) { - const int docolor = *PL_colors[0]; + const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ - int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput); + int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); /* The part of the string before starttry has one color (pref0_len chars), between starttry and current position another one (pref_len - pref0_len chars), after the current position the third one. We assume that pref0_len <= pref_len, otherwise we decrease pref0_len. */ - int pref_len = (locinput - PL_bostr) > (5 + taill) - l - ? (5 + taill) - l : locinput - PL_bostr; + int pref_len = (locinput - loc_bostr) > (5 + taill) - l + ? (5 + taill) - l : locinput - loc_bostr; int pref0_len; while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) pref_len++; - pref0_len = pref_len - (locinput - PL_reg_starttry); - if (l + pref_len < (5 + taill) && l < PL_regeol - locinput) - l = ( PL_regeol - locinput > (5 + taill) - pref_len - ? (5 + taill) - pref_len : PL_regeol - locinput); + pref0_len = pref_len - (locinput - loc_reg_starttry); + if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) + l = ( loc_regeol - locinput > (5 + taill) - pref_len + ? (5 + taill) - pref_len : loc_regeol - locinput); while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) l--; if (pref0_len < 0) @@ -2334,18 +2396,18 @@ S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_u const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0; RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), - (locinput - pref_len),pref0_len, 60, 4, 5); + (locinput - pref_len),pref0_len, pref0_len, 4, 5); RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), (locinput - pref_len + pref0_len), - pref_len - pref0_len, 60, 2, 3); + pref_len - pref0_len, pref_len - pref0_len, 2, 3); RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), - locinput, PL_regeol - locinput, 60, 0, 1); + locinput, loc_regeol - locinput, l, 0, 1); PerlIO_printf(Perl_debug_log, "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", - (IV)(locinput - PL_bostr), + (IV)(locinput - loc_bostr), len0, s0, len1, s1, (docolor ? "" : "> <"), @@ -2418,6 +2480,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) st->sw = 0; st->logical = 0; st->cc = NULL; + /* Note that nextchr is a byte even in UTF */ nextchr = UCHARAT(locinput); scan = prog; @@ -2425,7 +2488,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) DEBUG_EXECUTE_r( { SV * const prop = sv_newmortal(); - dump_exec_pos( locinput, scan, do_utf8 ); + DUMP_EXEC_POS( locinput, scan, do_utf8 ); regprop(rex, prop, scan); PerlIO_printf(Perl_debug_log, @@ -2513,7 +2576,22 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) #undef ST #define ST st->u.trie - + case TRIEC: + /* In this case the charclass data is available inline so + we can fail fast without a lot of extra overhead. + */ + if (scan->flags == EXACT || !do_utf8) { + if(!ANYOF_BITMAP_TEST(scan, *locinput)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + /* NOTREACHED */ + } + } + /* FALL THROUGH */ case TRIE: { /* what type of TRIE am I? (utf8 makes this contextual) */ @@ -2527,23 +2605,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) = (reg_trie_data*)rex->data->data[ ARG( scan ) ]; U32 state = trie->startstate; - U8 *uc = ( U8* )locinput; - U16 charid = 0; - U32 base = 0; - UV uvc = 0; - STRLEN len = 0; - STRLEN foldlen = 0; - U8 *uscan = (U8*)NULL; - STRLEN bufflen=0; - SV *sv_accept_buff = NULL; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - - ST.accepted = 0; /* how many accepting states we have seen */ - ST.B = next; -#ifdef DEBUGGING - ST.me = scan; -#endif - if (trie->bitmap && trie_type != trie_utf8_fold && !TRIE_BITMAP_TEST(trie,*locinput) ) { @@ -2557,36 +2618,65 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) } else { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "%*s %sfailed to match start class...%s\n", + "%*s %sfailed to match trie start class...%s\n", REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]) ); sayNO_SILENT; } } + { + U8 *uc = ( U8* )locinput; + + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + STRLEN bufflen=0; + SV *sv_accept_buff = NULL; + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + + ST.accepted = 0; /* how many accepting states we have seen */ + ST.B = next; + ST.jump = trie->jump; + +#ifdef DEBUGGING + ST.me = scan; +#endif + + + /* traverse the TRIE keeping track of all accepting states we transition through until we get to a failing node. */ while ( state && uc <= (U8*)PL_regeol ) { - - if (trie->states[ state ].wordnum) { - if (!ST.accepted ) { + U32 base = trie->states[ state ].trans.base; + UV uvc; + U16 charid; + /* We use charid to hold the wordnum as we don't use it + for charid until after we have done the wordnum logic. + We define an alias just so that the wordnum logic reads + more naturally. */ + +#define got_wordnum charid + got_wordnum = trie->states[ state ].wordnum; + + if ( got_wordnum ) { + if ( ! ST.accepted ) { ENTER; SAVETMPS; bufflen = TRIE_INITAL_ACCEPT_BUFFLEN; sv_accept_buff=newSV(bufflen * sizeof(reg_trie_accepted) - 1); - SvCUR_set(sv_accept_buff, - sizeof(reg_trie_accepted)); + SvCUR_set(sv_accept_buff, 0); SvPOK_on(sv_accept_buff); sv_2mortal(sv_accept_buff); SAVETMPS; ST.accept_buff = (reg_trie_accepted*)SvPV_nolen(sv_accept_buff ); } - else { + do { if (ST.accepted >= bufflen) { bufflen *= 2; ST.accept_buff =(reg_trie_accepted*) @@ -2595,20 +2685,21 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) } SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff) + sizeof(reg_trie_accepted)); - } - ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum; - ST.accept_buff[ST.accepted].endpos = uc; - ++ST.accepted; - } - base = trie->states[ state ].trans.base; + + ST.accept_buff[ST.accepted].wordnum = got_wordnum; + ST.accept_buff[ST.accepted].endpos = uc; + ++ST.accepted; + } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum])); + } +#undef got_wordnum DEBUG_TRIE_EXECUTE_r({ - dump_exec_pos( (char *)uc, scan, do_utf8 ); + DUMP_EXEC_POS( (char *)uc, scan, do_utf8 ); PerlIO_printf( Perl_debug_log, - "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ", + "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ", 2+PL_regindent * 2, "", PL_colors[4], - (UV)state, (UV)base, (UV)ST.accepted ); + (UV)state, (UV)ST.accepted ); }); if ( base ) { @@ -2636,7 +2727,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) } DEBUG_TRIE_EXECUTE_r( PerlIO_printf( Perl_debug_log, - "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n", + "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", charid, uvc, (UV)state, PL_colors[5] ); ); } @@ -2649,7 +2740,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)ST.accepted, PL_colors[5] ); ); - } + }} /* FALL THROUGH */ @@ -2677,7 +2768,12 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) LEAVE; locinput = PL_reginput; nextchr = UCHARAT(locinput); - scan = ST.B; + + if ( !ST.jump ) + scan = ST.B; + else + scan = ST.B - ST.jump[ST.accept_buff[0].wordnum]; + continue; /* execute rest of RE */ } @@ -2736,8 +2832,15 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) best = ST.accepted; } PL_reginput = (char *)ST.accept_buff[ best ].endpos; + if ( !ST.jump ) { + PUSH_STATE_GOTO(TRIE_next, ST.B); + /* NOTREACHED */ + } else { + PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]); + /* NOTREACHED */ + } + /* NOTREACHED */ } - PUSH_STATE_GOTO(TRIE_next, ST.B); /* NOTREACHED */ #undef ST @@ -4281,6 +4384,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) newst->sw = 0; newst->logical = 0; + locinput = PL_reginput; nextchr = UCHARAT(locinput); st = newst; @@ -4317,6 +4421,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) st->minmod = 0; st->sw = 0; st->logical = 0; + #ifdef DEBUGGING PL_regindent++; #endif @@ -4433,8 +4538,9 @@ yes: no: DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "%*s %sfailed...%s\n", - REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]) + "%*s %sfailed...%s\n", + REPORT_CODE_OFF+PL_regindent*2, "", + PL_colors[4], PL_colors[5]) ); no_final: do_no: diff --git a/regexp.h b/regexp.h index 8d71752..1a4ceb4 100644 --- a/regexp.h +++ b/regexp.h @@ -212,6 +212,7 @@ typedef struct regmatch_state { struct { reg_trie_accepted *accept_buff; U32 accepted; /* how many accepting states we have seen */ + U16 *jump; /* negative offsets from B */ regnode *B; /* node following the trie */ regnode *me; /* only needed for debugging */ } trie; diff --git a/regnodes.h b/regnodes.h index 3da3bad..d8e3006 100644 --- a/regnodes.h +++ b/regnodes.h @@ -67,7 +67,7 @@ #define RENUM 60 /* 0x3c Group with independently numbered parens. */ #define OPTIMIZED 61 /* 0x3d Placeholder for dump. */ #define TRIE 62 /* 0x3e Match many EXACT(FL?)? at once. flags==type */ -#define TRIEC 63 /* 0x3f Trie + charclass. (unused at present) */ +#define TRIEC 63 /* 0x3f Same as TRIE, but with embedded charclass data */ #define PSEUDO 64 /* 0x40 Pseudo opcode for internal use. */ #define REGNODE_MAX 64 @@ -209,7 +209,7 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* RENUM */ 0, /* OPTIMIZED */ EXTRA_SIZE(struct regnode_1), /* TRIE */ - EXTRA_SIZE(struct regnode_1), /* TRIEC */ + EXTRA_SIZE(struct regnode_charclass), /* TRIEC */ 0, /* PSEUDO */ }; diff --git a/t/op/pat.t b/t/op/pat.t index f0f1b2b..4ff133b 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -7,7 +7,7 @@ $| = 1; # please update note at bottom of file when you change this -print "1..1231\n"; +print "1..1232\n"; BEGIN { chdir 't' if -d 't'; @@ -3394,6 +3394,12 @@ ok(("foba ba$s" =~ qr/(foo|Bass|bar)/i) ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) && $1 eq "ba$s", "TRIEF + LATIN SMALL LETTER SHARP S =~ SS"); + +ok(("foba ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i) + && $1 eq "ba${s}pxySS$s$s", + "COMMON PREFIX TRIEF + LATIN SMALL LETTER SHARP S"); + + } diff --git a/t/op/re_tests b/t/op/re_tests index d35104f..f8ee725 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -976,3 +976,17 @@ a(?!b(?!c))(..) abababc y $1 bc # test nested negatives a(?!b(?=a))(..) abababc y $1 bc # test nested lookaheads a(?!b(?!c(?!d(?!e))))...(.) abxabcdxabcde y $1 e X(?!b+(?!(c+)*(?!(c+)*d))).*X aXbbbbbbbcccccccccccccaaaX y - - +^(XXXXXXXXXX|YYYYYYYYYY|Z.Q*X|Z[TE]Q*P): ZEQQQQQQQQQQQQQQQQQQP: y $1 ZEQQQQQQQQQQQQQQQQQQP +^(XXXXXXXXXX|YYYYYYYYYY|Z.Q*X|Z[TE]Q*P): ZEQQQX: y $1 ZEQQQX +^([TUV]+|XXXXXXXXXX|YYYYYYYYYY|Z.Q*X|Z[TE]Q*P): ZEQQQQQQQQQQQQQQQQQQP: y $1 ZEQQQQQQQQQQQQQQQQQQP +^([TUV]+|XXXXXXXXXX|YYYYYYYYYY|Z.Q*X|Z[TE]Q*P): ZEQQQX: y $1 ZEQQQX +^([TUV]+|XXXXXXXXXX|YYYYYYYYYY|Z.Q*X|Z[TE]Q*P|[MKJ]): ZEQQQQQQQQQQQQQQQQQQP: y $1 ZEQQQQQQQQQQQQQQQQQQP +^([TUV]+|XXXXXXXXXX|YYYYYYYYYY|Z.Q*X|Z[TE]Q*P|[MKJ]): ZEQQQX: y $1 ZEQQQX +^(XXX|YYY|Z.Q*X|Z[TE]Q*P): ZEQQQQQQQQQQQQQQQQQQP: y $1 ZEQQQQQQQQQQQQQQQQQQP +^(XXX|YYY|Z.Q*X|Z[TE]Q*P): ZEQQQX: y $1 ZEQQQX +^([TUV]+|XXX|YYY|Z.Q*X|Z[TE]Q*P): ZEQQQQQQQQQQQQQQQQQQP: y $1 ZEQQQQQQQQQQQQQQQQQQP +^([TUV]+|XXX|YYY|Z.Q*X|Z[TE]Q*P): ZEQQQX: y $1 ZEQQQX +^([TUV]+|XXX|YYY|Z.Q*X|Z[TE]Q*P|[MKJ]): ZEQQQQQQQQQQQQQQQQQQP: y $1 ZEQQQQQQQQQQQQQQQQQQP +^([TUV]+|XXX|YYY|Z.Q*X|Z[TE]Q*P|[MKJ]): ZEQQQX: y $1 ZEQQQX +X(?:ABCF[cC]x*|ABCD|ABCF):(?:DIT|DID|DIM) XABCFCxxxxxxxxxx:DIM y $& XABCFCxxxxxxxxxx:DIM +(((ABCD|ABCE|ABCF)))(A|B|C[xy]*): ABCFCxxxxxxxxxx:DIM y $& ABCFCxxxxxxxxxx: