From: Yves Orton Date: Sat, 9 Sep 2006 22:00:05 +0000 (+0200) Subject: Teach regex optimiser how to handle (?=) and (?<=) properly. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1de063289cf096bd67e3d9d1b4a6dca2498966fa;p=p5sagit%2Fp5-mst-13.2.git Teach regex optimiser how to handle (?=) and (?<=) properly. Message-ID: <9b18b3110609091300x1fd0b15dt32932902a0a80674@mail.gmail.com> p4raw-id: //depot/perl@28816 --- diff --git a/embed.fnc b/embed.fnc index a34cb00..082a7c5 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1313,7 +1313,7 @@ Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth EsRn |char* |regwhite |NN char *p|NN const char *e Es |char* |nextchar |NN struct RExC_state_t *state -Es |void |scan_commit |NN const struct RExC_state_t* state|NN struct scan_data_t *data +Es |void |scan_commit |NN const struct RExC_state_t* state|NN struct scan_data_t *data|NN I32 *minlenp Esn |void |cl_anything |NN const struct RExC_state_t* state|NN struct regnode_charclass_class *cl EsRn |int |cl_is_anything |NN const struct regnode_charclass_class *cl Esn |void |cl_init |NN const struct RExC_state_t* state|NN struct regnode_charclass_class *cl @@ -1323,7 +1323,8 @@ Esn |void |cl_and |NN struct regnode_charclass_class *cl \ Esn |void |cl_or |NN const struct RExC_state_t* state|NN struct regnode_charclass_class *cl \ |NN const struct regnode_charclass_class *or_with Es |I32 |study_chunk |NN struct RExC_state_t* state|NN regnode **scanp \ - |NN I32 *deltap|NN regnode *last|NULLOK struct scan_data_t *data \ + |NN I32 *minlenp|NN I32 *deltap \ + |NN regnode *last|NULLOK struct scan_data_t *data \ |U32 flags|U32 depth EsRn |I32 |add_data |NN struct RExC_state_t* state|I32 n|NN const char *s rs |void |re_croak2 |NN const char* pat1|NN const char* pat2|... @@ -1357,6 +1358,7 @@ ERs |bool |reginclass |NULLOK const regexp *prog|NN const regnode *n|NN const U8 Es |CHECKPOINT|regcppush |I32 parenfloor Es |char* |regcppop |NN const regexp *rex ERsn |U8* |reghop3 |NN U8 *pos|I32 off|NN const U8 *lim +ERsn |U8* |reghop4 |NN U8 *pos|I32 off|NN const U8 *llim|NN const U8 *rlim ERsn |U8* |reghopmaybe3 |NN U8 *pos|I32 off|NN const U8 *lim ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK const regmatch_info *reginfo Es |void |to_utf8_substr |NN regexp * prog diff --git a/embed.h b/embed.h index 6959cb9..dbb6ca3 100644 --- a/embed.h +++ b/embed.h @@ -1356,6 +1356,7 @@ #define regcppush S_regcppush #define regcppop S_regcppop #define reghop3 S_reghop3 +#define reghop4 S_reghop4 #define reghopmaybe3 S_reghopmaybe3 #define find_byclass S_find_byclass #define to_utf8_substr S_to_utf8_substr @@ -3504,14 +3505,14 @@ #define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f) #define regwhite S_regwhite #define nextchar(a) S_nextchar(aTHX_ a) -#define scan_commit(a,b) S_scan_commit(aTHX_ a,b) +#define scan_commit(a,b,c) S_scan_commit(aTHX_ a,b,c) #define cl_anything S_cl_anything #define cl_is_anything S_cl_is_anything #define cl_init S_cl_init #define cl_init_zero S_cl_init_zero #define cl_and S_cl_and #define cl_or S_cl_or -#define study_chunk(a,b,c,d,e,f,g) S_study_chunk(aTHX_ a,b,c,d,e,f,g) +#define study_chunk(a,b,c,d,e,f,g,h) S_study_chunk(aTHX_ a,b,c,d,e,f,g,h) #define add_data S_add_data #endif #ifdef PERL_CORE @@ -3542,6 +3543,7 @@ #define regcppush(a) S_regcppush(aTHX_ a) #define regcppop(a) S_regcppop(aTHX_ a) #define reghop3 S_reghop3 +#define reghop4 S_reghop4 #define reghopmaybe3 S_reghopmaybe3 #define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e) #define to_utf8_substr(a) S_to_utf8_substr(aTHX_ a) diff --git a/proto.h b/proto.h index ae714fb..48a66bb 100644 --- a/proto.h +++ b/proto.h @@ -3587,9 +3587,10 @@ STATIC char* S_regwhite(char *p, const char *e) STATIC char* S_nextchar(pTHX_ struct RExC_state_t *state) __attribute__nonnull__(pTHX_1); -STATIC void S_scan_commit(pTHX_ const struct RExC_state_t* state, struct scan_data_t *data) +STATIC void S_scan_commit(pTHX_ const struct RExC_state_t* state, struct scan_data_t *data, I32 *minlenp) __attribute__nonnull__(pTHX_1) - __attribute__nonnull__(pTHX_2); + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); STATIC void S_cl_anything(const struct RExC_state_t* state, struct regnode_charclass_class *cl) __attribute__nonnull__(1) @@ -3616,11 +3617,12 @@ STATIC void S_cl_or(const struct RExC_state_t* state, struct regnode_charclass_c __attribute__nonnull__(2) __attribute__nonnull__(3); -STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t* state, regnode **scanp, I32 *deltap, regnode *last, struct scan_data_t *data, U32 flags, U32 depth) +STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t* state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, U32 flags, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3) - __attribute__nonnull__(pTHX_4); + __attribute__nonnull__(pTHX_4) + __attribute__nonnull__(pTHX_5); STATIC I32 S_add_data(struct RExC_state_t* state, I32 n, const char *s) __attribute__warn_unused_result__ @@ -3707,6 +3709,12 @@ STATIC U8* S_reghop3(U8 *pos, I32 off, const U8 *lim) __attribute__nonnull__(1) __attribute__nonnull__(3); +STATIC U8* S_reghop4(U8 *pos, I32 off, const U8 *llim, const U8 *rlim) + __attribute__warn_unused_result__ + __attribute__nonnull__(1) + __attribute__nonnull__(3) + __attribute__nonnull__(4); + STATIC U8* S_reghopmaybe3(U8 *pos, I32 off, const U8 *lim) __attribute__warn_unused_result__ __attribute__nonnull__(1) diff --git a/regcomp.c b/regcomp.c index 1c64fd9..48d6aa2 100644 --- a/regcomp.c +++ b/regcomp.c @@ -174,23 +174,102 @@ typedef struct RExC_state_t { #define FULL_TRIE_STUDY #define TRIE_STCLASS #endif -/* Length of a variant. */ + + +/* About scan_data_t. + + During optimisation we recurse through the regexp program performing + various inplace (keyhole style) optimisations. In addition study_chunk + and scan_commit populate this data structure with information about + what strings MUST appear in the pattern. We look for the longest + string that must appear for at a fixed location, and we look for the + longest string that may appear at a floating location. So for instance + in the pattern: + + /FOO[xX]A.*B[xX]BAR/ + + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating + strings (because they follow a .* construct). study_chunk will identify + both FOO and BAR as being the longest fixed and floating strings respectively. + + The strings can be composites, for instance + + /(f)(o)(o)/ + + will result in a composite fixed substring 'foo'. + + For each string some basic information is maintained: + + - offset or min_offset + This is the position the string must appear at, or not before. + It also implicitly (when combined with minlenp) tells us how many + character must match before the string we are searching. + Likewise when combined with minlenp and the length of the string + tells us how many characters must appear after the string we have + found. + + - max_offset + Only used for floating strings. This is the rightmost point that + the string can appear at. Ifset to I32 max it indicates that the + string can occur infinitely far to the right. + + - minlenp + A pointer to the minimum length of the pattern that the string + was found inside. This is important as in the case of positive + lookahead or positive lookbehind we can have multiple patterns + involved. Consider + + /(?=FOO).*F/ + + The minimum length of the pattern overall is 3, the minimum length + of the lookahead part is 3, but the minimum length of the part that + will actually match is 1. So 'FOO's minimum length is 3, but the + minimum length for the F is 1. This is important as the minimum length + is used to determine offsets in front of and behind the string being + looked for. Since strings can be composites this is the length of the + pattern at the time it was commited with a scan_commit. Note that + the length is calculated by study_chunk, so that the minimum lengths + are not known until the full pattern has been compiled, thus the + pointer to the value. + + - lookbehind + + In the case of lookbehind the string being searched for can be + offset past the start point of the final matching string. + If this value was just blithely removed from the min_offset it would + invalidate some of the calculations for how many chars must match + before or after (as they are derived from min_offset and minlen and + the length of the string being searched for). + When the final pattern is compiled and the data is moved from the + scan_data_t structure into the regexp structure the information + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the + associated string. + + The fields pos_min and pos_delta are used to store the minimum offset + and the delta to the maximum offset at the current point in the pattern. + +*/ typedef struct scan_data_t { - I32 len_min; - I32 len_delta; + /*I32 len_min; unused */ + /*I32 len_delta; unused */ I32 pos_min; I32 pos_delta; SV *last_found; - I32 last_end; /* min value, <0 unless valid. */ + I32 last_end; /* min value, <0 unless valid. */ I32 last_start_min; I32 last_start_max; - SV **longest; /* Either &l_fixed, or &l_float. */ - SV *longest_fixed; - I32 offset_fixed; - SV *longest_float; - I32 offset_float_min; - I32 offset_float_max; + SV **longest; /* Either &l_fixed, or &l_float. */ + SV *longest_fixed; /* longest fixed string found in pattern */ + I32 offset_fixed; /* offset where it starts */ + I32 *minlen_fixed; /* pointer to the minlen relevent to the string */ + I32 lookbehind_fixed; /* is the position of the string modfied by LB */ + SV *longest_float; /* longest floating string found in pattern */ + I32 offset_float_min; /* earliest point in string it can appear */ + I32 offset_float_max; /* latest point in string it can appear */ + I32 *minlen_float; /* pointer to the minlen relevent to the string */ + I32 lookbehind_float; /* is the position of the string modified by LB */ I32 flags; I32 whilem_c; I32 *last_closep; @@ -202,7 +281,7 @@ typedef struct scan_data_t { */ static const scan_data_t zero_scan_data = - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) #define SF_BEFORE_SEOL 0x0001 @@ -443,6 +522,39 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif +#define DEBUG_STUDYDATA(data,depth) \ +DEBUG_OPTIMISE_r(if(data){ \ + PerlIO_printf(Perl_debug_log, \ + "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \ + " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \ + (int)(depth)*2, "", \ + (IV)((data)->pos_min), \ + (IV)((data)->pos_delta), \ + (IV)((data)->flags), \ + (IV)((data)->whilem_c), \ + (IV)((data)->last_closep ? *((data)->last_closep) : -1) \ + ); \ + if ((data)->last_found) \ + PerlIO_printf(Perl_debug_log, \ + "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ + " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ + SvPVX_const((data)->last_found), \ + (IV)((data)->last_end), \ + (IV)((data)->last_start_min), \ + (IV)((data)->last_start_max), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ + SvPVX_const((data)->longest_fixed), \ + (IV)((data)->offset_fixed), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_float)) ? "*" : "", \ + SvPVX_const((data)->longest_float), \ + (IV)((data)->offset_float_min), \ + (IV)((data)->offset_float_max) \ + ); \ + PerlIO_printf(Perl_debug_log,"\n"); \ +}); + static void clear_re(pTHX_ void *r); /* Mark that we cannot extend a found fixed substring at this point. @@ -450,10 +562,11 @@ static void clear_re(pTHX_ void *r); floating substrings if needed. */ STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data) +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp) { const STRLEN l = CHR_SVLEN(data->last_found); const STRLEN old_l = CHR_SVLEN(*data->longest); + GET_RE_DEBUG_FLAGS_DECL; if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { SvSetMagicSV(*data->longest, data->last_found); @@ -464,6 +577,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data) |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); else data->flags &= ~SF_FIX_BEFORE_EOL; + data->minlen_fixed=minlenp; + data->lookbehind_fixed=0; } else { data->offset_float_min = l ? data->last_start_min : data->pos_min; @@ -477,6 +592,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data) |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); else data->flags &= ~SF_FL_BEFORE_EOL; + data->minlen_float=minlenp; + data->lookbehind_float=0; } } SvCUR_set(data->last_found, 0); @@ -490,6 +607,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data) } data->last_end = -1; data->flags &= ~SF_BEFORE_EOL; + DEBUG_STUDYDATA(data,0); } /* Can match anything (initialization) */ @@ -1784,7 +1902,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs jumper = last; /* XXXX */ if ( !trie->states[trie->startstate].wordnum && trie->bitmap && - ((char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) + ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) { OP( convert ) = TRIEC; Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); @@ -1954,6 +2072,10 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode Next ? (REG_NODE_NUM(Next)) : 0 ); \ }); + + + + #define JOIN_EXACT(scan,min,flags) \ if (PL_regkind[OP(scan)] == EXACT) \ join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) @@ -2109,7 +2231,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags STATIC I32 -S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, +S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, + I32 *minlenp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ @@ -2140,8 +2263,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, while (scan && OP(scan) != END && scan < last) { /* Peephole optimizer: */ + DEBUG_STUDYDATA(data,depth); DEBUG_PEEP("Peep",scan,depth); - JOIN_EXACT(scan,&min,0); /* Follow the next-chain of the current node and optimize @@ -2186,7 +2309,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode * const startbranch=scan; if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - scan_commit(pRExC_state, data); /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ if (flags & SCF_DO_STCLASS) cl_init_zero(pRExC_state, &accum); @@ -2215,7 +2338,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ - minnext = study_chunk(pRExC_state, &scan, &deltanext, + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, next, &data_fake, f,depth+1); if (min1 > minnext) min1 = minnext; @@ -2536,7 +2659,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - scan_commit(pRExC_state, data); + scan_commit(pRExC_state, data, minlenp); } if (UTF) { const U8 * const s = (U8 *)STRING(scan); @@ -2615,7 +2738,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, is_inf = is_inf_internal = 1; scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */ + scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } goto optimize_curly_tail; @@ -2631,7 +2754,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */ + if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -2657,7 +2780,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ - minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data, (mincount == 0 ? (f & ~SCF_DO_SUBSTR) : f),depth+1); @@ -2795,7 +2918,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, } #endif /* Optimize again: */ - study_chunk(pRExC_state, &nxt1, &deltanext, nxt, + study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, NULL, 0,depth+1); } else @@ -2890,7 +3013,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - scan_commit(pRExC_state,data); + scan_commit(pRExC_state,data,minlenp); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -2922,7 +3045,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, continue; default: /* REF and CLUMP only? */ if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state,data); /* Cannot expect anything... */ + scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */ data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; @@ -2936,7 +3059,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, int value = 0; if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state,data); + scan_commit(pRExC_state,data,minlenp); data->pos_min++; } min++; @@ -3141,53 +3264,160 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, /* Lookbehind, or need to calculate parens/evals/stclass: */ && (scan->flags || data || (flags & SCF_DO_STCLASS)) && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { - /* Lookahead/lookbehind */ - I32 deltanext, minnext, fake = 0; - regnode *nscan; - struct regnode_charclass_class intrnl; - int f = 0; - - data_fake.flags = 0; - if (data) { - data_fake.whilem_c = data->whilem_c; - data_fake.last_closep = data->last_closep; - } - else - data_fake.last_closep = &fake; - if ( flags & SCF_DO_STCLASS && !scan->flags - && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); - data_fake.start_class = &intrnl; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - next = regnext(scan); - nscan = NEXTOPER(NEXTOPER(scan)); - minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1); - if (scan->flags) { - if (deltanext) { - vFAIL("Variable length lookbehind not implemented"); + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + || OP(scan) == UNLESSM ) + { + /* Negative Lookahead/lookbehind + In this case we can't do fixed string optimisation. + */ + + I32 deltanext, minnext, fake = 0; + regnode *nscan; + struct regnode_charclass_class intrnl; + int f = 0; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; } - else if (minnext > (I32)U8_MAX) { - vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + else + data_fake.last_closep = &fake; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + cl_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; } - scan->flags = (U8)minnext; - } - if (data) { - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1); + if (scan->flags) { + if (deltanext) { + vFAIL("Variable length lookbehind not implemented"); + } + else if (minnext > (I32)U8_MAX) { + vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + } + scan->flags = (U8)minnext; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (f & SCF_DO_STCLASS_AND) { + const int was = (data->start_class->flags & ANYOF_EOS); + + cl_and(data->start_class, &intrnl); + if (was) + data->start_class->flags |= ANYOF_EOS; + } } - if (f & SCF_DO_STCLASS_AND) { - const int was = (data->start_class->flags & ANYOF_EOS); +#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY + else { + /* Positive Lookahead/lookbehind + In this case we can do fixed string optimisation, + but we must be careful about it. Note in the case of + lookbehind the positions will be offset by the minimum + length of the pattern, something we won't know about + until after the recurse. + */ + I32 deltanext, fake = 0; + regnode *nscan; + struct regnode_charclass_class intrnl; + int f = 0; + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated + minlens when its all done. This was we don't + have to worry about freeing them when we know + they wont be used, which would be a pain. + */ + I32 *minnextp; + Newx( minnextp, 1, I32 ); + SAVEFREEPV(minnextp); + + if (data) { + StructCopy(data, &data_fake, scan_data_t); + if ((flags & SCF_DO_SUBSTR) && data->last_found) { + f |= SCF_DO_SUBSTR; + if (scan->flags) + scan_commit(pRExC_state, &data_fake,minlenp); + data_fake.last_found=newSVsv(data->last_found); + } + } + else + data_fake.last_closep = &fake; + data_fake.flags = 0; + if (is_inf) + data_fake.flags |= SF_IS_INF; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + cl_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1); + if (scan->flags) { + if (deltanext) { + vFAIL("Variable length lookbehind not implemented"); + } + else if (*minnextp > (I32)U8_MAX) { + vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + } + scan->flags = (U8)*minnextp; + } + + *minnextp += min; + + + if (f & SCF_DO_STCLASS_AND) { + const int was = (data->start_class->flags & ANYOF_EOS); + + cl_and(data->start_class, &intrnl); + if (was) + data->start_class->flags |= ANYOF_EOS; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { + if (RExC_rx->minlen<*minnextp) + RExC_rx->minlen=*minnextp; + scan_commit(pRExC_state, &data_fake, minnextp); + SvREFCNT_dec(data_fake.last_found); + + if ( data_fake.minlen_fixed != minlenp ) + { + data->offset_fixed= data_fake.offset_fixed; + data->minlen_fixed= data_fake.minlen_fixed; + data->lookbehind_fixed+= scan->flags; + } + if ( data_fake.minlen_float != minlenp ) + { + data->minlen_float= data_fake.minlen_float; + data->offset_float_min=data_fake.offset_float_min; + data->offset_float_max=data_fake.offset_float_max; + data->lookbehind_float+= scan->flags; + } + } + } + - cl_and(data->start_class, &intrnl); - if (was) - data->start_class->flags |= ANYOF_EOS; } +#endif } else if (OP(scan) == OPEN) { pars++; @@ -3208,7 +3438,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, } else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state,data); + scan_commit(pRExC_state,data,minlenp); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; @@ -3228,7 +3458,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, struct regnode_charclass_class accum; if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - scan_commit(pRExC_state, data); /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ if (flags & SCF_DO_STCLASS) cl_init_zero(pRExC_state, &accum); @@ -3268,7 +3498,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, it. Note this means we need the vestigal unused branches even though they arent otherwise used. */ - minnext = study_chunk(pRExC_state, &scan, &deltanext, + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, (regnode *)nextbranch, &data_fake, f,depth+1); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) @@ -3337,7 +3567,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state,data); /* Cannot expect anything... */ + scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */ data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) @@ -3371,6 +3601,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, cl_and(data->start_class, &and_with); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; + + DEBUG_STUDYDATA(data,depth); + return min; } @@ -3581,7 +3814,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) Newx(r->substrs, 1, struct reg_substr_data); reStudy: - minlen=sawplus=sawopen=0; + r->minlen = minlen = sawplus = sawopen = 0; Zero(r->substrs, 1, struct reg_substr_data); StructCopy(&zero_scan_data, &data, scan_data_t); @@ -3589,12 +3822,11 @@ reStudy: if ( restudied ) { DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); RExC_state=copyRExC_state; - if (data.longest_fixed) + if (data.last_found) { SvREFCNT_dec(data.longest_fixed); - if (data.longest_float) SvREFCNT_dec(data.longest_float); - if (data.last_found) SvREFCNT_dec(data.last_found); + } } else { copyRExC_state=RExC_state; } @@ -3609,7 +3841,8 @@ reStudy: r->reganch |= ROPT_NAUGHTY; scan = r->program + 1; /* First BRANCH. */ - /* XXXX Should not we check for something else? Usually it is OPEN1... */ + /* testing for BRANCH here tells us whether there is "must appear" + data in the pattern. If there is then we can use it for optimisations */ if (OP(scan) != BRANCH) { /* Only one top-level choice. */ I32 fake; STRLEN longest_float_length, longest_fixed_length; @@ -3668,6 +3901,7 @@ reStudy: StructCopy(first,trieop,struct regnode_charclass); trie_op=(regnode *)trieop; } + OP(trie_op)+=2; make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); r->regstclass = trie_op; } @@ -3750,7 +3984,7 @@ reStudy: stclass_flag = 0; data.last_closep = &last_close; - minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */ + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); @@ -3762,21 +3996,28 @@ reStudy: && !RExC_seen_zerolen && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) r->reganch |= ROPT_CHECK_ALL; - scan_commit(pRExC_state, &data); + scan_commit(pRExC_state, &data,&minlen); SvREFCNT_dec(data.last_found); + /* Note that code very similar to this but for anchored string + follows immediately below, changes may need to be made to both. + Be careful. + */ longest_float_length = CHR_SVLEN(data.longest_float); if (longest_float_length || (data.flags & SF_FL_BEFORE_EOL && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags & PMf_MULTILINE)))) { - int t; + || (RExC_flags & PMf_MULTILINE)))) + { + int t,ml; - if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ && data.offset_fixed == data.offset_float_min && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) goto remove_float; /* As in (a)+. */ + /* copy the information about the longest float from the reg_scan_data + over to the program. */ if (SvUTF8(data.longest_float)) { r->float_utf8 = data.longest_float; r->float_substr = NULL; @@ -3784,8 +4025,20 @@ reStudy: r->float_substr = data.longest_float; r->float_utf8 = NULL; } - r->float_min_offset = data.offset_float_min; + /* float_end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = data.minlen_float ? *(data.minlen_float) + : longest_float_length; + r->float_end_shift = ml - data.offset_float_min + - longest_float_length + (SvTAIL(data.longest_float) != 0) + + data.lookbehind_float; + r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; + if (data.offset_float_max < (U32)I32_MAX) /* Don't offset infinity */ + r->float_max_offset -= data.lookbehind_float; + t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FL_BEFORE_MEOL) || (RExC_flags & PMf_MULTILINE))); @@ -3798,13 +4051,20 @@ reStudy: longest_float_length = 0; } + /* Note that code very similar to this but for floating string + is immediately above, changes may need to be made to both. + Be careful. + */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); if (longest_fixed_length || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & PMf_MULTILINE)))) { - int t; + || (RExC_flags & PMf_MULTILINE)))) + { + int t,ml; + /* copy the information about the longest fixed + from the reg_scan_data over to the program. */ if (SvUTF8(data.longest_fixed)) { r->anchored_utf8 = data.longest_fixed; r->anchored_substr = NULL; @@ -3812,7 +4072,17 @@ reStudy: r->anchored_substr = data.longest_fixed; r->anchored_utf8 = NULL; } - r->anchored_offset = data.offset_fixed; + /* fixed_end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = data.minlen_fixed ? *(data.minlen_fixed) + : longest_fixed_length; + r->anchored_end_shift = ml - data.offset_fixed + - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0) + + data.lookbehind_fixed; + r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; + t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) || (RExC_flags & PMf_MULTILINE))); @@ -3849,6 +4119,7 @@ reStudy: /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ if (longest_fixed_length > longest_float_length) { + r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; @@ -3856,10 +4127,11 @@ reStudy: r->reganch |= ROPT_NOSCAN; } else { + r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; - r->check_offset_min = data.offset_float_min; - r->check_offset_max = data.offset_float_max; + r->check_offset_min = r->float_min_offset; + r->check_offset_max = r->float_max_offset; } /* XXXX Currently intuiting is not compatible with ANCH_GPOS. This should be changed ASAP! */ @@ -3868,6 +4140,12 @@ reStudy: if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->reganch |= RE_INTUIT_TAIL; } + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) + if ( (STRLEN)minlen < longest_float_length ) + minlen= longest_float_length; + if ( (STRLEN)minlen < longest_fixed_length ) + minlen= longest_fixed_length; + */ } else { /* Several toplevels. Best we can is to set minlen. */ @@ -3882,7 +4160,7 @@ reStudy: data.start_class = &ch_class; data.last_closep = &last_close; - minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, + minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); CHECK_RESTUDY_GOTO; @@ -3909,7 +4187,11 @@ reStudy: } } - r->minlen = minlen; + /* Guard against an embedded (?=) or (?<=) with a longer minlen than + the "real" pattern. */ + if (r->minlen < minlen) + r->minlen = minlen; + if (RExC_seen & REG_SEEN_GPOS) r->reganch |= ROPT_GPOS_SEEN; if (RExC_seen & REG_SEEN_LOOKBEHIND) @@ -6633,7 +6915,7 @@ Perl_regdump(pTHX_ const regexp *r) if (r->regstclass) { regprop(r, sv, r->regstclass); - PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv)); + PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } if (r->reganch & ROPT_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); @@ -6672,6 +6954,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) #ifdef DEBUGGING dVAR; register int k; + GET_RE_DEBUG_FLAGS_DECL; sv_setpvn(sv, "", 0); if (OP(o) >= reg_num) /* regnode.type is unsigned */ @@ -6697,9 +6980,54 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) ); Perl_sv_catpvf(aTHX_ sv, " %s", s ); } else if (k == TRIE) { - Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]); /* print the details of the trie in dumpuntil instead, as * prog->data isn't available here */ + const char op = OP(o); + const I32 n = ARG(o); + const reg_ac_data * const ac = IS_TRIE_AC(op) ? + (reg_ac_data *)prog->data->data[n] : + NULL; + const reg_trie_data * const trie = !IS_TRIE_AC(op) ? + (reg_trie_data*)prog->data->data[n] : + ac->trie; + + Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]); + DEBUG_TRIE_COMPILE_r( + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->laststate-1, + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ) + ); + if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { + int i; + int rangestart = -1; + U8* bitmap = IS_ANYOF_TRIE(op) ? ANYOF_BITMAP(o) : TRIE_BITMAP(trie); + Perl_sv_catpvf(aTHX_ sv, "["); + for (i = 0; i <= 256; i++) { + if (i < 256 && BITMAP_TEST(bitmap,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) + put_byte(sv, rangestart); + else { + put_byte(sv, rangestart); + sv_catpvs(sv, "-"); + put_byte(sv, i - 1); + } + rangestart = -1; + } + } + Perl_sv_catpvf(aTHX_ sv, "]"); + } + } else if (k == CURLY) { if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ @@ -7214,7 +7542,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, else PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start)); - if (PL_regkind[(U8)op] != TRIE) + /*if (PL_regkind[(U8)op] != TRIE)*/ (void)PerlIO_putc(Perl_debug_log, '\n'); } @@ -7235,51 +7563,17 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, DUMPUNTIL(NEXTOPER(node), next); } else if ( PL_regkind[(U8)op] == TRIE ) { + const char op = OP(node); const I32 n = ARG(node); - const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n]; + const reg_ac_data * const ac = op>=AHOCORASICK ? + (reg_ac_data *)r->data->data[n] : + NULL; + const reg_trie_data * const trie = opdata->data[n] : + ac->trie; const regnode *nextbranch= NULL; I32 word_idx; - - DEBUG_TRIE_COMPILE_r( - PerlIO_printf(Perl_debug_log, - " S:%"UVuf"/%"IVdf" W:%d L:%d/%d C:%d/%d ", - (UV)trie->startstate, - (IV)trie->laststate-1, - (int)trie->wordcount, - (int)trie->minlen, - (int)trie->maxlen, - (int)TRIE_CHARCOUNT(trie), - trie->uniquecharcount - ); - ); - if ( op==TRIEC || trie->bitmap ) { - int i; - int rangestart = -1; - U8* bitmap = op==TRIEC ? (U8*)ANYOF_BITMAP(node) : (U8*)TRIE_BITMAP(trie); - - sv_setpvn(sv, "", 0); - for (i = 0; i <= 256; i++) { - if (i < 256 && BITMAP_TEST(bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; - } - } - PerlIO_printf(Perl_debug_log, "[%s]\n", SvPVX_const(sv)); - } else - PerlIO_printf(Perl_debug_log, "\n"); - - - + sv_setpvn(sv, "", 0); for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie->words,word_idx,0); @@ -7347,7 +7641,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, #ifdef DEBUG_DUMPUNTIL PerlIO_printf(Perl_debug_log, "--- %d\n",indent); #endif - return last ? last : node; + return node; } #endif /* DEBUGGING */ diff --git a/regcomp.h b/regcomp.h index b6f3617..b4f549f 100644 --- a/regcomp.h +++ b/regcomp.h @@ -13,6 +13,7 @@ typedef OP OP_4tree; /* Will be redefined later. */ #define PERL_ENABLE_TRIE_OPTIMISATION 1 #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 +#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 1 #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0 /* @@ -396,6 +397,7 @@ struct reg_substr_datum { I32 max_offset; SV *substr; /* non-utf8 variant */ SV *utf8_substr; /* utf8 variant */ + I32 end_shift; }; struct reg_substr_data { @@ -405,14 +407,19 @@ struct reg_substr_data { #define anchored_substr substrs->data[0].substr #define anchored_utf8 substrs->data[0].utf8_substr #define anchored_offset substrs->data[0].min_offset +#define anchored_end_shift substrs->data[0].end_shift + #define float_substr substrs->data[1].substr #define float_utf8 substrs->data[1].utf8_substr #define float_min_offset substrs->data[1].min_offset #define float_max_offset substrs->data[1].max_offset +#define float_end_shift substrs->data[1].end_shift + #define check_substr substrs->data[2].substr #define check_utf8 substrs->data[2].utf8_substr #define check_offset_min substrs->data[2].min_offset #define check_offset_max substrs->data[2].max_offset +#define check_end_shift substrs->data[2].end_shift @@ -503,6 +510,10 @@ typedef struct _reg_ac_data reg_ac_data; #define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c)) #define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) +#define IS_ANYOF_TRIE(op) ((op)==TRIEC || (op)==AHOCORASICKC) +#define IS_TRIE_AC(op) ((op)>=AHOCORASICK) + + #define BITMAP_BYTE(p, c) (((U8*)p)[(((U8)(c)) >> 3) & 31]) #define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) diff --git a/regcomp.sym b/regcomp.sym index f62b7bf..25da9f7 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -8,11 +8,13 @@ # Add new regops to the end, and do not re-order the existing ops. # -# Exit points +#* Exit points (0,1) + END END, no End of program. SUCCEED END, no Return from a subroutine, basically. -# Anchors: +#* Anchors: (2..13) + BOL BOL, no Match "" at beginning of line. MBOL BOL, no Same, assuming multiline. SBOL BOL, no Same, assuming singleline. @@ -26,7 +28,8 @@ NBOUND NBOUND, no Match "" at any word non-boundary NBOUNDL NBOUND, no Match "" at any word non-boundary GPOS GPOS, no Matches where last m//g left off. -# [Special] alternatives +#* [Special] alternatives: (14..30) + REG_ANY REG_ANY, no Match any one character (except newline). SANY REG_ANY, no Match any one character. CANY REG_ANY, no Match any one byte. @@ -45,7 +48,9 @@ NDIGIT NDIGIT, no Match any non-numeric character NDIGITL NDIGIT, no Match any non-numeric character in locale CLUMP CLUMP, no Match any combining character sequence -# BRANCH The set of branches constituting a single choice are hooked +#* Alternation (31) + +# BRANCH The set of branches constituting a single choice are hooked # together with their "next" pointers, since precedence prevents # anything being concatenated to any individual branch. The # "next" pointer of the last BRANCH in a choice points to the @@ -55,21 +60,27 @@ CLUMP CLUMP, no Match any combining character sequence # BRANCH BRANCH, node Match this alternative, or the next... +#*Back pointer (32) + # BACK Normal "next" pointers all implicitly point forward; BACK # exists to make loop structures possible. # not used BACK BACK, no Match "", "next" ptr points backward. -# Literals +#*Literals (33..35) + EXACT EXACT, sv Match this string (preceded by length). EXACTF EXACT, sv Match this string, folded (prec. by length). EXACTFL EXACT, sv Match this string, folded in locale (w/len). -# Do nothing +#*Do nothing types (36..37) + NOTHING NOTHING,no Match empty string. # A variant of above which delimits a group, thus stops optimizations TAIL NOTHING,no Match empty string. Can jump here from outside. +#*Loops (38..44) + # STAR,PLUS '?', and complex '*' and '+', are implemented as circular # BRANCH structures using BACK. Simple cases (one character # per match) are implemented with STAR and PLUS for speed @@ -87,6 +98,8 @@ CURLYX CURLY, sv 2 Match this complex thing {n,m} times. # This terminator creates a loop structure for CURLYX WHILEM WHILEM, no Do curly processing and see if rest matches. +#*Buffer related (45..49) + # OPEN,CLOSE,GROUPP ...are numbered at compile time. OPEN OPEN, num 1 Mark this point in input as start of #n. CLOSE CLOSE, num 1 Analogous to OPEN. @@ -95,40 +108,57 @@ REF REF, num 1 Match some already matched string REFF REF, num 1 Match already matched string, folded REFFL REF, num 1 Match already matched string, folded in loc. -# grouping assertions +#*Grouping assertions (50..54) + IFMATCH BRANCHJ,off 1 2 Succeeds if the following matches. UNLESSM BRANCHJ,off 1 2 Fails if the following matches. SUSPEND BRANCHJ,off 1 1 "Independent" sub-RE. IFTHEN BRANCHJ,off 1 1 Switch, should be preceeded by switcher . GROUPP GROUPP, num 1 Whether the group matched. -# Support for long RE +#*Support for long RE (55..56) + LONGJMP LONGJMP,off 1 1 Jump far away. BRANCHJ BRANCHJ,off 1 1 BRANCH with long offset. -# The heavy worker +#*The heavy worker (57..58) + EVAL EVAL, evl 1 Execute some Perl code. -# Modifiers +#*Modifiers (59..60) + MINMOD MINMOD, no Next operator is not greedy. LOGICAL LOGICAL,no Next opcode should set the flag only. -# This is not used yet +# This is not used yet (61) RENUM BRANCHJ,off 1 1 Group with independently numbered parens. -# This is not really a node, but an optimized away piece of a "long" node. -# To simplify debugging output, we mark it as if it were a node -OPTIMIZED NOTHING,off Placeholder for dump. +#*Trie Related (62..64) + +# Behave the same as A|LIST|OF|WORDS would. The '..C' variants have +# inline charclass data (ascii only), the 'C' store it in the structure. +# NOTE: the relative order of the TRIE-like regops is signifigant -# Trie Related (behave the same as A|LIST|OF|WORDS would) TRIE TRIE, trie 1 Match many EXACT(FL?)? at once. flags==type TRIEC TRIE, trie charclass Same as TRIE, but with embedded charclass data -# NEW STUFF HERE +# For start classes, contains an added fail table. +AHOCORASICK TRIE, trie 1 Aho Corasick stclass. flags==type +AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charclass data + +# NEW STUFF ABOVE THIS LINE -- Please update counts below. + +#*Special Nodes (65, 66) + +# This is not really a node, but an optimized away piece of a "long" node. +# To simplify debugging output, we mark it as if it were a node +OPTIMIZED NOTHING,off Placeholder for dump. + # Special opcode with the property that no opcode in a compiled program # will ever be of this type. Thus it can be used as a flag value that # no other opcode has been seen. END is used similarly, in that an END # node cant be optimized. So END implies "unoptimizable" and PSEUDO mean # "not seen anything to optimize yet". PSEUDO PSEUDO,off Pseudo opcode for internal use. + # NOTHING BELOW HERE \ No newline at end of file diff --git a/regexec.c b/regexec.c index d82b135..49ea3ea 100644 --- a/regexec.c +++ b/regexec.c @@ -356,7 +356,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, register SV *check; char *strbeg; char *t; - const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */ + const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ I32 ml_anch; register char *other_last = NULL; /* other substr checked before this */ char *check_at = NULL; /* check substr found at this pos */ @@ -374,7 +374,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, } DEBUG_EXECUTE_r( debug_start_match(prog, do_utf8, strpos, strend, - "Guessing start of match for"); + sv ? "Guessing start of match in sv for" + : "Guessing start of match in string for"); ); /* CHR_DIST() would be more correct here but it makes things slow. */ @@ -383,6 +384,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, "String too short... [re_intuit_start]\n")); goto fail; } + strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; PL_regeol = strend; if (do_utf8) { @@ -394,9 +396,9 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, to_byte_substr(prog); check = prog->check_substr; } - if (check == &PL_sv_undef) { + if (check == &PL_sv_undef) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "Non-utf string cannot match utf check string\n")); + "Non-utf8 string cannot match utf8 check string\n")); goto fail; } if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */ @@ -419,6 +421,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, I32 slen; s = HOP3c(strpos, prog->check_offset_min, strend); + if (SvTAIL(check)) { slen = SvCUR(check); /* >= 1 */ @@ -448,8 +451,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, /* Match is anchored, but substr is not anchored wrt beg-of-str. */ s = strpos; start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->minlen - start_shift - - CHR_SVLEN(check) + (SvTAIL(check) != 0); + end_shift = prog->check_end_shift; + if (!ml_anch) { const I32 end = prog->check_offset_max + CHR_SVLEN(check) - (SvTAIL(check) != 0); @@ -462,20 +465,37 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, else { /* Can match at random position */ ml_anch = 0; s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - /* Should be nonnegative! */ - end_shift = prog->minlen - start_shift - - CHR_SVLEN(check) + (SvTAIL(check) != 0); + start_shift = prog->check_offset_min; /* okay to underestimate on CC */ + end_shift = prog->check_end_shift; + + /* end shift should be non negative here */ } #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) - Perl_croak(aTHX_ "panic: end_shift"); + Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", + end_shift,prog->precomp); #endif restart: /* Find a possible match in the region s..strend by looking for the "check" substring in the region corrected by start/end_shift. */ + + { + I32 srch_start_shift = start_shift; + I32 srch_end_shift = end_shift; + if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { + srch_end_shift -= ((strbeg - s) - srch_start_shift); + srch_start_shift = strbeg - s; + } + DEBUG_OPTIMISE_r({ + PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", + (IV)prog->check_offset_min, + (IV)srch_start_shift, + (IV)srch_end_shift, + (IV)prog->check_end_shift); + }); + if (flags & REXEC_SCREAM) { I32 p = -1; /* Internal iterator of scream. */ I32 * const pp = data ? data->scream_pos : &p; @@ -485,7 +505,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, && (BmPREVIOUS(check) == SvCUR(check) - 1) && SvTAIL(check) )) s = screaminstr(sv, check, - start_shift + (s - strbeg), end_shift, pp, 0); + srch_start_shift + (s - strbeg), srch_end_shift, pp, 0); else goto fail_finish; /* we may be pointing at the wrong string */ @@ -494,15 +514,27 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (data) *data->scream_olds = s; } - else if (prog->reganch & ROPT_CANY_SEEN) - s = fbm_instr((U8*)(s + start_shift), - (U8*)(strend - end_shift), - check, multiline ? FBMrf_MULTILINE : 0); - else - s = fbm_instr(HOP3(s, start_shift, strend), - HOP3(strend, -end_shift, strbeg), + else { + U8* start_point; + U8* end_point; + if (prog->reganch & ROPT_CANY_SEEN) { + start_point= (U8*)(s + srch_start_shift); + end_point= (U8*)(strend - srch_end_shift); + } else { + start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); + end_point= HOP3(strend, -srch_end_shift, strbeg); + } + DEBUG_OPTIMISE_r({ + PerlIO_printf(Perl_debug_log, "fbm_instr len=%"IVdf" str=<%.*s>\n", + (int)(end_point - start_point), + (int)(end_point - start_point), + start_point); + }); + + s = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); - + } + } /* Update the count-of-usability, remove useless subpatterns, unshift s. */ @@ -520,12 +552,20 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (!s) goto fail_finish; - - check_at = s; - /* Finish the diagnostic message */ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); + /* XXX dmq: first branch is for positive lookbehind... + Our check string is offset from the beginning of the pattern. + So we need to do any stclass tests offset forward from that + point. I think. :-( + */ + + + + check_at=s; + + /* Got a candidate. Check MBOL anchoring, and the *other* substr. Start with the other substr. XXXX no SCREAM optimization yet - and a very coarse implementation @@ -534,7 +574,9 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, Probably it is right to do no SCREAM here... */ - if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) { + if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) + : (prog->float_substr && prog->anchored_substr)) + { /* Take into account the "other" substring. */ /* XXXX May be hopelessly wrong for UTF... */ if (!other_last) @@ -561,12 +603,17 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); if (last < last1) last1 = last; - /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ + /* XXXX It is not documented what units *_offsets are in. + We assume bytes, but this is clearly wrong. + Meaning this code needs to be carefully reviewed for errors. + dmq. + */ + /* On end-of-str: see comment below. */ must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; if (must == &PL_sv_undef) { s = (char*)NULL; - DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */ + DEBUG_r(must = prog->anchored_utf8); /* for debug */ } else s = fbm_instr( @@ -630,7 +677,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, and end-of-str is not later than strend we are OK. */ if (must == &PL_sv_undef) { s = (char*)NULL; - DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */ + DEBUG_r(must = prog->float_utf8); /* for debug message */ } else s = fbm_instr((unsigned char*)s, @@ -669,11 +716,26 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, } } - t = s - prog->check_offset_max; + + t= HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); + + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(s-strpos), + (IV)(t-strpos), + (IV)(t-s), + (IV)(strend-strpos) + ) + ); + if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ && (!do_utf8 - || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos)) - && t > strpos))) { + || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos)) + && t > strpos))) + { /* Fixed substring is found far enough so that the match cannot start at strpos. */ try_at_offset: @@ -782,6 +844,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, /* Last resort... */ /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ + /* trie stclasses are too expensive to use here, we are better off to + leave it to regmatch itself */ if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) { /* minlen == 0 is possible if regstclass is \b or \B, and the fixed substr is ''$. @@ -795,14 +859,17 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT ? CHR_DIST(str+STR_LEN(prog->regstclass), str) : 1); - const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch) - ? HOP3c(s, (prog->minlen ? cl_l : 0), strend) - : (prog->float_substr || prog->float_utf8 - ? HOP3c(HOP3c(check_at, -start_shift, strbeg), - cl_l, strend) - : strend); - /*if (OP(prog->regstclass) == TRIE) - endpos++;*/ + char * endpos; + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) + endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) + endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); + else + endpos= strend; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n", + start_shift,check_at-strbeg,s-strbeg,endpos-strbeg)); + t = s; s = find_byclass(prog, prog->regstclass, s, endpos, NULL); if (!s) { @@ -1329,8 +1396,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, !isDIGIT_LC_utf8((U8*)s), !isDIGIT_LC(*s) ); - case TRIEC: - case TRIE: + case AHOCORASICKC: + case AHOCORASICK: { const enum { trie_plain, trie_utf8, trie_utf8_fold } trie_type = do_utf8 ? @@ -1369,7 +1436,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, SvPOK_on(sv_points); sv_2mortal(sv_points); points=(U8**)SvPV_nolen(sv_points ); - if ( trie_type != trie_utf8_fold && (trie->bitmap || OP(c)==TRIEC) ) { + if ( trie_type != trie_utf8_fold + && (trie->bitmap || OP(c)==AHOCORASICKC) ) + { if (trie->bitmap) bitmap=(U8*)trie->bitmap; else @@ -1412,19 +1481,21 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, int failed=0; U32 word = aho->states[ state ].wordnum; - if( state==1 && bitmap ) { - DEBUG_TRIE_EXECUTE_r( - if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { - dump_exec_pos( (char *)uc, c, strend, real_start, - (char*)uc, do_utf8 ); - PerlIO_printf( Perl_debug_log, - " Scanning for legal start char...\n"); + if( state==1 ) { + if ( bitmap ) { + DEBUG_TRIE_EXECUTE_r( + if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + dump_exec_pos( (char *)uc, c, strend, real_start, + (char *)uc, do_utf8 ); + PerlIO_printf( Perl_debug_log, + " Scanning for legal start char...\n"); + } + ); + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; } - ); - while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { - uc++; + s= (char *)uc; } - s= (char *)uc; if (uc >(U8*)last_start) break; } @@ -1585,14 +1656,20 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * reginfo.prog = prog; RX_MATCH_UTF8_set(prog, do_utf8); + DEBUG_EXECUTE_r( + debug_start_match(prog, do_utf8, startpos, strend, + "Matching"); + ); minlen = prog->minlen; - if (strend - startpos < minlen) { + + if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too short [regexec_flags]...\n")); goto phooey; } + /* Check validity of program. */ if (UCHARAT(prog->program) != REG_MAGIC) { Perl_croak(aTHX_ "corrupted regexp program"); @@ -1651,10 +1728,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } } - DEBUG_EXECUTE_r( - debug_start_match(prog, do_utf8, startpos, strend, - "Matching"); - ); + /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ @@ -1768,14 +1842,19 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * back_max = prog->float_max_offset; back_min = prog->float_min_offset; } + + if (must == &PL_sv_undef) /* could not downgrade utf8 check substring, so must fail */ goto phooey; - last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) - - (SvTAIL(must) != 0) + back_min), strbeg); - + if (back_min<0) { + last = strend; + } else { + last = HOP3c(strend, /* Cannot start after this */ + -(I32)(CHR_SVLEN(must) + - (SvTAIL(must) != 0) + back_min), strbeg); + } if (s > PL_bostr) last1 = HOPc(s, -1); else @@ -1788,9 +1867,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * strend = HOPc(strend, -dontbother); while ( (s <= last) && ((flags & REXEC_SCREAM) - ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg, + ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg, end_shift, &scream_pos, 0)) - : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend), + : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), (unsigned char*)strend, must, multiline ? FBMrf_MULTILINE : 0))) ) { /* we may be pointing at the wrong string */ @@ -2352,9 +2431,10 @@ S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8, PL_colors[4], blurb, PL_colors[5], s0, s1); if (do_utf8||utf8_pat) - PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n", - !do_utf8 ? "pattern" : !utf8_pat ? "string" : - "pattern and string" + PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", + utf8_pat ? "pattern" : "", + utf8_pat && do_utf8 ? " and " : "", + do_utf8 ? "string" : "" ); } } @@ -2396,15 +2476,16 @@ S_dump_exec_pos(pTHX_ const char *locinput, const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0; RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), - (locinput - pref_len),pref0_len, pref0_len, 4, 5); + (locinput - pref_len),pref0_len, 60, 4, 5); RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), (locinput - pref_len + pref0_len), - pref_len - pref0_len, pref_len - pref0_len, 2, 3); + pref_len - pref0_len, 60, 2, 3); RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), - locinput, loc_regeol - locinput, l, 0, 1); + locinput, loc_regeol - locinput, 10, 0, 1); + const STRLEN tlen=len0+len1+len2; PerlIO_printf(Perl_debug_log, "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", (IV)(locinput - loc_bostr), @@ -2412,7 +2493,7 @@ S_dump_exec_pos(pTHX_ const char *locinput, len1, s1, (docolor ? "" : "> <"), len2, s2, - 15 - l - pref_len + 1, + tlen > 19 ? 0 : 19 - tlen, ""); } } @@ -2488,6 +2569,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) DEBUG_EXECUTE_r( { SV * const prop = sv_newmortal(); + regnode *rnext=regnext(scan); DUMP_EXEC_POS( locinput, scan, do_utf8 ); regprop(rex, prop, scan); @@ -2495,7 +2577,8 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) "%3"IVdf":%*s%s(%"IVdf")\n", (IV)(scan - rex->program), PL_regindent*2, "", SvPVX_const(prop), - PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program)); + (PL_regkind[OP(scan)] == END || !rnext) ? + 0 : (IV)(rnext - rex->program)); }); next = scan + NEXT_OFF(scan); @@ -3332,11 +3415,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) } /* run the pattern returned from (??{...}) */ - DEBUG_EXECUTE_r( - debug_start_match(re, do_utf8, locinput, PL_regeol, - "Matching embedded"); - ); - ST.cp = regcppush(0); /* Save *all* the positions. */ REGCP_SET(ST.lastcp); *PL_reglastparen = 0; @@ -3354,6 +3432,10 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) rex = re; ST.B = next; + DEBUG_EXECUTE_r( + debug_start_match(re, do_utf8, locinput, PL_regeol, + "Matching embedded"); + ); /* now continue from first node in postoned RE */ PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1); /* NOTREACHED */ @@ -3383,7 +3465,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) /* continue at the node following the (??{...}) */ scan = ST.B; continue; - case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */ /* Restore state to the outer re then re-throw the failure */ if (ST.toggleutf) @@ -4443,7 +4524,6 @@ yes_final: if (yes_state) { /* we have successfully completed a subexpression, but we must now * pop to the state marked by yes_state and continue from there */ - assert(st != yes_state); while (yes_state < SLAB_FIRST(PL_regmatch_slab) || yes_state > SLAB_LAST(PL_regmatch_slab)) @@ -4461,9 +4541,9 @@ yes_final: PL_regmatch_state = st; switch (st->resume_state) { + case EVAL_A: case IFMATCH_A: case CURLYM_A: - case EVAL_A: state_num = st->resume_state; goto reenter_switch; @@ -5064,21 +5144,43 @@ S_reghop3(U8 *s, I32 off, const U8* lim) } } else { - while (off++) { - if (s > lim) { - s--; - if (UTF8_IS_CONTINUED(*s)) { - while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s)) - s--; - } - /* XXX could check well-formedness here */ + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; } + /* XXX could check well-formedness here */ } } return s; } STATIC U8 * +S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +{ + dVAR; + if (off >= 0) { + while (off-- && s < rlim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > llim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > llim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + + +STATIC U8 * S_reghopmaybe3(U8* s, I32 off, const U8* lim) { dVAR; @@ -5091,17 +5193,13 @@ S_reghopmaybe3(U8* s, I32 off, const U8* lim) return NULL; } else { - while (off++) { - if (s > lim) { - s--; - if (UTF8_IS_CONTINUED(*s)) { - while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s)) - s--; - } - /* XXX could check well-formedness here */ + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; } - else - break; + /* XXX could check well-formedness here */ } if (off <= 0) return NULL; diff --git a/regnodes.h b/regnodes.h index d8e3006..01985f5 100644 --- a/regnodes.h +++ b/regnodes.h @@ -65,11 +65,13 @@ #define MINMOD 58 /* 0x3a Next operator is not greedy. */ #define LOGICAL 59 /* 0x3b Next opcode should set the flag only. */ #define RENUM 60 /* 0x3c Group with independently numbered parens. */ -#define OPTIMIZED 61 /* 0x3d Placeholder for dump. */ -#define TRIE 62 /* 0x3e Match many EXACT(FL?)? at once. flags==type */ -#define TRIEC 63 /* 0x3f Same as TRIE, but with embedded charclass data */ -#define PSEUDO 64 /* 0x40 Pseudo opcode for internal use. */ -#define REGNODE_MAX 64 +#define TRIE 61 /* 0x3d Match many EXACT(FL?)? at once. flags==type */ +#define TRIEC 62 /* 0x3e Same as TRIE, but with embedded charclass data */ +#define AHOCORASICK 63 /* 0x3f Aho Corasick stclass. flags==type */ +#define AHOCORASICKC 64 /* 0x40 Same as AHOCORASICK, but with embedded charclass data */ +#define OPTIMIZED 65 /* 0x41 Placeholder for dump. */ +#define PSEUDO 66 /* 0x42 Pseudo opcode for internal use. */ +#define REGNODE_MAX 66 #ifndef DOINIT EXTCONST U8 PL_regkind[]; @@ -136,9 +138,11 @@ EXTCONST U8 PL_regkind[] = { MINMOD, /* MINMOD */ LOGICAL, /* LOGICAL */ BRANCHJ, /* RENUM */ - NOTHING, /* OPTIMIZED */ TRIE, /* TRIE */ TRIE, /* TRIEC */ + TRIE, /* AHOCORASICK */ + TRIE, /* AHOCORASICKC */ + NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ }; #endif @@ -207,9 +211,11 @@ static const U8 regarglen[] = { 0, /* MINMOD */ 0, /* LOGICAL */ EXTRA_SIZE(struct regnode_1), /* RENUM */ - 0, /* OPTIMIZED */ EXTRA_SIZE(struct regnode_1), /* TRIE */ EXTRA_SIZE(struct regnode_charclass), /* TRIEC */ + EXTRA_SIZE(struct regnode_1), /* AHOCORASICK */ + EXTRA_SIZE(struct regnode_charclass), /* AHOCORASICKC */ + 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -275,9 +281,11 @@ static const char reg_off_by_arg[] = { 0, /* MINMOD */ 0, /* LOGICAL */ 1, /* RENUM */ - 0, /* OPTIMIZED */ 0, /* TRIE */ 0, /* TRIEC */ + 0, /* AHOCORASICK */ + 0, /* AHOCORASICKC */ + 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -344,13 +352,15 @@ static const char * const reg_name[] = { "MINMOD", /* 0x3a */ "LOGICAL", /* 0x3b */ "RENUM", /* 0x3c */ - "OPTIMIZED", /* 0x3d */ - "TRIE", /* 0x3e */ - "TRIEC", /* 0x3f */ - "PSEUDO", /* 0x40 */ + "TRIE", /* 0x3d */ + "TRIEC", /* 0x3e */ + "AHOCORASICK", /* 0x3f */ + "AHOCORASICKC", /* 0x40 */ + "OPTIMIZED", /* 0x41 */ + "PSEUDO", /* 0x42 */ }; -static const int reg_num = 65; +static const int reg_num = 67; #endif /* DEBUGGING */ #endif /* REG_COMP_C */ diff --git a/sv.c b/sv.c index 82c5b23..53256b8 100644 --- a/sv.c +++ b/sv.c @@ -9508,6 +9508,7 @@ Perl_re_dup(pTHX_ const REGEXP *r, CLONE_PARAMS *param) for (s = ret->substrs->data, i = 0; i < 3; i++, s++) { s->min_offset = r->substrs->data[i].min_offset; s->max_offset = r->substrs->data[i].max_offset; + s->end_shift = r->substrs->data[i].end_shift; s->substr = sv_dup_inc(r->substrs->data[i].substr, param); s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param); } diff --git a/t/op/re_tests b/t/op/re_tests index f8ee725..3514144 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -990,3 +990,17 @@ X(?!b+(?!(c+)*(?!(c+)*d))).*X aXbbbbbbbcccccccccccccaaaX y - - ^([TUV]+|XXX|YYY|Z.Q*X|Z[TE]Q*P|[MKJ]): ZEQQQX: y $1 ZEQQQX X(?:ABCF[cC]x*|ABCD|ABCF):(?:DIT|DID|DIM) XABCFCxxxxxxxxxx:DIM y $& XABCFCxxxxxxxxxx:DIM (((ABCD|ABCE|ABCF)))(A|B|C[xy]*): ABCFCxxxxxxxxxx:DIM y $& ABCFCxxxxxxxxxx: +(?=foo) foo y pos 0 +(?=foo) XfooY y pos 1 +.*(?=foo) XfooY y pos 1 +(?<=foo) foo y pos 3 +(?<=foo) XfooY y pos 4 +.*(?<=foo) foo y pos 3 +.*(?<=foo) XfooY y pos 4 +(?<=foo)Y XfooY y pos 5 +o(?<=foo)Y ..XfooY.. y pos 7 +X(?=foo)f ..XfooY.. y pos 4 +X(?=foo) ..XfooY.. y pos 3 +X(?<=foo.)[YZ] ..XfooXY.. y pos 8 +(?=XY*foo) Xfoo y pos 0 +^(?=XY*foo) Xfoo y pos 0 diff --git a/t/op/regexp.t b/t/op/regexp.t index 662be92..678a46a 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -72,23 +72,33 @@ while () { $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); $reason = 'skipping $&' if $reason eq '' && $skip_amp; $result =~ s/B//i unless $skip; - for $study ('', 'study \$subject') { + + for $study ('', 'study $subject') { $c = $iters; - if ($qr_embed) { - eval qq" - my \$RE = qr$pat; - $study; - \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; - \$got = \"$repl\"; - "; - } - else { - eval qq" - $study; - \$match = (\$subject =~ $OP$pat) while \$c--; - \$got = \"$repl\"; - "; - } + if ($repl eq 'pos') { + $code= < `$err'\n"; next TEST } @@ -99,14 +109,14 @@ while () { next TEST; } elsif ($@) { - print "not ok $. $input => error `$err'\n"; next TEST; + print "not ok $. $input => error `$err'\n$code\n$@\n"; next TEST; } elsif ($result eq 'n') { if ($match) { print "not ok $. ($study) $input => false positive\n"; next TEST } } else { if (!$match || $got ne $expect) { - print "not ok $. ($study) $input => `$got', match=$match\n"; + print "not ok $. ($study) $input => `$got', match=$match\n$code\n"; next TEST; } }