From: Yves Orton Date: Mon, 30 Oct 2006 19:15:13 +0000 (+0200) Subject: The first patch from: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=40d049e43280582bb162c511cc362c246f19862a;p=p5sagit%2Fp5-mst-13.2.git The first patch from: Subject: [PATCH] regex engine optimiser should grok subroutine patterns, and, name subroutine regops more intuitively Message-ID: <9b18b3110610300915x3abf6cddu9c2071a70bea48e1@mail.gmail.com> p4raw-id: //depot/perl@29161 --- diff --git a/embed.fnc b/embed.fnc index ea5450d..d3a1e27 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1331,6 +1331,8 @@ Esn |void |cl_or |NN const struct RExC_state_t* state|NN struct regnode_charcla Es |I32 |study_chunk |NN struct RExC_state_t* state|NN regnode **scanp \ |NN I32 *minlenp|NN I32 *deltap \ |NN regnode *last|NULLOK struct scan_data_t *data \ + |I32 stopparen|NULLOK U8* recursed \ + |NULLOK struct regnode_charclass_class *and_withp \ |U32 flags|U32 depth EsRn |I32 |add_data |NN struct RExC_state_t* state|I32 n|NN const char *s rs |void |re_croak2 |NN const char* pat1|NN const char* pat2|... diff --git a/embed.h b/embed.h index bc884d7..e6b1af3 100644 --- a/embed.h +++ b/embed.h @@ -3531,7 +3531,7 @@ #define cl_init_zero S_cl_init_zero #define cl_and S_cl_and #define cl_or S_cl_or -#define study_chunk(a,b,c,d,e,f,g,h) S_study_chunk(aTHX_ a,b,c,d,e,f,g,h) +#define study_chunk(a,b,c,d,e,f,g,h,i,j,k) S_study_chunk(aTHX_ a,b,c,d,e,f,g,h,i,j,k) #define add_data S_add_data #endif #ifdef PERL_CORE diff --git a/proto.h b/proto.h index 520c86e..951ee4d 100644 --- a/proto.h +++ b/proto.h @@ -3632,7 +3632,7 @@ STATIC void S_cl_or(const struct RExC_state_t* state, struct regnode_charclass_c __attribute__nonnull__(2) __attribute__nonnull__(3); -STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t* state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, U32 flags, U32 depth) +STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t* state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U8* recursed, struct regnode_charclass_class *and_withp, U32 flags, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3) diff --git a/regcomp.c b/regcomp.c index be8be1b..8534b0c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -117,10 +117,14 @@ typedef struct RExC_state_t { I32 extralen; I32 seen_zerolen; I32 seen_evals; - regnode **parens; /* offsets of each paren */ + regnode **open_parens; /* pointers to open parens */ + regnode **close_parens; /* pointers to close parens */ + regnode *opend; /* END node in program */ I32 utf8; HV *charnames; /* cache of named sequences */ HV *paren_names; /* Paren names */ + regnode **recurse; /* Recurse regops */ + I32 recurse_count; /* Number of recurse regops */ #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -153,8 +157,12 @@ typedef struct RExC_state_t { #define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) #define RExC_charnames (pRExC_state->charnames) -#define RExC_parens (pRExC_state->parens) +#define RExC_open_parens (pRExC_state->open_parens) +#define RExC_close_parens (pRExC_state->close_parens) +#define RExC_opend (pRExC_state->opend) #define RExC_paren_names (pRExC_state->paren_names) +#define RExC_recurse (pRExC_state->recurse) +#define RExC_recurse_count (pRExC_state->recurse_count) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ @@ -182,6 +190,14 @@ typedef struct RExC_state_t { #endif + +#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] +#define PBITVAL(paren) (1 << ((paren) & 7)) +#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) +#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) +#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) + + /* About scan_data_t. During optimisation we recurse through the regexp program performing @@ -668,6 +684,8 @@ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_charclass_class *and_with) { + + assert(and_with->type == ANYOF); if (!(and_with->flags & ANYOF_CLASS) && !(cl->flags & ANYOF_CLASS) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) @@ -2261,15 +2279,27 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set to the position after last scanned or to NULL. */ - +#define INIT_AND_WITHP \ + assert(!and_withp); \ + Newx(and_withp,1,struct regnode_charclass_class); \ + SAVEFREEPV(and_withp) STATIC I32 -S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, +S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, - regnode *last, scan_data_t *data, U32 flags, U32 depth) + regnode *last, + scan_data_t *data, + I32 stopparen, + U8* recursed, + struct regnode_charclass_class *and_withp, + U32 flags, U32 depth) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ /* last: Stop before this one. */ + /* data: string data about the pattern */ + /* stopparen: treat close N as END */ + /* recursed: which subroutines have we recursed into */ + /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; I32 min = 0, pars = 0, code; @@ -2279,17 +2309,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; - struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */ SV *re_trie_maxbuff = NULL; regnode *first_non_open = scan; - - GET_RE_DEBUG_FLAGS_DECL; #ifdef DEBUGGING - StructCopy(&zero_scan_data, &data_fake, scan_data_t); + StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif + if ( depth == 0 ) { - while (first_non_open && OP(first_non_open) == OPEN) + while (first_non_open && OP(first_non_open) == OPEN) first_non_open=regnext(first_non_open); } @@ -2352,7 +2380,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, num++; data_fake.flags = 0; - if (data) { + if (data) { data_fake.whilem_c = data->whilem_c; data_fake.last_closep = data->last_closep; } @@ -2372,7 +2400,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* we suppose the run is continuous, last=next...*/ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - next, &data_fake, f,depth+1); + next, &data_fake, + stopparen, recursed, NULL, f,depth+1); if (min1 > minnext) min1 = minnext; if (max1 < minnext + deltanext) @@ -2405,7 +2434,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_OR) { cl_or(pRExC_state, data->start_class, &accum); if (min1) { - cl_and(data->start_class, &and_with); + cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -2417,7 +2446,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { /* Switch to OR mode: cache the old value of * data->start_class */ - StructCopy(data->start_class, &and_with, + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, struct regnode_charclass_class); flags &= ~SCF_DO_STCLASS_AND; StructCopy(&accum, data->start_class, @@ -2683,7 +2713,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else data->start_class->flags |= ANYOF_UNICODE_ALL; data->start_class->flags &= ~ANYOF_EOS; - cl_and(data->start_class, &and_with); + cl_and(data->start_class, and_withp); } flags &= ~SCF_DO_STCLASS; } @@ -2731,7 +2761,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; } - cl_and(data->start_class, &and_with); + cl_and(data->start_class, and_withp); } flags &= ~SCF_DO_STCLASS; } @@ -2778,8 +2808,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } goto optimize_curly_tail; case CURLY: - mincount = ARG1(scan); - maxcount = ARG2(scan); + if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) + && (scan->flags == stopparen)) + { + mincount = 1; + maxcount = 1; + } else { + mincount = ARG1(scan); + maxcount = ARG2(scan); + } next = regnext(scan); if (OP(scan) == CURLYX) { I32 lp = (data ? *(data->last_closep) : 0); @@ -2815,7 +2852,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data, + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed, NULL, (mincount == 0 ? (f & ~SCF_DO_SUBSTR) : f),depth+1); @@ -2828,7 +2866,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of * data->start_class */ - StructCopy(data->start_class, &and_with, + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, struct regnode_charclass_class); flags &= ~SCF_DO_STCLASS_AND; StructCopy(&this_class, data->start_class, @@ -2839,7 +2878,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { cl_or(pRExC_state, data->start_class, &this_class); - cl_and(data->start_class, &and_with); + cl_and(data->start_class, and_withp); } else if (flags & SCF_DO_STCLASS_AND) cl_and(data->start_class, &this_class); @@ -2889,10 +2928,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, nxt = regnext(nxt); if (OP(nxt) != CLOSE) goto nogo; + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ + } /* Now we know that nxt2 is the only contents: */ oscan->flags = (U8)ARG(nxt); OP(oscan) = CURLYN; OP(nxt1) = NOTHING; /* was OPEN. */ + #ifdef DEBUGGING OP(nxt1 + 1) = OPTIMIZED; /* was count. */ NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ @@ -2929,8 +2973,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (OP(nxt) != CLOSE) FAIL("Panic opt close"); oscan->flags = (U8)ARG(nxt); + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ + } OP(nxt1) = OPTIMIZED; /* was OPEN. */ OP(nxt) = OPTIMIZED; /* was CLOSE. */ + #ifdef DEBUGGING OP(nxt1 + 1) = OPTIMIZED; /* was count. */ OP(nxt + 1) = OPTIMIZED; /* was count. */ @@ -2954,7 +3003,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif /* Optimize again: */ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, 0,depth+1); + NULL, stopparen, recursed, NULL, 0,depth+1); } else oscan->flags = 0; @@ -3286,7 +3335,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, &and_with); + cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -3328,7 +3377,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, recursed, NULL, f, depth+1); if (scan->flags) { if (deltanext) { vFAIL("Variable length lookbehind not implemented"); @@ -3400,8 +3450,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1); + + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, + last, &data_fake, stopparen, recursed, NULL, f,depth+1); if (scan->flags) { if (deltanext) { vFAIL("Variable length lookbehind not implemented"); @@ -3411,17 +3462,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } scan->flags = (U8)*minnextp; } - + *minnextp += min; - - + if (f & SCF_DO_STCLASS_AND) { const int was = (data->start_class->flags & ANYOF_EOS); cl_and(data->start_class, &intrnl); if (was) data->start_class->flags |= ANYOF_EOS; - } + } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; @@ -3455,9 +3505,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif } else if (OP(scan) == OPEN) { - pars++; + if (stopparen != (I32)ARG(scan)) + pars++; } else if (OP(scan) == CLOSE) { + if (stopparen == (I32)ARG(scan)) { + break; + } if ((I32)ARG(scan) == is_par) { next = regnext(scan); @@ -3467,16 +3521,73 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data) *(data->last_closep) = ARG(scan); } + else if (OP(scan) == RECURSE || OP(scan) == SRECURSE) { + /* set the pointer */ + I32 paren; + regnode *start; + regnode *end; + if (OP(scan) == RECURSE) { + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = RExC_open_parens[paren-1]; + end = RExC_close_parens[paren-1]; + } else { + paren = 0; + start = RExC_rx->program + 1; + end = RExC_opend; + } + assert(start); + assert(end); + if (!recursed) { + Newxz(recursed, (((RExC_npar)>>3) +1), U8); + SAVEFREEPV(recursed); + } + if (!PAREN_TEST(recursed,paren+1)) { + I32 deltanext = 0; + PAREN_SET(recursed,paren+1); + + DEBUG_PEEP("goto",start,depth); + min += study_chunk( + pRExC_state, + &start, + minlenp, + &deltanext, + end+1, + data, + paren, + recursed, + and_withp, + flags,depth+1); + delta+=deltanext; + if (deltanext == I32_MAX) { + is_inf = is_inf_internal = 1; + delta=deltanext; + } + DEBUG_PEEP("rtrn",end,depth); + PAREN_UNSET(recursed,paren+1); + } else { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state,data,minlenp); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + cl_anything(pRExC_state, data->start_class); + flags &= ~SCF_DO_STCLASS; + } + } else if (OP(scan) == EVAL) { if (data) data->flags |= SF_HAS_EVAL; } - else if ( (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ - || OP(scan)==RECURSE) /* recursion */ - { - if (OP(scan)==RECURSE) { - ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan ); - } + else if ( OP(scan)==OPFAIL ) { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state,data,minlenp); + flags &= ~SCF_DO_SUBSTR; + } + } + else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ + { if (flags & SCF_DO_SUBSTR) { scan_commit(pRExC_state,data,minlenp); data->longest = &(data->longest_float); @@ -3487,10 +3598,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, flags &= ~SCF_DO_STCLASS; } #ifdef TRIE_STUDY_OPT -#ifdef FULL_TRIE_STUDY +#ifdef FULL_TRIE_STUDY else if (PL_regkind[OP(scan)] == TRIE) { /* NOTE - There is similar code to this block above for handling - BRANCH nodes on the initial study. If you change stuff here + BRANCH nodes on the initial study. If you change stuff here check there too. */ regnode *trie_node= scan; regnode *tail= regnext(scan); @@ -3539,8 +3650,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, it. Note this means we need the vestigal unused branches even though they arent otherwise used. */ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - (regnode *)nextbranch, &data_fake, f,depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed, NULL, f,depth+1); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode*)nextbranch); @@ -3575,7 +3687,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_OR) { cl_or(pRExC_state, data->start_class, &accum); if (min1) { - cl_and(data->start_class, &and_with); + cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -3587,7 +3699,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { /* Switch to OR mode: cache the old value of * data->start_class */ - StructCopy(data->start_class, &and_with, + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, struct regnode_charclass_class); flags &= ~SCF_DO_STCLASS_AND; StructCopy(&accum, data->start_class, @@ -3639,7 +3752,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, &and_with); + cl_and(data->start_class, and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; @@ -3806,8 +3919,12 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_emit = &PL_regdummy; RExC_whilem_seen = 0; RExC_charnames = NULL; - RExC_parens = NULL; + RExC_open_parens = NULL; + RExC_close_parens = NULL; + RExC_opend = NULL; RExC_paren_names = NULL; + RExC_recurse = NULL; + RExC_recurse_count = 0; #if 0 /* REGC() is (currently) a NOP at the first pass. * Clever compilers notice this and complain. --jhi */ @@ -3865,8 +3982,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->paren_names = 0; if (RExC_seen & REG_SEEN_RECURSE) { - Newxz(RExC_parens, RExC_npar,regnode *); - SAVEFREEPV(RExC_parens); + Newxz(RExC_open_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_open_parens); + Newxz(RExC_close_parens,RExC_npar,regnode *); + SAVEFREEPV(RExC_close_parens); } /* Useful during FAIL. */ @@ -3899,11 +4018,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); + if (RExC_recurse_count) { + Newxz(RExC_recurse,RExC_recurse_count,regnode *); + SAVEFREEPV(RExC_recurse); + } reStudy: r->minlen = minlen = sawplus = sawopen = 0; Zero(r->substrs, 1, struct reg_substr_data); - StructCopy(&zero_scan_data, &data, scan_data_t); #ifdef TRIE_STUDY_OPT if ( restudied ) { @@ -3914,9 +4036,13 @@ reStudy: SvREFCNT_dec(data.longest_float); SvREFCNT_dec(data.last_found); } + StructCopy(&zero_scan_data, &data, scan_data_t); } else { + StructCopy(&zero_scan_data, &data, scan_data_t); copyRExC_state=RExC_state; } +#else + StructCopy(&zero_scan_data, &data, scan_data_t); #endif /* Dig out information for optimizations. */ @@ -4073,7 +4199,8 @@ reStudy: data.last_closep = &last_close; minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ - &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); + &data, -1, NULL, NULL, + SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); CHECK_RESTUDY_GOTO; @@ -4249,7 +4376,7 @@ reStudy: data.last_closep = &last_close; minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); + &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); CHECK_RESTUDY_GOTO; @@ -4293,6 +4420,12 @@ reStudy: else r->paren_names = NULL; + if (RExC_recurse_count) { + for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { + const regnode *scan = RExC_recurse[RExC_recurse_count-1]; + ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); + } + } Newxz(r->startp, RExC_npar, I32); Newxz(r->endp, RExC_npar, I32); @@ -4645,7 +4778,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; vFAIL("Reference to nonexistent group"); } - ARG2L_SET( ret, 0); + ARG2L_SET( ret, RExC_recurse_count++); RExC_emit++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); @@ -4930,9 +5063,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, OPEN, parno); if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); - RExC_parens[parno-1]= ret; - + "Setting open paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ret))); + RExC_open_parens[parno-1]= ret; } Set_Node_Length(ret, 1); /* MJD */ Set_Node_Offset(ret, RExC_parse); /* MJD */ @@ -4999,6 +5132,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: ender = reganode(pRExC_state, CLOSE, parno); + if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting close paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ender))); + RExC_close_parens[parno-1]= ender; + } Set_Node_Offset(ender,RExC_parse+1); /* MJD */ Set_Node_Length(ender,1); /* MJD */ break; @@ -5013,6 +5152,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 0: ender = reg_node(pRExC_state, END); + if (!SIZE_ONLY) { + assert(!RExC_opend); /* there can only be one! */ + RExC_opend = ender; + } break; } REGTAIL(pRExC_state, lastbr, ender); @@ -7399,14 +7542,25 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) src = RExC_emit; RExC_emit += size; dst = RExC_emit; - if (RExC_parens) { + if (RExC_open_parens) { int paren; + DEBUG_PARSE_FMT("inst"," - %d",RExC_npar); for ( paren=0 ; paren < RExC_npar ; paren++ ) { - if ( RExC_parens[paren] >= src ) - RExC_parens[paren] += size; - } + if ( RExC_open_parens[paren] >= opnd ) { + DEBUG_PARSE_FMT("open"," - %d",size); + RExC_open_parens[paren] += size; + } else { + DEBUG_PARSE_FMT("open"," - %s","ok"); + } + if ( RExC_close_parens[paren] >= opnd ) { + DEBUG_PARSE_FMT("close"," - %d",size); + RExC_close_parens[paren] += size; + } else { + DEBUG_PARSE_FMT("close"," - %s","ok"); + } + } } - + while (src > opnd) { StructCopy(--src, --dst, regnode); if (RExC_offsets) { /* MJD 20010112 */ @@ -8494,9 +8648,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, : "???" ); if (trie->jump) { - U16 dist = trie->jump[word_idx+1]; + U16 dist= trie->jump[word_idx+1]; PerlIO_printf(Perl_debug_log, "(%u)\n", - (dist ? this_trie + dist : next) - start); + (dist ? this_trie + dist : next) - start); if (dist) { if (!nextbranch) nextbranch = this_trie + trie->jump[0]; diff --git a/regcomp.sym b/regcomp.sym index 73e27a8..a61f172 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -98,9 +98,8 @@ STAR STAR, node Match this (simple) thing 0 or more times. PLUS PLUS, node Match this (simple) thing 1 or more times. CURLY CURLY, sv 2 Match this simple thing {n,m} times. -CURLYN CURLY, no 2 Match next-after-this simple thing -# {n,m} times, set parenths. -CURLYM CURLY, no 2 Match this medium-complex thing {n,m} times. +CURLYN CURLY, no 2 Capture next-after-this simple thing +CURLYM CURLY, no 2 Capture this medium-complex thing {n,m} times. CURLYX CURLY, sv 2 Match this complex thing {n,m} times. # This terminator creates a loop structure for CURLYX @@ -156,7 +155,7 @@ AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charc #*Recursion (65..66) RECURSE RECURSE, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2 -SRECURSE RECURSE, no recurse to start of pattern +SRECURSE SRECURSE, no recurse to start of pattern #*Named references (67..69) NREF NREF, no-sv 1 Match some already matched string diff --git a/regexec.c b/regexec.c index 17d0e01..d57fd35 100644 --- a/regexec.c +++ b/regexec.c @@ -331,7 +331,11 @@ Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *stren RExen without fixed substrings. Similarly, it is assumed that lengths of all the strings are no more than minlen, thus they cannot come from lookahead. - (Or minlen should take into account lookahead.) */ + (Or minlen should take into account lookahead.) + NOTE: Some of this comment is not correct. minlen does now take account + of lookahead/behind. Further research is required. -- demerphq + +*/ /* A failure to find a constant substring means that there is no need to make an expensive call to REx engine, thus we celebrate a failure. Similarly, @@ -4450,6 +4454,10 @@ NULL curly_try_B_max: /* a successful greedy match: now try to match B */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == ST.paren) { + goto fake_end; + } { UV c = 0; if (ST.c1 != CHRTEST_VOID) @@ -4459,10 +4467,6 @@ NULL /* If it could work, try it. */ if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) { CURLY_SETPAREN(ST.paren, ST.count); - if (cur_eval && cur_eval->u.eval.close_paren && - cur_eval->u.eval.close_paren == ST.paren) { - goto fake_end; - } PUSH_STATE_GOTO(CURLY_B_max, ST.B); /* NOTREACHED */ } diff --git a/regnodes.h b/regnodes.h index d6842b5..e3d1837 100644 --- a/regnodes.h +++ b/regnodes.h @@ -50,8 +50,8 @@ #define STAR 38 /* 0x26 Match this (simple) thing 0 or more times. */ #define PLUS 39 /* 0x27 Match this (simple) thing 1 or more times. */ #define CURLY 40 /* 0x28 Match this simple thing {n,m} times. */ -#define CURLYN 41 /* 0x29 Match next-after-this simple thing */ -#define CURLYM 42 /* 0x2a Match this medium-complex thing {n,m} times. */ +#define CURLYN 41 /* 0x29 Capture next-after-this simple thing */ +#define CURLYM 42 /* 0x2a Capture this medium-complex thing {n,m} times. */ #define CURLYX 43 /* 0x2b Match this complex thing {n,m} times. */ #define WHILEM 44 /* 0x2c Do curly processing and see if rest matches. */ #define OPEN 45 /* 0x2d Mark this point in input as start of */ @@ -191,7 +191,7 @@ EXTCONST U8 PL_regkind[] = { TRIE, /* AHOCORASICK */ TRIE, /* AHOCORASICKC */ RECURSE, /* RECURSE */ - RECURSE, /* SRECURSE */ + SRECURSE, /* SRECURSE */ NREF, /* NREF */ NREF, /* NREFF */ NREF, /* NREFFL */