X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=70906539816e04059775146ea00e2391025f58e1;hb=acd15b85995cf95382f158dbf237ef9c2ee99b7b;hp=ec79ceda5cb8e1d70a8d9d325a292391173ce287;hpb=57b84237ab396aa1e1efe3b629c2b636d4869d93;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index ec79ced..7090653 100644 --- a/regcomp.c +++ b/regcomp.c @@ -186,10 +186,11 @@ typedef struct RExC_state_t { * Flags to be passed up and down. */ #define WORST 0 /* Worst case. */ -#define HASWIDTH 0x1 /* Known to match non-null strings. */ -#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 0x4 /* Starts with * or +. */ -#define TRYAGAIN 0x8 /* Weeded out a declaration. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ +#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 0x04 /* Starts with * or +. */ +#define TRYAGAIN 0x08 /* Weeded out a declaration. */ +#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) @@ -662,7 +663,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min } data->last_end = -1; data->flags &= ~SF_BEFORE_EOL; - DEBUG_STUDYDATA("cl_anything: ",data,0); + DEBUG_STUDYDATA("commit: ",data,0); } /* Can match anything (initialization) */ @@ -2050,6 +2051,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* needed for dumping*/ DEBUG_r(if (optimize) { regnode *opt = convert; + while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } @@ -4139,8 +4141,50 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) r->engine= RE_ENGINE_PTR; r->refcnt = 1; r->prelen = xend - exp; - r->precomp = savepvn(RExC_precomp, r->prelen); r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; + { + bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + r->wraplen = r->prelen + has_minus + has_k + has_runon + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(r->wrapped, r->wraplen, char ); + p = r->wrapped; + *p++='('; *p++='?'; + if (has_k) + *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + { + char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; + char *colon = r + 1; + char ch; + + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + else + *r-- = ch; + reganch >>= 1; + } + if(has_minus) { + *r = '-'; + p = colon; + } + } + + *p++=':'; + Copy(RExC_precomp, p, r->prelen, char); + r->precomp = p; + p += r->prelen; + if (has_runon) + *p++='\n'; + *p=')'; + } + r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ @@ -4648,58 +4692,53 @@ reStudy: return(r); } -#undef CORE_ONLY_BLOCK #undef RE_ENGINE_PTR -#ifndef PERL_IN_XSUB_RE + SV* -Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags) +Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags) { AV *retarray = NULL; SV *ret; if (flags & 1) retarray=newAV(); - - if (from_re || PL_curpm) { - const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); - if (he_str) { - IV i; - SV* sv_dat=HeVAL(he_str); - I32 *nums=(I32*)SvPVX(sv_dat); - for ( i=0; inparens) >= nums[i] - && rx->startp[nums[i]] != -1 - && rx->endp[nums[i]] != -1) - { - ret = reg_numbered_buff_get(nums[i],rx,NULL,0); - if (!retarray) - return ret; - } else { - ret = newSVsv(&PL_sv_undef); - } - if (retarray) { - SvREFCNT_inc(ret); - av_push(retarray, ret); - } + + if (rx && rx->paren_names) { + HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->startp[nums[i]] != -1 + && rx->endp[nums[i]] != -1) + { + ret = CALLREG_NUMBUF(rx,nums[i],NULL); + if (!retarray) + return ret; + } else { + ret = newSVsv(&PL_sv_undef); + } + if (retarray) { + SvREFCNT_inc(ret); + av_push(retarray, ret); } - if (retarray) - return (SV*)retarray; } + if (retarray) + return (SV*)retarray; } } return NULL; } SV* -Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags) +Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) { char *s = NULL; I32 i = 0; I32 s1, t1; SV *sv = usesv ? usesv : newSVpvs(""); - PERL_UNUSED_ARG(flags); if (!rx->subbeg) { sv_setsv(sv,&PL_sv_undef); @@ -4768,7 +4807,7 @@ Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, } return sv; } -#endif + /* Scans the name of a named buffer from the pattern. * If flags is REG_RSN_RETURN_NULL returns null. @@ -4905,6 +4944,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) const I32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; /* for (?g), (?gc), and (?o) warnings; warning about (?c) will warn about (?g) -- japhy */ @@ -5123,11 +5164,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Perl_croak(aTHX_ "panic: paren_name hash element allocation failed"); } else if ( SvPOK(sv_dat) ) { - IV count=SvIV(sv_dat); - I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1); - SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32)); - pv[count]=RExC_npar; - SvIVX(sv_dat)++; + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIVX(sv_dat)++; + } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); @@ -5155,6 +5211,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; case ':': /* (?:...) */ case '>': /* (?>...) */ break; @@ -5175,6 +5238,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; /*notreached*/ @@ -5255,6 +5319,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; } /* named and numeric backreferences */ @@ -5271,6 +5336,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); /*NOTREACHED*/ } + *flagp |= POSTPONED; paren = *RExC_parse++; /* FALL THROUGH */ case '{': /* (?{...}) */ @@ -5557,7 +5623,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE) { + if (RExC_seen & REG_SEEN_RECURSE + && !RExC_open_parens[parno-1]) + { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); @@ -5601,7 +5669,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else if (paren != '?') /* Not Conditional */ ret = br; - *flagp |= flags & (SPSTART | HASWIDTH); + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); lastbr = br; while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -5611,15 +5679,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } br = regbranch(pRExC_state, &flags, 0, depth+1); if (br == NULL) return(NULL); REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; - if (flags&HASWIDTH) - *flagp |= HASWIDTH; - *flagp |= flags&SPSTART; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); } if (have_branch || paren != ':') { @@ -5712,7 +5783,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } - + if (after_freeze) + RExC_npar = after_freeze; return(ret); } @@ -5759,7 +5831,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } else if (ret == NULL) ret = latest; - *flagp |= flags&HASWIDTH; + *flagp |= flags&(HASWIDTH|POSTPONED); if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { @@ -5941,7 +6013,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } nest_check: - if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { vWARN3(RExC_parse, "%.*s matches null string many times", (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), @@ -6376,7 +6448,7 @@ tryagain: } return(NULL); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; case '|': case ')': @@ -6654,9 +6726,7 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - while (RExC_parse < RExC_end && *RExC_parse != '\n') - RExC_parse++; - if (RExC_parse < RExC_end) + if ( reg_skipcomment( pRExC_state ) ) goto tryagain; } /* FALL THROUGH */ @@ -6685,7 +6755,7 @@ tryagain: char * const oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + p = regwhite( pRExC_state, p ); switch (*p) { case '^': case '$': @@ -6833,13 +6903,13 @@ tryagain: ender = *p++; break; } - if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regwhite( pRExC_state, p ); if (UTF && FOLD) { /* Prime the casefolded buffer. */ ender = toFOLD_uni(ender, tmpbuf, &foldlen); } - if (ISMULT2(p)) { /* Back off on ?+*. */ + if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else if (UTF) { @@ -6941,15 +7011,22 @@ tryagain: } STATIC char * -S_regwhite(char *p, const char *e) +S_regwhite( RExC_state_t *pRExC_state, char *p ) { + const char *e = RExC_end; while (p < e) { if (isSPACE(*p)) ++p; else if (*p == '#') { + bool ended = 0; do { - p++; - } while (p < e && *p != '\n'); + if (*p++ == '\n') { + ended = 1; + break; + } + } while (p < e); + if (!ended) + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; } else break; @@ -7731,6 +7808,49 @@ parseit: #undef _C_C_T_ +/* reg_skipcomment() + + Absorbs an /x style # comments from the input stream. + Returns true if there is more text remaining in the stream. + Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + terminates the pattern without including a newline. + + Note its the callers responsibility to ensure that we are + actually in /x mode + +*/ + +STATIC bool +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +{ + bool ended = 0; + while (RExC_parse < RExC_end) + if (*RExC_parse++ == '\n') { + ended = 1; + break; + } + if (!ended) { + /* we ran off the end of the pattern without ending + the comment, so we have to add an \n when wrapping */ + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + return 0; + } else + return 1; +} + +/* nextchar() + + Advance that parse position, and optionall absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + STATIC char* S_nextchar(pTHX_ RExC_state_t *pRExC_state) { @@ -7753,9 +7873,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } else if (*RExC_parse == '#') { - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') break; - continue; + if ( reg_skipcomment( pRExC_state ) ) + continue; } } return retval; @@ -7883,6 +8002,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; + PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]); if (SIZE_ONLY) { @@ -8524,10 +8644,6 @@ Perl_pregfree(pTHX_ struct regexp *r) return; CALLREGFREE_PVT(r); /* free the private data */ - - /* gcov results gave these as non-null 100% of the time, so there's no - optimisation in checking them before calling Safefree */ - Safefree(r->precomp); RX_MATCH_COPY_FREE(r); #ifdef PERL_OLD_COPY_ON_WRITE if (r->saved_copy) @@ -8545,8 +8661,8 @@ Perl_pregfree(pTHX_ struct regexp *r) Safefree(r->substrs); } if (r->paren_names) - SvREFCNT_dec(r->paren_names); - + SvREFCNT_dec(r->paren_names); + Safefree(r->wrapped); Safefree(r->startp); Safefree(r->endp); Safefree(r); @@ -8738,11 +8854,14 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) } else ret->substrs = NULL; - ret->precomp = SAVEPVN(r->precomp, r->prelen); + ret->wrapped = SAVEPVN(r->wrapped, r->wraplen); + ret->precomp = ret->wrapped + (r->precomp - r->wrapped); + ret->prelen = r->prelen; + ret->wraplen = r->wraplen; + ret->refcnt = r->refcnt; ret->minlen = r->minlen; ret->minlenret = r->minlenret; - ret->prelen = r->prelen; ret->nparens = r->nparens; ret->lastparen = r->lastparen; ret->lastcloseparen = r->lastcloseparen; @@ -8809,8 +8928,8 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) reti->swap = NULL; } - reti->regstclass = NULL; + if (ri->data) { struct reg_data *d; const int count = ri->data->count; @@ -8915,83 +9034,18 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) */ #ifndef PERL_IN_XSUB_RE + char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { dVAR; const regexp * const re = (regexp *)mg->mg_obj; - - if (!mg->mg_ptr) { - const char *fptr = STD_PAT_MODS; /*"msix"*/ - char reflags[7]; - char ch; - bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); - U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12); - bool need_newline = 0; - int left = 0; - int right = 4 + hask; - if (hask) - reflags[left++] = KEEPCOPY_PAT_MOD; /*'k'*/ - while((ch = *fptr++)) { - if(reganch & 1) { - reflags[left++] = ch; - } - else { - reflags[right--] = ch; - } - reganch >>= 1; - } - if(hasm) { - reflags[left] = '-'; - left = 5 + hask; - } - /* printf("[%*.7s]\n",left,reflags); */ - mg->mg_len = re->prelen + 4 + left; - /* - * If /x was used, we have to worry about a regex ending with a - * comment later being embedded within another regex. If so, we don't - * want this regex's "commentization" to leak out to the right part of - * the enclosing regex, we must cap it with a newline. - * - * So, if /x was used, we scan backwards from the end of the regex. If - * we find a '#' before we find a newline, we need to add a newline - * ourself. If we find a '\n' first (or if we don't find '#' or '\n'), - * we don't need to add anything. -jfriedl - */ - if (PMf_EXTENDED & re->extflags) { - const char *endptr = re->precomp + re->prelen; - while (endptr >= re->precomp) { - const char c = *(endptr--); - if (c == '\n') - break; /* don't need another */ - if (c == '#') { - /* we end while in a comment, so we need a newline */ - mg->mg_len++; /* save space for it */ - need_newline = 1; /* note to add it */ - break; - } - } - } - - Newx(mg->mg_ptr, mg->mg_len + 1 + left, char); - mg->mg_ptr[0] = '('; - mg->mg_ptr[1] = '?'; - Copy(reflags, mg->mg_ptr+2, left, char); - *(mg->mg_ptr+left+2) = ':'; - Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char); - if (need_newline) - mg->mg_ptr[mg->mg_len - 2] = '\n'; - mg->mg_ptr[mg->mg_len - 1] = ')'; - mg->mg_ptr[mg->mg_len] = 0; - } if (haseval) *haseval = re->seen_evals; if (flags) *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); - if (lp) - *lp = mg->mg_len; - return mg->mg_ptr; + *lp = re->wraplen; + return re->wrapped; } /*