X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=70906539816e04059775146ea00e2391025f58e1;hb=acd15b85995cf95382f158dbf237ef9c2ee99b7b;hp=a5eee5bb1e77fe936fa3bcdd542e48e6fd3402ee;hpb=23175d88b839c4a83ec070928df0fae32557300b;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index a5eee5b..7090653 100644 --- a/regcomp.c +++ b/regcomp.c @@ -57,7 +57,7 @@ **** Alterations to Henry's code are... **** **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, - **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -151,7 +151,9 @@ typedef struct RExC_state_t { #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) -#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */ +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_start (pRExC_state->emit_start) #define RExC_naughty (pRExC_state->naughty) @@ -172,6 +174,7 @@ typedef struct RExC_state_t { #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) @@ -183,10 +186,11 @@ typedef struct RExC_state_t { * Flags to be passed up and down. */ #define WORST 0 /* Worst case. */ -#define HASWIDTH 0x1 /* Known to match non-null strings. */ -#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 0x4 /* Starts with * or +. */ -#define TRYAGAIN 0x8 /* Weeded out a declaration. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ +#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 0x04 /* Starts with * or +. */ +#define TRYAGAIN 0x08 /* Weeded out a declaration. */ +#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) @@ -510,7 +514,21 @@ static const scan_data_t zero_scan_data = * Element 0 holds the number n. * Position is 1 indexed. */ - +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x #define Set_Node_Offset_To_R(node,byte) STMT_START { \ if (! SIZE_ONLY) { \ MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ @@ -553,11 +571,11 @@ static const scan_data_t zero_scan_data = Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ } STMT_END - +#endif #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS #define EXPERIMENTAL_INPLACESCAN -#endif +#endif /*RE_TRACK_PATTERN_OFFSETS*/ #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ @@ -645,7 +663,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min } data->last_end = -1; data->flags &= ~SF_BEFORE_EOL; - DEBUG_STUDYDATA("cl_anything: ",data,0); + DEBUG_STUDYDATA("commit: ",data,0); } /* Can match anything (initialization) */ @@ -1842,9 +1860,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #ifdef DEBUGGING regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + U32 mjd_offset = 0; U32 mjd_nodelen = 0; -#endif +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ /* This means we convert either the first branch or the first Exact, depending on whether the thing following (in 'last') is a branch @@ -1857,25 +1878,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( first != startbranch || OP( last ) == BRANCH ) { /* branch sub-chain */ NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_r({ mjd_offset= Node_Offset((convert)); mjd_nodelen= Node_Length((convert)); }); +#endif /* whole branch chain */ - } else { + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { DEBUG_r({ const regnode *nop = NEXTOPER( convert ); mjd_offset= Node_Offset((nop)); mjd_nodelen= Node_Length((nop)); }); } - DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); - +#endif /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; @@ -2027,6 +2051,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* needed for dumping*/ DEBUG_r(if (optimize) { regnode *opt = convert; + while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } @@ -4116,8 +4141,50 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) r->engine= RE_ENGINE_PTR; r->refcnt = 1; r->prelen = xend - exp; - r->precomp = savepvn(RExC_precomp, r->prelen); r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; + { + bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + r->wraplen = r->prelen + has_minus + has_k + has_runon + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(r->wrapped, r->wraplen, char ); + p = r->wrapped; + *p++='('; *p++='?'; + if (has_k) + *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + { + char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; + char *colon = r + 1; + char ch; + + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + else + *r-- = ch; + reganch >>= 1; + } + if(has_minus) { + *r = '-'; + p = colon; + } + } + + *p++=':'; + Copy(RExC_precomp, p, r->prelen, char); + r->precomp = p; + p += r->prelen; + if (has_runon) + *p++='\n'; + *p=')'; + } + r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ @@ -4129,15 +4196,14 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) } /* Useful during FAIL. */ - Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ - if (ri->offsets) { - ri->offsets[0] = RExC_size; - } +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, "%s %"UVuf" bytes for offset annotations.\n", - ri->offsets ? "Got" : "Couldn't get", + ri->u.offsets ? "Got" : "Couldn't get", (UV)((2*RExC_size+1) * sizeof(U32)))); - +#endif + SetProgLen(ri,RExC_size); RExC_rx = r; RExC_rxi = ri; @@ -4582,13 +4648,18 @@ reStudy: r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); else r->paren_names = NULL; + if (r->prelen == 3 && strEQ("\\s+", r->precomp)) + r->extflags |= RXf_WHITE; + else if (r->prelen == 1 && r->precomp[0] == '^') + r->extflags |= RXf_START_ONLY; + #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, 1, "p" ); ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); } else - ri->name_list_idx = 0; #endif + ri->name_list_idx = 0; if (RExC_recurse_count) { for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { @@ -4604,59 +4675,139 @@ reStudy: PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); - DEBUG_OFFSETS_r(if (ri->offsets) { - const U32 len = ri->offsets[0]; +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const U32 len = ri->u.offsets[0]; U32 i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]); + PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { - if (ri->offsets[i*2-1] || ri->offsets[i*2]) + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", - (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]); + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); } PerlIO_printf(Perl_debug_log, "\n"); }); +#endif return(r); } -#undef CORE_ONLY_BLOCK #undef RE_ENGINE_PTR -#ifndef PERL_IN_XSUB_RE + SV* -Perl_reg_named_buff_sv(pTHX_ SV* namesv) +Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags) { - I32 parno = 0; /* no match */ - if (PL_curpm) { - const REGEXP * const rx = PM_GETRE(PL_curpm); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); - if (he_str) { - IV i; - SV* sv_dat=HeVAL(he_str); - I32 *nums=(I32*)SvPVX(sv_dat); - for ( i=0; ilastparen) >= nums[i] && - rx->endp[nums[i]] != -1) - { - parno = nums[i]; - break; - } + AV *retarray = NULL; + SV *ret; + if (flags & 1) + retarray=newAV(); + + if (rx && rx->paren_names) { + HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->startp[nums[i]] != -1 + && rx->endp[nums[i]] != -1) + { + ret = CALLREG_NUMBUF(rx,nums[i],NULL); + if (!retarray) + return ret; + } else { + ret = newSVsv(&PL_sv_undef); + } + if (retarray) { + SvREFCNT_inc(ret); + av_push(retarray, ret); } } + if (retarray) + return (SV*)retarray; } } - if ( !parno ) { - return 0; + return NULL; +} + +SV* +Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) +{ + char *s = NULL; + I32 i = 0; + I32 s1, t1; + SV *sv = usesv ? usesv : newSVpvs(""); + + if (!rx->subbeg) { + sv_setsv(sv,&PL_sv_undef); + return sv; + } + else + if (paren == -2 && rx->startp[0] != -1) { + /* $` */ + i = rx->startp[0]; + s = rx->subbeg; + } + else + if (paren == -1 && rx->endp[0] != -1) { + /* $' */ + s = rx->subbeg + rx->endp[0]; + i = rx->sublen - rx->endp[0]; + } + else + if ( 0 <= paren && paren <= (I32)rx->nparens && + (s1 = rx->startp[paren]) != -1 && + (t1 = rx->endp[paren]) != -1) + { + /* $& $1 ... */ + i = t1 - s1; + s = rx->subbeg + s1; + } else { + sv_setsv(sv,&PL_sv_undef); + return sv; + } + assert(rx->sublen >= (s - rx->subbeg) + i ); + if (i >= 0) { + const int oldtainted = PL_tainted; + TAINT_NOT; + sv_setpvn(sv, s, i); + PL_tainted = oldtainted; + if ( (rx->extflags & RXf_CANY_SEEN) + ? (RX_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RX_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (PL_tainting) { + if (RX_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + PL_tainted = 1; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); + } + } else { + PL_tainted = 1; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); + } } else { - GV *gv_paren; - SV *sv= sv_newmortal(); - Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno); - gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV); - return GvSVn(gv_paren); + sv_setsv(sv,&PL_sv_undef); } + return sv; } -#endif + /* Scans the name of a named buffer from the pattern. * If flags is REG_RSN_RETURN_NULL returns null. @@ -4793,6 +4944,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) const I32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; /* for (?g), (?gc), and (?o) warnings; warning about (?c) will warn about (?g) -- japhy */ @@ -4921,8 +5074,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else if (*RExC_parse == '?') { /* (?...) */ - U32 posflags = 0, negflags = 0; - U32 *flagsp = &posflags; bool is_logical = 0; const char * const seqstart = RExC_parse; @@ -4965,8 +5116,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } - goto unknown; - case '<': /* (?<...) */ + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; else if (*RExC_parse != '=') @@ -4981,8 +5134,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? /* reverse test from the others */ REG_RSN_RETURN_NAME : REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) - goto unknown; + if (RExC_parse == name_start) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } if (*RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); @@ -5008,11 +5164,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Perl_croak(aTHX_ "panic: paren_name hash element allocation failed"); } else if ( SvPOK(sv_dat) ) { - IV count=SvIV(sv_dat); - I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1); - SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32)); - pv[count]=RExC_npar; - SvIVX(sv_dat)++; + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIVX(sv_dat)++; + } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); @@ -5040,6 +5211,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; case ':': /* (?:...) */ case '>': /* (?>...) */ break; @@ -5060,6 +5238,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; /*notreached*/ @@ -5140,6 +5319,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; } /* named and numeric backreferences */ @@ -5151,8 +5331,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* FALL THROUGH*/ case '?': /* (??...) */ is_logical = 1; - if (*RExC_parse != '{') - goto unknown; + if (*RExC_parse != '{') { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; paren = *RExC_parse++; /* FALL THROUGH */ case '{': /* (?{...}) */ @@ -5346,13 +5530,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) vFAIL("Sequence (? incomplete"); break; default: - --RExC_parse; - parse_flags: /* (?i) */ - while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { + --RExC_parse; + parse_flags: /* (?i) */ + { + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ /* (?g), (?gc) and (?o) are useless here and must be globally applied -- japhy */ - - if (*RExC_parse == 'o' || *RExC_parse == 'g') { + switch (*RExC_parse) { + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + case 'o': + case 'g': if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5367,8 +5558,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ); } } - } - else if (*RExC_parse == 'c') { + break; + + case 'c': if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5380,33 +5572,47 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ); } } - } - else { pmflag(flagsp, *RExC_parse); } - - ++RExC_parse; - } - if (*RExC_parse == '-') { - flagsp = &negflags; - wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case 'k': + if (flagsp == &negflags) { + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) + vWARN(RExC_parse + 1,"Useless use of (?-k)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + if (flagsp == &negflags) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + paren = ':'; + /*FALLTHROUGH*/ + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + nextchar(pRExC_state); + if (paren != ':') { + *flagp = TRYAGAIN; + return NULL; + } else { + ret = NULL; + goto parse_rest; + } + /*NOTREACHED*/ + default: + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } ++RExC_parse; - goto parse_flags; } - RExC_flags |= posflags; - RExC_flags &= ~negflags; - if (*RExC_parse == ':') { - RExC_parse++; - paren = ':'; - break; - } - unknown: - if (*RExC_parse != ')') { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - } - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; - } + }} /* one for the default block, one for the switch */ } else { /* (...) */ capturing_parens: @@ -5417,7 +5623,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE) { + if (RExC_seen & REG_SEEN_RECURSE + && !RExC_open_parens[parno-1]) + { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); @@ -5431,7 +5639,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else /* ! paren */ ret = NULL; - + + parse_rest: /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -5460,7 +5669,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else if (paren != '?') /* Not Conditional */ ret = br; - *flagp |= flags & (SPSTART | HASWIDTH); + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); lastbr = br; while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -5470,15 +5679,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } br = regbranch(pRExC_state, &flags, 0, depth+1); if (br == NULL) return(NULL); REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; - if (flags&HASWIDTH) - *flagp |= HASWIDTH; - *flagp |= flags&SPSTART; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); } if (have_branch || paren != ':') { @@ -5571,7 +5783,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } - + if (after_freeze) + RExC_npar = after_freeze; return(ret); } @@ -5618,7 +5831,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } else if (ret == NULL) ret = latest; - *flagp |= flags&HASWIDTH; + *flagp |= flags&(HASWIDTH|POSTPONED); if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { @@ -5800,7 +6013,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } nest_check: - if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { vWARN3(RExC_parse, "%.*s matches null string many times", (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), @@ -6025,7 +6238,9 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) char *s; char *p, *pend; STRLEN charlen = 1; +#ifdef DEBUGGING char * parse_start = name-3; /* needed for the offsets */ +#endif GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */ ret = reg_node(pRExC_state, @@ -6139,15 +6354,26 @@ S_reg_recode(pTHX_ const char value, SV **encp) /* - regatom - the lowest level - * - * Optimization: gobbles an entire sequence of ordinary characters so that - * it can turn them into a single node, which is smaller to store and - * faster to run. Backslashed characters are exceptions, each becoming a - * separate node; the code is simpler that way and it's not worth fixing. - * - * [Yes, it is worth fixing, some scripts can run twice the speed.] - * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes] - */ + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. + + If the string doesn't start with something special then we gobble up + as much literal text as we can. + + Once we have been able to handle whatever type of thing started the + sequence, we return. + + Note: we have to be careful with escapes, as they can be both literal + and special, and in the case of \10 and friends can either, depending + on context. Specifically there are two seperate switches for handling + escape sequences, with the one for handling literal escapes requiring + a dummy entry for all of the special escapes that are actually handled + by the other. +*/ + STATIC regnode * S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { @@ -6159,6 +6385,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) DEBUG_PARSE("atom"); *flagp = WORST; /* Tentatively. */ + tryagain: switch (*RExC_parse) { case '^': @@ -6221,7 +6448,7 @@ tryagain: } return(NULL); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; case '|': case ')': @@ -6245,104 +6472,110 @@ tryagain: vFAIL("Quantifier follows nothing"); break; case '\\': + /* Special Escapes + + This switch handles escape sequences that resolve to some kind + of special regop and not to literal text. Escape sequnces that + resolve to literal text are handled below in the switch marked + "Literal Escapes". + + Every entry in this switch *must* have a corresponding entry + in the literal escape switch. However, the opposite is not + required, as the default for this switch is to jump to the + literal text handling code. + */ switch (*++RExC_parse) { + /* Special Escapes */ case 'A': RExC_seen_zerolen++; ret = reg_node(pRExC_state, SBOL); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); RExC_seen |= REG_SEEN_GPOS; *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; + case 'K': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ - nextchar(pRExC_state); - break; + goto finish_meta_pat; case 'z': ret = reg_node(pRExC_state, EOS); *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); RExC_seen |= REG_SEEN_CANY; *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'X': ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'w': ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'W': ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 's': ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'S': ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'd': ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'D': ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'v': + ret = reganode(pRExC_state, PRUNE, 0); + ret->flags = 1; + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'V': + ret = reganode(pRExC_state, SKIP, 0); + ret->flags = 1; + *flagp |= SIMPLE; + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ - break; + break; case 'p': case 'P': { char* const oldregxend = RExC_end; +#ifdef DEBUGGING char* parse_start = RExC_parse - 2; +#endif if (RExC_parse[1] == '{') { /* a lovely hack--pretend we saw [\pX] instead */ @@ -6419,16 +6652,6 @@ tryagain: } break; } - case 'n': - case 'r': - case 't': - case 'f': - case 'e': - case 'a': - case 'x': - case 'c': - case '0': - goto defchar; case 'g': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -6503,9 +6726,7 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - while (RExC_parse < RExC_end && *RExC_parse != '\n') - RExC_parse++; - if (RExC_parse < RExC_end) + if ( reg_skipcomment( pRExC_state ) ) goto tryagain; } /* FALL THROUGH */ @@ -6534,7 +6755,7 @@ tryagain: char * const oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + p = regwhite( pRExC_state, p ); switch (*p) { case '^': case '$': @@ -6545,29 +6766,40 @@ tryagain: case '|': goto loopdone; case '\\': + /* Literal Escapes Switch + + This switch is meant to handle escape sequences that + resolve to a literal character. + + Every escape sequence that represents something + else, like an assertion or a char class, is handled + in the switch marked 'Special Escapes' above in this + routine, but also has an entry here as anything that + isn't explicitly mentioned here will be treated as + an unescaped equivalent literal. + */ + switch (*++p) { - case 'A': - case 'C': - case 'X': - case 'G': - case 'g': - case 'Z': - case 'z': - case 'w': - case 'W': - case 'b': - case 'B': - case 's': - case 'S': - case 'd': - case 'D': - case 'p': - case 'P': - case 'N': - case 'R': - case 'k': + /* These are all the special escapes. */ + case 'A': /* Start assertion */ + case 'b': case 'B': /* Word-boundary assertion*/ + case 'C': /* Single char !DANGEROUS! */ + case 'd': case 'D': /* digit class */ + case 'g': case 'G': /* generic-backref, pos assertion */ + case 'k': case 'K': /* named backref, keep marker */ + case 'N': /* named char sequence */ + case 'p': case 'P': /* unicode property */ + case 's': case 'S': /* space class */ + case 'v': case 'V': /* (*PRUNE) and (*SKIP) */ + case 'w': case 'W': /* word class */ + case 'X': /* eXtended Unicode "combining character sequence" */ + case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; + + /* Anything after here is an escape that resolves to a + literal. (Except digits, which may or may not) + */ case 'n': ender = '\n'; p++; @@ -6671,13 +6903,13 @@ tryagain: ender = *p++; break; } - if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regwhite( pRExC_state, p ); if (UTF && FOLD) { /* Prime the casefolded buffer. */ ender = toFOLD_uni(ender, tmpbuf, &foldlen); } - if (ISMULT2(p)) { /* Back off on ?+*. */ + if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else if (UTF) { @@ -6779,15 +7011,22 @@ tryagain: } STATIC char * -S_regwhite(char *p, const char *e) +S_regwhite( RExC_state_t *pRExC_state, char *p ) { + const char *e = RExC_end; while (p < e) { if (isSPACE(*p)) ++p; else if (*p == '#') { + bool ended = 0; do { - p++; - } while (p < e && *p != '\n'); + if (*p++ == '\n') { + ended = 1; + break; + } + } while (p < e); + if (!ended) + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; } else break; @@ -7569,6 +7808,49 @@ parseit: #undef _C_C_T_ +/* reg_skipcomment() + + Absorbs an /x style # comments from the input stream. + Returns true if there is more text remaining in the stream. + Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + terminates the pattern without including a newline. + + Note its the callers responsibility to ensure that we are + actually in /x mode + +*/ + +STATIC bool +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +{ + bool ended = 0; + while (RExC_parse < RExC_end) + if (*RExC_parse++ == '\n') { + ended = 1; + break; + } + if (!ended) { + /* we ran off the end of the pattern without ending + the comment, so we have to add an \n when wrapping */ + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + return 0; + } else + return 1; +} + +/* nextchar() + + Advance that parse position, and optionall absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + STATIC char* S_nextchar(pTHX_ RExC_state_t *pRExC_state) { @@ -7591,9 +7873,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } else if (*RExC_parse == '#') { - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') break; - continue; + if ( reg_skipcomment( pRExC_state ) ) + continue; } } return retval; @@ -7624,6 +7905,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", "reg_node", __LINE__, @@ -7635,7 +7917,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) (UV)RExC_offsets[0])); Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } - +#endif RExC_emit = ptr; return(ret); } @@ -7677,6 +7959,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", @@ -7689,7 +7972,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) (UV)RExC_offsets[0])); Set_Cur_Node_Offset; } - +#endif RExC_emit = ptr; return(ret); } @@ -7719,6 +8002,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; + PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]); if (SIZE_ONLY) { @@ -7750,6 +8034,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) while (src > opnd) { StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", @@ -7763,10 +8048,12 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); } +#endif } place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", @@ -7780,6 +8067,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset(place, RExC_parse); Set_Node_Length(place, 1); } +#endif src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -8075,7 +8363,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* print the details of the trie in dumpuntil instead, as * progi->data isn't available here */ const char op = OP(o); - const I32 n = ARG(o); + const U32 n = ARG(o); const reg_ac_data * const ac = IS_TRIE_AC(op) ? (reg_ac_data *)progi->data->data[n] : NULL; @@ -8129,32 +8417,33 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( prog->paren_names ) { - AV *list= (AV *)progi->data->data[progi->name_list_idx]; - SV **name= av_fetch(list, ARG(o), 0 ); - if (name) - Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", *name); - } - } else if (k == NREF) { - if ( prog->paren_names ) { - AV *list= (AV *)progi->data->data[ progi->name_list_idx ]; - SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ]; - I32 *nums=(I32*)SvPVX(sv_dat); - SV **name= av_fetch(list, nums[0], 0 ); - I32 n; - if (name) { - for ( n=0; ndata->data[progi->name_list_idx]; + SV **name= av_fetch(list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + else { + AV *list= (AV *)progi->data->data[ progi->name_list_idx ]; + SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ]; + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch(list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; nflags) Perl_sv_catpvf(aTHX_ sv, ":%"SVf, - (SV*)progi->data->data[ ARG( o ) ]); + SVfARG((SV*)progi->data->data[ ARG( o ) ])); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { @@ -8355,10 +8644,6 @@ Perl_pregfree(pTHX_ struct regexp *r) return; CALLREGFREE_PVT(r); /* free the private data */ - - /* gcov results gave these as non-null 100% of the time, so there's no - optimisation in checking them before calling Safefree */ - Safefree(r->precomp); RX_MATCH_COPY_FREE(r); #ifdef PERL_OLD_COPY_ON_WRITE if (r->saved_copy) @@ -8376,8 +8661,8 @@ Perl_pregfree(pTHX_ struct regexp *r) Safefree(r->substrs); } if (r->paren_names) - SvREFCNT_dec(r->paren_names); - + SvREFCNT_dec(r->paren_names); + Safefree(r->wrapped); Safefree(r->startp); Safefree(r->endp); Safefree(r); @@ -8414,8 +8699,10 @@ Perl_regfree_internal(pTHX_ struct regexp *r) PL_colors[4],PL_colors[5],s); } }); - - Safefree(ri->offsets); /* 20010421 MJD */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif if (ri->data) { int n = ri->data->count; PAD* new_comppad = NULL; @@ -8567,11 +8854,14 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) } else ret->substrs = NULL; - ret->precomp = SAVEPVN(r->precomp, r->prelen); + ret->wrapped = SAVEPVN(r->wrapped, r->wraplen); + ret->precomp = ret->wrapped + (r->precomp - r->wrapped); + ret->prelen = r->prelen; + ret->wraplen = r->wraplen; + ret->refcnt = r->refcnt; ret->minlen = r->minlen; ret->minlenret = r->minlenret; - ret->prelen = r->prelen; ret->nparens = r->nparens; ret->lastparen = r->lastparen; ret->lastcloseparen = r->lastcloseparen; @@ -8624,7 +8914,7 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) RXi_GET_DECL(r,ri); npar = r->nparens+1; - len = ri->offsets[0]; + len = ProgLen(ri); Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); @@ -8638,8 +8928,8 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) reti->swap = NULL; } - reti->regstclass = NULL; + if (ri->data) { struct reg_data *d; const int count = ri->data->count; @@ -8700,9 +8990,17 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) else reti->data = NULL; - Newx(reti->offsets, 2*len+1, U32); - Copy(ri->offsets, reti->offsets, 2*len+1, U32); - + reti->name_list_idx = ri->name_list_idx; + +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + return (void*)reti; } @@ -8736,80 +9034,18 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) */ #ifndef PERL_IN_XSUB_RE + char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { dVAR; const regexp * const re = (regexp *)mg->mg_obj; - - if (!mg->mg_ptr) { - const char *fptr = "msix"; - char reflags[6]; - char ch; - int left = 0; - int right = 4; - bool need_newline = 0; - U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12); - - while((ch = *fptr++)) { - if(reganch & 1) { - reflags[left++] = ch; - } - else { - reflags[right--] = ch; - } - reganch >>= 1; - } - if(left != 4) { - reflags[left] = '-'; - left = 5; - } - - mg->mg_len = re->prelen + 4 + left; - /* - * If /x was used, we have to worry about a regex ending with a - * comment later being embedded within another regex. If so, we don't - * want this regex's "commentization" to leak out to the right part of - * the enclosing regex, we must cap it with a newline. - * - * So, if /x was used, we scan backwards from the end of the regex. If - * we find a '#' before we find a newline, we need to add a newline - * ourself. If we find a '\n' first (or if we don't find '#' or '\n'), - * we don't need to add anything. -jfriedl - */ - if (PMf_EXTENDED & re->extflags) { - const char *endptr = re->precomp + re->prelen; - while (endptr >= re->precomp) { - const char c = *(endptr--); - if (c == '\n') - break; /* don't need another */ - if (c == '#') { - /* we end while in a comment, so we need a newline */ - mg->mg_len++; /* save space for it */ - need_newline = 1; /* note to add it */ - break; - } - } - } - - Newx(mg->mg_ptr, mg->mg_len + 1 + left, char); - mg->mg_ptr[0] = '('; - mg->mg_ptr[1] = '?'; - Copy(reflags, mg->mg_ptr+2, left, char); - *(mg->mg_ptr+left+2) = ':'; - Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char); - if (need_newline) - mg->mg_ptr[mg->mg_len - 2] = '\n'; - mg->mg_ptr[mg->mg_len - 1] = ')'; - mg->mg_ptr[mg->mg_len] = 0; - } if (haseval) *haseval = re->seen_evals; if (flags) *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); - if (lp) - *lp = mg->mg_len; - return mg->mg_ptr; + *lp = re->wraplen; + return re->wrapped; } /* @@ -9019,7 +9255,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, else if ( PL_regkind[(U8)op] == TRIE ) { const regnode *this_trie = node; const char op = OP(node); - const I32 n = ARG(node); + const U32 n = ARG(node); const reg_ac_data * const ac = op>=AHOCORASICK ? (reg_ac_data *)ri->data->data[n] : NULL;