char *parse; /* Input-scan pointer. */
I32 whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
+ regnode *emit_bound; /* First regnode outside of the allocated space */
regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
regnode **open_parens; /* pointers to open parens */
regnode **close_parens; /* pointers to close parens */
regnode *opend; /* END node in program */
- I32 utf8;
+ I32 utf8; /* whether the pattern is utf8 or not */
+ I32 orig_utf8; /* whether the pattern was originally in utf8 */
+ /* XXX use this for future optimisation of case
+ * where pattern must be upgraded to utf8. */
HV *charnames; /* cache of named sequences */
HV *paren_names; /* Paren names */
#endif
#define RExC_emit (pRExC_state->emit)
#define RExC_emit_start (pRExC_state->emit_start)
+#define RExC_emit_bound (pRExC_state->emit_bound)
#define RExC_naughty (pRExC_state->naughty)
#define RExC_sawback (pRExC_state->sawback)
#define RExC_seen (pRExC_state->seen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
+#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define RExC_charnames (pRExC_state->charnames)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
* Flags to be passed up and down.
*/
#define WORST 0 /* Worst case. */
-#define HASWIDTH 0x1 /* Known to match non-null strings. */
-#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
-#define SPSTART 0x4 /* Starts with * or +. */
-#define TRYAGAIN 0x8 /* Weeded out a declaration. */
+#define HASWIDTH 0x01 /* Known to match non-null strings. */
+#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
+#define SPSTART 0x04 /* Starts with * or +. */
+#define TRYAGAIN 0x08 /* Weeded out a declaration. */
+#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
}
data->last_end = -1;
data->flags &= ~SF_BEFORE_EOL;
- DEBUG_STUDYDATA("cl_anything: ",data,0);
+ DEBUG_STUDYDATA("commit: ",data,0);
}
/* Can match anything (initialization) */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
- STRLEN chars=0;
+ STRLEN chars = 0;
+ bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
trie->minlen= 0;
continue;
}
- if (trie->bitmap) {
- TRIE_BITMAP_SET(trie,*uc);
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
- }
+ if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
+ regardless of encoding */
+
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
TRIE_STORE_REVCHAR;
}
+ if ( set_bit ) {
+ /* store the codepoint in the bitmap, and if its ascii
+ also store its folded equivelent. */
+ TRIE_BITMAP_SET(trie,uvc);
+ if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+ set_bit = 0; /* We've done our bit :-) */
+ }
} else {
SV** svpp;
if ( !widecharmap )
/* needed for dumping*/
DEBUG_r(if (optimize) {
regnode *opt = convert;
+
while ( ++opt < optimize) {
Set_Node_Offset_Length(opt,0,0);
}
if (exp == NULL)
FAIL("NULL regexp argument");
- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
- RExC_precomp = exp;
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RExC_utf8,
- dsv, RExC_precomp, (xend - exp), 60);
+ dsv, exp, (xend - exp), 60);
PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
PL_colors[4],PL_colors[5],s);
});
+
+redo_first_pass:
+ RExC_precomp = exp;
RExC_flags = pm->op_pmflags;
RExC_sawback = 0;
RExC_precomp = NULL;
return(NULL);
}
+ if (RExC_utf8 && !RExC_orig_utf8) {
+ /* It's possible to write a regexp in ascii that represents unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ XXX: somehow figure out how to make this less expensive...
+ -- dmq */
+ STRLEN len = xend-exp;
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
+ xend = exp + len;
+ RExC_orig_utf8 = RExC_utf8;
+ SAVEFREEPV(exp);
+ goto redo_first_pass;
+ }
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
"Required size %"IVdf" nodes\n"
if (RExC_whilem_seen > 15)
RExC_whilem_seen = 15;
-#ifdef DEBUGGING
- /* Make room for a sentinel value at the end of the program */
- RExC_size++;
-#endif
-
/* Allocate space and zero-initialize. Note, the two step process
of zeroing when in debug mode, thus anything assigned has to
happen after that */
r->engine= RE_ENGINE_PTR;
r->refcnt = 1;
r->prelen = xend - exp;
- r->precomp = savepvn(RExC_precomp, r->prelen);
r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
+ {
+ bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
+ bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
+ bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
+ U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
+ const char *fptr = STD_PAT_MODS; /*"msix"*/
+ char *p;
+ r->wraplen = r->prelen + has_minus + has_k + has_runon
+ + (sizeof(STD_PAT_MODS) - 1)
+ + (sizeof("(?:)") - 1);
+
+ Newx(r->wrapped, r->wraplen + 1, char );
+ p = r->wrapped;
+ *p++='('; *p++='?';
+ if (has_k)
+ *p++ = KEEPCOPY_PAT_MOD; /*'k'*/
+ {
+ char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
+ char *colon = r + 1;
+ char ch;
+
+ while((ch = *fptr++)) {
+ if(reganch & 1)
+ *p++ = ch;
+ else
+ *r-- = ch;
+ reganch >>= 1;
+ }
+ if(has_minus) {
+ *r = '-';
+ p = colon;
+ }
+ }
+
+ *p++ = ':';
+ Copy(RExC_precomp, p, r->prelen, char);
+ r->precomp = p;
+ p += r->prelen;
+ if (has_runon)
+ *p++ = '\n';
+ *p++ = ')';
+ *p = 0;
+ }
+
r->intflags = 0;
r->nparens = RExC_npar - 1; /* set early to validate backrefs */
RExC_npar = 1;
RExC_emit_start = ri->program;
RExC_emit = ri->program;
-#ifdef DEBUGGING
- /* put a sentinal on the end of the program so we can check for
- overwrites */
- ri->program[RExC_size].type = 255;
-#endif
+ RExC_emit_bound = ri->program + RExC_size + 1;
+
/* Store the count of eval-groups for security checks: */
RExC_rx->seen_evals = RExC_seen_evals;
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
}
}
- Newxz(r->startp, RExC_npar, I32);
- Newxz(r->endp, RExC_npar, I32);
+ Newxz(r->startp, RExC_npar * 2, I32);
+ r->endp = r->startp + RExC_npar;
/* assume we don't need to swap parens around before we match */
DEBUG_DUMP_r({
return(r);
}
-#undef CORE_ONLY_BLOCK
#undef RE_ENGINE_PTR
-#ifndef PERL_IN_XSUB_RE
+
SV*
-Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags)
+Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags)
{
AV *retarray = NULL;
SV *ret;
if (flags & 1)
retarray=newAV();
-
- if (from_re || PL_curpm) {
- const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm);
- if (rx && rx->paren_names) {
- HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
- if (he_str) {
- IV i;
- SV* sv_dat=HeVAL(he_str);
- I32 *nums=(I32*)SvPVX(sv_dat);
- for ( i=0; i<SvIVX(sv_dat); i++ ) {
- if ((I32)(rx->lastparen) >= nums[i] &&
- rx->endp[nums[i]] != -1)
- {
- ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
- if (!retarray)
- return ret;
- } else {
- ret = newSVsv(&PL_sv_undef);
- }
- if (retarray) {
- SvREFCNT_inc(ret);
- av_push(retarray, ret);
- }
+
+ if (rx && rx->paren_names) {
+ HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
+ if (he_str) {
+ IV i;
+ SV* sv_dat=HeVAL(he_str);
+ I32 *nums=(I32*)SvPVX(sv_dat);
+ for ( i=0; i<SvIVX(sv_dat); i++ ) {
+ if ((I32)(rx->nparens) >= nums[i]
+ && rx->startp[nums[i]] != -1
+ && rx->endp[nums[i]] != -1)
+ {
+ ret = CALLREG_NUMBUF(rx,nums[i],NULL);
+ if (!retarray)
+ return ret;
+ } else {
+ ret = newSVsv(&PL_sv_undef);
+ }
+ if (retarray) {
+ SvREFCNT_inc(ret);
+ av_push(retarray, ret);
}
- if (retarray)
- return (SV*)retarray;
}
+ if (retarray)
+ return (SV*)retarray;
}
}
return NULL;
}
SV*
-Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags)
+Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv)
{
char *s = NULL;
I32 i = 0;
I32 s1, t1;
SV *sv = usesv ? usesv : newSVpvs("");
- PERL_UNUSED_ARG(flags);
if (!rx->subbeg) {
sv_setsv(sv,&PL_sv_undef);
}
return sv;
}
-#endif
+
/* Scans the name of a named buffer from the pattern.
* If flags is REG_RSN_RETURN_NULL returns null.
PerlIO_printf(Perl_debug_log,"%16s",""); \
\
if (SIZE_ONLY) \
- num=RExC_size; \
+ num = RExC_size + 1; \
else \
num=REG_NODE_NUM(RExC_emit); \
if (RExC_lastnum!=num) \
#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
#endif
-/* this idea is borrowed from STR_WITH_LEN in handy.h */
-#define CHECK_WORD(s,v,l) \
- (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
-
STATIC regnode *
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
const I32 oregflags = RExC_flags;
bool have_branch = 0;
bool is_open = 0;
+ I32 freeze_paren = 0;
+ I32 after_freeze = 0;
/* for (?g), (?gc), and (?o) warnings; warning
about (?c) will warn about (?g) -- japhy */
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("reg ");
-
*flagp = 0; /* Tentatively. */
switch ( *start_verb ) {
case 'A': /* (*ACCEPT) */
- if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
+ if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
op = ACCEPT;
internal_argval = RExC_nestroot;
}
break;
case 'C': /* (*COMMIT) */
- if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
+ if ( memEQs(start_verb,verb_len,"COMMIT") )
op = COMMIT;
break;
case 'F': /* (*FAIL) */
- if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
+ if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
op = OPFAIL;
argok = 0;
}
break;
case ':': /* (*:NAME) */
case 'M': /* (*MARK:NAME) */
- if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
+ if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
op = MARKPOINT;
argok = -1;
}
break;
case 'P': /* (*PRUNE) */
- if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
+ if ( memEQs(start_verb,verb_len,"PRUNE") )
op = PRUNE;
break;
case 'S': /* (*SKIP) */
- if ( CHECK_WORD("SKIP",start_verb,verb_len) )
+ if ( memEQs(start_verb,verb_len,"SKIP") )
op = SKIP;
break;
case 'T': /* (*THEN) */
/* [19:06] <TimToady> :: is then */
- if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
+ if ( memEQs(start_verb,verb_len,"THEN") ) {
op = CUTGROUP;
RExC_seen |= REG_SEEN_CUTGROUP;
}
nextchar(pRExC_state);
return ret;
}
- goto unknown;
- case '<': /* (?<...) */
+ RExC_parse++;
+ vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+ /*NOTREACHED*/
+ case '<': /* (?<...) */
if (*RExC_parse == '!')
paren = ',';
else if (*RExC_parse != '=')
SIZE_ONLY ? /* reverse test from the others */
REG_RSN_RETURN_NAME :
REG_RSN_RETURN_NULL);
- if (RExC_parse == name_start)
- goto unknown;
+ if (RExC_parse == name_start) {
+ RExC_parse++;
+ vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+ /*NOTREACHED*/
+ }
if (*RExC_parse != paren)
vFAIL2("Sequence (?%c... not terminated",
paren=='>' ? '<' : paren);
Perl_croak(aTHX_
"panic: paren_name hash element allocation failed");
} else if ( SvPOK(sv_dat) ) {
- IV count=SvIV(sv_dat);
- I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
- SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
- pv[count]=RExC_npar;
- SvIVX(sv_dat)++;
+ /* (?|...) can mean we have dupes so scan to check
+ its already been stored. Maybe a flag indicating
+ we are inside such a construct would be useful,
+ but the arrays are likely to be quite small, so
+ for now we punt -- dmq */
+ IV count = SvIV(sv_dat);
+ I32 *pv = (I32*)SvPVX(sv_dat);
+ IV i;
+ for ( i = 0 ; i < count ; i++ ) {
+ if ( pv[i] == RExC_npar ) {
+ count = 0;
+ break;
+ }
+ }
+ if ( count ) {
+ pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
+ SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
+ pv[count] = RExC_npar;
+ SvIVX(sv_dat)++;
+ }
} else {
(void)SvUPGRADE(sv_dat,SVt_PVNV);
sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
nextchar(pRExC_state);
return ret;
}
+ break;
+ case '|': /* (?|...) */
+ /* branch reset, behave like a (?:...) except that
+ buffers in alternations share the same numbers */
+ paren = ':';
+ after_freeze = freeze_paren = RExC_npar;
+ break;
case ':': /* (?:...) */
case '>': /* (?>...) */
break;
if (*RExC_parse != ')')
FAIL("Sequence (?R) not terminated");
ret = reg_node(pRExC_state, GOSTART);
+ *flagp |= POSTPONED;
nextchar(pRExC_state);
return ret;
/*notreached*/
Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
Set_Node_Offset(ret, parse_start); /* MJD */
+ *flagp |= POSTPONED;
nextchar(pRExC_state);
return ret;
} /* named and numeric backreferences */
/* NOT REACHED */
- case 'p': /* (?p...) */
- if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
- vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
- /* FALL THROUGH*/
case '?': /* (??...) */
is_logical = 1;
- if (*RExC_parse != '{')
- goto unknown;
+ if (*RExC_parse != '{') {
+ RExC_parse++;
+ vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+ /*NOTREACHED*/
+ }
+ *flagp |= POSTPONED;
paren = *RExC_parse++;
/* FALL THROUGH */
case '{': /* (?{...}) */
}
else
REGTAIL(pRExC_state, ret, ender);
+ RExC_size++; /* XXX WHY do we need this?!!
+ For large programs it seems to be required
+ but I can't figure out why. -- dmq*/
return ret;
}
else {
}
break;
case '-':
- if (flagsp == &negflags)
- goto unknown;
+ if (flagsp == &negflags) {
+ RExC_parse++;
+ vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
+ /*NOTREACHED*/
+ }
flagsp = &negflags;
wastedflags = 0; /* reset so (?g-c) warns twice */
break;
}
/*NOTREACHED*/
default:
- unknown:
RExC_parse++;
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
/*NOTREACHED*/
if (!SIZE_ONLY ){
if (!RExC_nestroot)
RExC_nestroot = parno;
- if (RExC_seen & REG_SEEN_RECURSE) {
+ if (RExC_seen & REG_SEEN_RECURSE
+ && !RExC_open_parens[parno-1])
+ {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
"Setting open paren #%"IVdf" to %d\n",
(IV)parno, REG_NODE_NUM(ret)));
}
else if (paren != '?') /* Not Conditional */
ret = br;
- *flagp |= flags & (SPSTART | HASWIDTH);
+ *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
lastbr = br;
while (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
if (SIZE_ONLY)
RExC_extralen += 2; /* Account for LONGJMP. */
nextchar(pRExC_state);
+ if (freeze_paren) {
+ if (RExC_npar > after_freeze)
+ after_freeze = RExC_npar;
+ RExC_npar = freeze_paren;
+ }
br = regbranch(pRExC_state, &flags, 0, depth+1);
if (br == NULL)
return(NULL);
REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
- if (flags&HASWIDTH)
- *flagp |= HASWIDTH;
- *flagp |= flags&SPSTART;
+ *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
}
if (have_branch || paren != ':') {
FAIL("Junk on end of regexp"); /* "Can't happen". */
/* NOTREACHED */
}
-
+ if (after_freeze)
+ RExC_npar = after_freeze;
return(ret);
}
I32 flags = 0, c = 0;
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("brnc");
+
if (first)
ret = NULL;
else {
}
else if (ret == NULL)
ret = latest;
- *flagp |= flags&HASWIDTH;
+ *flagp |= flags&(HASWIDTH|POSTPONED);
if (chain == NULL) /* First piece. */
*flagp |= flags&SPSTART;
else {
goto do_curly;
}
nest_check:
- if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
+ if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
vWARN3(RExC_parse,
"%.*s matches null string many times",
(int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
}
return(NULL);
}
- *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
+ *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
break;
case '|':
case ')':
case '#':
if (RExC_flags & RXf_PMf_EXTENDED) {
- while (RExC_parse < RExC_end && *RExC_parse != '\n')
- RExC_parse++;
- if (RExC_parse < RExC_end)
+ if ( reg_skipcomment( pRExC_state ) )
goto tryagain;
}
/* FALL THROUGH */
char * const oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite(p, RExC_end);
+ p = regwhite( pRExC_state, p );
switch (*p) {
case '^':
case '$':
ender = *p++;
break;
}
- if (RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite(p, RExC_end);
+ if ( RExC_flags & RXf_PMf_EXTENDED)
+ p = regwhite( pRExC_state, p );
if (UTF && FOLD) {
/* Prime the casefolded buffer. */
ender = toFOLD_uni(ender, tmpbuf, &foldlen);
}
- if (ISMULT2(p)) { /* Back off on ?+*. */
+ if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
else if (UTF) {
}
STATIC char *
-S_regwhite(char *p, const char *e)
+S_regwhite( RExC_state_t *pRExC_state, char *p )
{
+ const char *e = RExC_end;
while (p < e) {
if (isSPACE(*p))
++p;
else if (*p == '#') {
+ bool ended = 0;
do {
- p++;
- } while (p < e && *p != '\n');
+ if (*p++ == '\n') {
+ ended = 1;
+ break;
+ }
+ } while (p < e);
+ if (!ended)
+ RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
}
else
break;
return ret;
/****** !SIZE_ONLY AFTER HERE *********/
- if( stored == 1 && value < 256
+ if( stored == 1 && (value < 128 || (value < 256 && !UTF))
&& !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
) {
/* optimize single char class to an EXACT node
#undef _C_C_T_
+/* reg_skipcomment()
+
+ Absorbs an /x style # comments from the input stream.
+ Returns true if there is more text remaining in the stream.
+ Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
+ terminates the pattern without including a newline.
+
+ Note its the callers responsibility to ensure that we are
+ actually in /x mode
+
+*/
+
+STATIC bool
+S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+{
+ bool ended = 0;
+ while (RExC_parse < RExC_end)
+ if (*RExC_parse++ == '\n') {
+ ended = 1;
+ break;
+ }
+ if (!ended) {
+ /* we ran off the end of the pattern without ending
+ the comment, so we have to add an \n when wrapping */
+ RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
+ return 0;
+ } else
+ return 1;
+}
+
+/* nextchar()
+
+ Advance that parse position, and optionall absorbs
+ "whitespace" from the inputstream.
+
+ Without /x "whitespace" means (?#...) style comments only,
+ with /x this means (?#...) and # comments and whitespace proper.
+
+ Returns the RExC_parse point from BEFORE the scan occurs.
+
+ This is the /x friendly way of saying RExC_parse++.
+*/
+
STATIC char*
S_nextchar(pTHX_ RExC_state_t *pRExC_state)
{
continue;
}
else if (*RExC_parse == '#') {
- while (RExC_parse < RExC_end)
- if (*RExC_parse++ == '\n') break;
- continue;
+ if ( reg_skipcomment( pRExC_state ) )
+ continue;
}
}
return retval;
RExC_size += 1;
return(ret);
}
-#ifdef DEBUGGING
- if (OP(RExC_emit) == 255)
- Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
- reg_name[op], OP(RExC_emit));
-#endif
+ if (RExC_emit >= RExC_emit_bound)
+ Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
+
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE(ptr, op);
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
"reg_node", __LINE__,
- reg_name[op],
+ PL_reg_name[op],
(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
? "Overwriting end of array!\n" : "OK",
(UV)(RExC_emit - RExC_emit_start),
*/
return(ret);
}
-#ifdef DEBUGGING
- if (OP(RExC_emit) == 255)
- Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
-#endif
+ if (RExC_emit >= RExC_emit_bound)
+ Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
+
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE_ARG(ptr, op, arg);
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
"reganode",
__LINE__,
- reg_name[op],
+ PL_reg_name[op],
(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
"Overwriting end of array!\n" : "OK",
(UV)(RExC_emit - RExC_emit_start),
const int offset = regarglen[(U8)op];
const int size = NODE_STEP_REGNODE + offset;
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
- DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
+ DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
if (SIZE_ONLY) {
RExC_size += size;
return;
dst = RExC_emit;
if (RExC_open_parens) {
int paren;
- DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
+ /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
for ( paren=0 ; paren < RExC_npar ; paren++ ) {
if ( RExC_open_parens[paren] >= opnd ) {
- DEBUG_PARSE_FMT("open"," - %d",size);
+ /*DEBUG_PARSE_FMT("open"," - %d",size);*/
RExC_open_parens[paren] += size;
} else {
- DEBUG_PARSE_FMT("open"," - %s","ok");
+ /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
}
if ( RExC_close_parens[paren] >= opnd ) {
- DEBUG_PARSE_FMT("close"," - %d",size);
+ /*DEBUG_PARSE_FMT("close"," - %d",size);*/
RExC_close_parens[paren] += size;
} else {
- DEBUG_PARSE_FMT("close"," - %s","ok");
+ /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
}
}
}
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
"reg_insert",
__LINE__,
- reg_name[op],
+ PL_reg_name[op],
(UV)(dst - RExC_emit_start) > RExC_offsets[0]
? "Overwriting end of array!\n" : "OK",
(UV)(src - RExC_emit_start),
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
"reginsert",
__LINE__,
- reg_name[op],
+ PL_reg_name[op],
(UV)(place - RExC_emit_start) > RExC_offsets[0]
? "Overwriting end of array!\n" : "OK",
(UV)(place - RExC_emit_start),
PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
(temp == NULL ? "->" : ""),
- (temp == NULL ? reg_name[OP(val)] : "")
+ (temp == NULL ? PL_reg_name[OP(val)] : "")
);
});
if (temp == NULL)
PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
SvPV_nolen_const(mysv),
REG_NODE_NUM(scan),
- reg_name[exact]);
+ PL_reg_name[exact]);
});
if (temp == NULL)
break;
/* It would be nice to FAIL() here, but this may be called from
regexec.c, and it would be hard to supply pRExC_state. */
Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
- sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
+ sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
k = PL_regkind[OP(o)];
const reg_trie_data * const trie
= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
- Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
+ Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
DEBUG_TRIE_COMPILE_r(
Perl_sv_catpvf(aTHX_ sv,
"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
if (!r || (--r->refcnt > 0))
return;
-
- CALLREGFREE_PVT(r); /* free the private data */
-
- /* gcov results gave these as non-null 100% of the time, so there's no
- optimisation in checking them before calling Safefree */
- Safefree(r->precomp);
+ if (r->mother_re) {
+ ReREFCNT_dec(r->mother_re);
+ } else {
+ CALLREGFREE_PVT(r); /* free the private data */
+ if (r->paren_names)
+ SvREFCNT_dec(r->paren_names);
+ Safefree(r->wrapped);
+ }
+ if (r->substrs) {
+ if (r->anchored_substr)
+ SvREFCNT_dec(r->anchored_substr);
+ if (r->anchored_utf8)
+ SvREFCNT_dec(r->anchored_utf8);
+ if (r->float_substr)
+ SvREFCNT_dec(r->float_substr);
+ if (r->float_utf8)
+ SvREFCNT_dec(r->float_utf8);
+ Safefree(r->substrs);
+ }
RX_MATCH_COPY_FREE(r);
#ifdef PERL_OLD_COPY_ON_WRITE
if (r->saved_copy)
- SvREFCNT_dec(r->saved_copy);
+ SvREFCNT_dec(r->saved_copy);
#endif
- if (r->substrs) {
- if (r->anchored_substr)
- SvREFCNT_dec(r->anchored_substr);
- if (r->anchored_utf8)
- SvREFCNT_dec(r->anchored_utf8);
- if (r->float_substr)
- SvREFCNT_dec(r->float_substr);
- if (r->float_utf8)
- SvREFCNT_dec(r->float_utf8);
- Safefree(r->substrs);
+ if (r->swap) {
+ Safefree(r->swap->startp);
+ Safefree(r->swap);
}
- if (r->paren_names)
- SvREFCNT_dec(r->paren_names);
-
Safefree(r->startp);
- Safefree(r->endp);
Safefree(r);
}
+
+/* reg_temp_copy()
+
+ This is a hacky workaround to the structural issue of match results
+ being stored in the regexp structure which is in turn stored in
+ PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
+ could be PL_curpm in multiple contexts, and could require multiple
+ result sets being associated with the pattern simultaneously, such
+ as when doing a recursive match with (??{$qr})
+
+ The solution is to make a lightweight copy of the regexp structure
+ when a qr// is returned from the code executed by (??{$qr}) this
+ lightweight copy doesnt actually own any of its data except for
+ the starp/end and the actual regexp structure itself.
+
+*/
+
+
+regexp *
+Perl_reg_temp_copy (pTHX_ struct regexp *r) {
+ regexp *ret;
+ register const I32 npar = r->nparens+1;
+ (void)ReREFCNT_inc(r);
+ Newx(ret, 1, regexp);
+ StructCopy(r, ret, regexp);
+ Newx(ret->startp, npar * 2, I32);
+ Copy(r->startp, ret->startp, npar * 2, I32);
+ ret->endp = ret->startp + npar;
+ ret->refcnt = 1;
+ if (r->substrs) {
+ Newx(ret->substrs, 1, struct reg_substr_data);
+ StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
+
+ SvREFCNT_inc_void(ret->anchored_substr);
+ SvREFCNT_inc_void(ret->anchored_utf8);
+ SvREFCNT_inc_void(ret->float_substr);
+ SvREFCNT_inc_void(ret->float_utf8);
+
+ /* check_substr and check_utf8, if non-NULL, point to either their
+ anchored or float namesakes, and don't hold a second reference. */
+ }
+ RX_MATCH_COPIED_off(ret);
+#ifdef PERL_OLD_COPY_ON_WRITE
+ /* this is broken. */
+ assert(0);
+ if (ret->saved_copy)
+ ret->saved_copy=NULL;
+#endif
+ ret->mother_re = r;
+ ret->swap = NULL;
+
+ return ret;
+}
#endif
/* regfree_internal()
Safefree(ri->data->what);
Safefree(ri->data);
}
- if (ri->swap) {
- Safefree(ri->swap->startp);
- Safefree(ri->swap->endp);
- Safefree(ri->swap);
- }
+
Safefree(ri);
}
#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
/*
- regdupe - duplicate a regexp.
-
- This routine is called by sv.c's re_dup and is expected to clone a
- given regexp structure. It is a no-op when not under USE_ITHREADS.
- (Originally this *was* re_dup() for change history see sv.c)
+ re_dup - duplicate a regexp.
+ This routine is expected to clone a given regexp structure. It is not
+ compiler under USE_ITHREADS.
+
After all of the core data stored in struct regexp is duplicated
the regexp_engine.dupe method is used to copy any private data
stored in the *pprivate pointer. This allows extensions to handle
{
dVAR;
regexp *ret;
- int i, npar;
- struct reg_substr_datum *s;
+ I32 npar;
if (!r)
return (REGEXP *)NULL;
npar = r->nparens+1;
- Newxz(ret, 1, regexp);
- Newx(ret->startp, npar, I32);
- Copy(r->startp, ret->startp, npar, I32);
- Newx(ret->endp, npar, I32);
- Copy(r->endp, ret->endp, npar, I32);
+ Newx(ret, 1, regexp);
+ StructCopy(r, ret, regexp);
+ Newx(ret->startp, npar * 2, I32);
+ Copy(r->startp, ret->startp, npar * 2, I32);
+ ret->endp = ret->startp + npar;
+ if(ret->swap) {
+ Newx(ret->swap, 1, regexp_paren_ofs);
+ /* no need to copy these */
+ Newx(ret->swap->startp, npar * 2, I32);
+ ret->swap->endp = ret->swap->startp + npar;
+ }
- if (r->substrs) {
+ if (ret->substrs) {
+ /* Do it this way to avoid reading from *r after the StructCopy().
+ That way, if any of the sv_dup_inc()s dislodge *r from the L1
+ cache, it doesn't matter. */
+ const bool anchored = r->check_substr == r->anchored_substr;
Newx(ret->substrs, 1, struct reg_substr_data);
- for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
- s->min_offset = r->substrs->data[i].min_offset;
- s->max_offset = r->substrs->data[i].max_offset;
- s->end_shift = r->substrs->data[i].end_shift;
- s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
- s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
- }
- } else
- ret->substrs = NULL;
-
- ret->precomp = SAVEPVN(r->precomp, r->prelen);
- ret->refcnt = r->refcnt;
- ret->minlen = r->minlen;
- ret->minlenret = r->minlenret;
- ret->prelen = r->prelen;
- ret->nparens = r->nparens;
- ret->lastparen = r->lastparen;
- ret->lastcloseparen = r->lastcloseparen;
- ret->intflags = r->intflags;
- ret->extflags = r->extflags;
-
- ret->sublen = r->sublen;
-
- ret->engine = r->engine;
-
- ret->paren_names = hv_dup_inc(r->paren_names, param);
+ StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
+
+ ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
+ ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
+ ret->float_substr = sv_dup_inc(ret->float_substr, param);
+ ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
+
+ /* check_substr and check_utf8, if non-NULL, point to either their
+ anchored or float namesakes, and don't hold a second reference. */
+
+ if (ret->check_substr) {
+ if (anchored) {
+ assert(r->check_utf8 == r->anchored_utf8);
+ ret->check_substr = ret->anchored_substr;
+ ret->check_utf8 = ret->anchored_utf8;
+ } else {
+ assert(r->check_substr == r->float_substr);
+ assert(r->check_utf8 == r->float_utf8);
+ ret->check_substr = ret->float_substr;
+ ret->check_utf8 = ret->float_utf8;
+ }
+ }
+ }
+
+ ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1);
+ ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped);
+ ret->paren_names = hv_dup_inc(ret->paren_names, param);
+
+ if (ret->pprivate)
+ RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
if (RX_MATCH_COPIED(ret))
- ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
+ ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
else
ret->subbeg = NULL;
#ifdef PERL_OLD_COPY_ON_WRITE
ret->saved_copy = NULL;
#endif
-
- ret->pprivate = r->pprivate;
- if (ret->pprivate)
- RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
+
+ ret->mother_re = NULL;
+ ret->gofs = 0;
+ ret->seen_evals = 0;
ptr_table_store(PL_ptr_table, r, ret);
return ret;
Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
Copy(ri->program, reti->program, len+1, regnode);
- if(ri->swap) {
- Newx(reti->swap, 1, regexp_paren_ofs);
- /* no need to copy these */
- Newx(reti->swap->startp, npar, I32);
- Newx(reti->swap->endp, npar, I32);
- } else {
- reti->swap = NULL;
- }
-
reti->regstclass = NULL;
+
if (ri->data) {
struct reg_data *d;
const int count = ri->data->count;
*/
#ifndef PERL_IN_XSUB_RE
+
char *
Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
dVAR;
const regexp * const re = (regexp *)mg->mg_obj;
-
- if (!mg->mg_ptr) {
- const char *fptr = STD_PAT_MODS; /*"msix"*/
- char reflags[7];
- char ch;
- bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
- bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
- U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12);
- bool need_newline = 0;
- int left = 0;
- int right = 4 + hask;
- if (hask)
- reflags[left++] = KEEPCOPY_PAT_MOD; /*'k'*/
- while((ch = *fptr++)) {
- if(reganch & 1) {
- reflags[left++] = ch;
- }
- else {
- reflags[right--] = ch;
- }
- reganch >>= 1;
- }
- if(hasm) {
- reflags[left] = '-';
- left = 5 + hask;
- }
- /* printf("[%*.7s]\n",left,reflags); */
- mg->mg_len = re->prelen + 4 + left;
- /*
- * If /x was used, we have to worry about a regex ending with a
- * comment later being embedded within another regex. If so, we don't
- * want this regex's "commentization" to leak out to the right part of
- * the enclosing regex, we must cap it with a newline.
- *
- * So, if /x was used, we scan backwards from the end of the regex. If
- * we find a '#' before we find a newline, we need to add a newline
- * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
- * we don't need to add anything. -jfriedl
- */
- if (PMf_EXTENDED & re->extflags) {
- const char *endptr = re->precomp + re->prelen;
- while (endptr >= re->precomp) {
- const char c = *(endptr--);
- if (c == '\n')
- break; /* don't need another */
- if (c == '#') {
- /* we end while in a comment, so we need a newline */
- mg->mg_len++; /* save space for it */
- need_newline = 1; /* note to add it */
- break;
- }
- }
- }
-
- Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
- mg->mg_ptr[0] = '(';
- mg->mg_ptr[1] = '?';
- Copy(reflags, mg->mg_ptr+2, left, char);
- *(mg->mg_ptr+left+2) = ':';
- Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
- if (need_newline)
- mg->mg_ptr[mg->mg_len - 2] = '\n';
- mg->mg_ptr[mg->mg_len - 1] = ')';
- mg->mg_ptr[mg->mg_len] = 0;
- }
if (haseval)
*haseval = re->seen_evals;
if (flags)
*flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
-
if (lp)
- *lp = mg->mg_len;
- return mg->mg_ptr;
+ *lp = re->wraplen;
+ return re->wrapped;
}
/*