U16 flags16; /* are we folding, multilining? */
char *precomp; /* uncompiled string. */
regexp *rx;
+ char *start; /* Start of input for compile */
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
I32 whilem_seen; /* number of WHILEM in this expr */
+ regnode *emit_start; /* Start of emitted-code area */
regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
#define RExC_flags16 (pRExC_state->flags16)
#define RExC_precomp (pRExC_state->precomp)
#define RExC_rx (pRExC_state->rx)
+#define RExC_start (pRExC_state->start)
#define RExC_end (pRExC_state->end)
#define RExC_parse (pRExC_state->parse)
#define RExC_whilem_seen (pRExC_state->whilem_seen)
+#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
#define RExC_emit (pRExC_state->emit)
+#define RExC_emit_start (pRExC_state->emit_start)
#define RExC_naughty (pRExC_state->naughty)
#define RExC_sawback (pRExC_state->sawback)
#define RExC_seen (pRExC_state->seen)
/* Allow for side effects in s */
#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
+/* Macros for recording node offsets. 20001227 mjd@plover.com
+ * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
+ * element 2*n-1 of the array. Element #2n holds the byte length node #n.
+ * Element 0 holds the number n.
+ */
+
+#define MJD_OFFSET_DEBUG(x)
+/* #define MJD_OFFSET_DEBUG(x) fprintf x */
+
+
+# define Set_Node_Offset_To_R(node,byte) \
+ STMT_START { \
+ if (! SIZE_ONLY) { \
+ if((node) < 0) { \
+ Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
+ } else { \
+ RExC_offsets[2*(node)-1] = (byte); \
+ } \
+ } \
+ } STMT_END
+
+# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
+# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
+
+# define Set_Node_Length_To_R(node,len) \
+ STMT_START { \
+ if (! SIZE_ONLY) { \
+ MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
+ if((node) < 0) { \
+ Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
+ } else { \
+ RExC_offsets[2*(node)] = (len); \
+ } \
+ } \
+ } STMT_END
+
+# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
+# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
+# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
+
+/* Get offsets and lengths */
+#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
+#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
+
static void clear_re(pTHXo_ void *r);
/* Mark that we cannot extend a found fixed substring at this point.
/* First pass: determine size, legality. */
RExC_parse = exp;
+ RExC_start = exp;
RExC_end = xend;
RExC_naughty = 0;
RExC_npar = 1;
r->startp = 0; /* Useful during FAIL. */
r->endp = 0; /* Useful during FAIL. */
+ Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
+ if (r->offsets) {
+ r->offsets[0] = RExC_size;
+ }
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "%s %u bytes for offset annotations.\n",
+ r->offsets ? "Got" : "Couldn't get",
+ (2*RExC_size+1) * sizeof(U32)));
+
RExC_rx = r;
/* Second pass: emit code. */
RExC_end = xend;
RExC_naughty = 0;
RExC_npar = 1;
+ RExC_emit_start = r->program;
RExC_emit = r->program;
/* Store the count of eval-groups for security checks: */
RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
register regnode *ender = 0;
register I32 parno = 0;
I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
+ char * parse_start = RExC_parse; /* MJD */
char *oregcomp_parse = RExC_parse;
char c;
/* Make an OPEN node, if parenthesized. */
if (paren) {
- if (*RExC_parse == '?') {
+ if (*RExC_parse == '?') { /* (?...) */
U16 posflags = 0, negflags = 0;
U16 *flagsp = &posflags;
int logical = 0;
paren = *RExC_parse++;
ret = NULL; /* For look-ahead/behind. */
switch (paren) {
- case '<':
+ case '<': /* (?<...) */
RExC_seen |= REG_SEEN_LOOKBEHIND;
if (*RExC_parse == '!')
paren = ',';
if (*RExC_parse != '=' && *RExC_parse != '!')
goto unknown;
RExC_parse++;
- case '=':
- case '!':
+ case '=': /* (?=...) */
+ case '!': /* (?!...) */
RExC_seen_zerolen++;
- case ':':
- case '>':
+ case ':': /* (?:...) */
+ case '>': /* (?>...) */
break;
- case '$':
- case '@':
+ case '$': /* (?$...) */
+ case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
- case '#':
+ case '#': /* (?#...) */
while (*RExC_parse && *RExC_parse != ')')
RExC_parse++;
if (*RExC_parse != ')')
nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
- case 'p':
+ case 'p': /* (?p...) */
if (SIZE_ONLY)
vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
/* FALL THROUGH*/
- case '?':
+ case '?': /* (??...) */
logical = 1;
paren = *RExC_parse++;
/* FALL THROUGH */
- case '{':
+ case '{': /* (?{...}) */
{
I32 count = 1, n = 0;
char c;
if (!SIZE_ONLY)
ret->flags = 2;
regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
+ /* deal with the length of this later - MJD */
return ret;
}
return reganode(pRExC_state, EVAL, n);
}
- case '(':
+ case '(': /* (?(?{...})...) and (?(?=...)...) */
{
- if (RExC_parse[0] == '?') {
+ if (RExC_parse[0] == '?') { /* (?(?...)) */
if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
|| RExC_parse[1] == '<'
|| RExC_parse[1] == '{') { /* Lookahead or eval. */
}
}
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ /* (?(1)...) */
parno = atoi(RExC_parse++);
while (isDIGIT(*RExC_parse))
RExC_parse++;
- ret = reganode(pRExC_state, GROUPP, parno);
+ ret = reganode(pRExC_state, GROUPP, parno);
+
if ((c = *nextchar(pRExC_state)) != ')')
vFAIL("Switch condition not recognized");
insert_if:
break;
default:
--RExC_parse;
- parse_flags:
+ parse_flags: /* (?i) */
while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
if (*RExC_parse != 'o')
pmflag(flagsp, *RExC_parse);
return NULL;
}
}
- else {
+ else { /* (...) */
parno = RExC_npar;
RExC_npar++;
ret = reganode(pRExC_state, OPEN, parno);
+ Set_Node_Length(ret, 1); /* MJD */
+ Set_Node_Offset(ret, RExC_parse); /* MJD */
open = 1;
}
}
- else
+ else /* ! paren */
ret = NULL;
/* Pick up the branches, linking them together. */
+ parse_start = RExC_parse; /* MJD */
br = regbranch(pRExC_state, &flags, 1);
+ /* branch_len = (paren != 0); */
+
if (br == NULL)
return(NULL);
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
reginsert(pRExC_state, BRANCHJ, br);
}
- else
+ else { /* MJD */
reginsert(pRExC_state, BRANCH, br);
+ Set_Node_Length(br, paren != 0);
+ Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
+ }
have_branch = 1;
if (SIZE_ONLY)
RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
RExC_extralen += 2; /* Account for LONGJMP. */
nextchar(pRExC_state);
br = regbranch(pRExC_state, &flags, 0);
+
if (br == NULL)
return(NULL);
regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
break;
case 1:
ender = reganode(pRExC_state, CLOSE, parno);
+ Set_Node_Offset(ender,RExC_parse+1); /* MJD */
+ Set_Node_Length(ender,1); /* MJD */
break;
case '<':
case ',':
else {
if (!SIZE_ONLY && RExC_extralen)
ret = reganode(pRExC_state, BRANCHJ,0);
- else
+ else {
ret = reg_node(pRExC_state, BRANCH);
+ Set_Node_Length(ret, 1);
+ }
}
if (!first && SIZE_ONLY)
char *maxpos;
I32 min;
I32 max = REG_INFTY;
+ char *parse_start;
ret = regatom(pRExC_state, &flags);
if (ret == NULL) {
op = *RExC_parse;
if (op == '{' && regcurly(RExC_parse)) {
+ parse_start = RExC_parse; /* MJD */
next = RExC_parse + 1;
maxpos = Nullch;
while (isDIGIT(*next) || *next == ',') {
if ((flags&SIMPLE)) {
RExC_naughty += 2 + RExC_naughty / 2;
reginsert(pRExC_state, CURLY, ret);
+ Set_Node_Offset(ret, parse_start+1); /* MJD */
+ Set_Node_Cur_Length(ret);
}
else {
regnode *w = reg_node(pRExC_state, WHILEM);
NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
}
reginsert(pRExC_state, CURLYX,ret);
+ /* MJD hk */
+ Set_Node_Offset(ret, parse_start+1);
+ Set_Node_Length(ret,
+ op == '{' ? (RExC_parse - parse_start) : 1);
+
if (!SIZE_ONLY && RExC_extralen)
NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
vFAIL("Regexp *+ operand could be empty");
#endif
+ parse_start = RExC_parse;
nextchar(pRExC_state);
*flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
{
register regnode *ret = 0;
I32 flags;
+ char *parse_start = RExC_parse;
*flagp = WORST; /* Tentatively. */
ret = reg_node(pRExC_state, SBOL);
else
ret = reg_node(pRExC_state, BOL);
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '$':
nextchar(pRExC_state);
ret = reg_node(pRExC_state, SEOL);
else
ret = reg_node(pRExC_state, EOL);
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '.':
nextchar(pRExC_state);
ret = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '[':
{
}
nextchar(pRExC_state);
*flagp |= HASWIDTH|SIMPLE;
+ Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
break;
}
case '(':
ret = reg_node(pRExC_state, SBOL);
*flagp |= SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'G':
ret = reg_node(pRExC_state, GPOS);
RExC_seen |= REG_SEEN_GPOS;
*flagp |= SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'Z':
ret = reg_node(pRExC_state, SEOL);
*flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'C':
ret = reg_node(pRExC_state, SANY);
RExC_seen |= REG_SEEN_SANY;
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'X':
ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'w':
ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'W':
ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'b':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'B':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 's':
ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'S':
ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'd':
ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'D':
ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'p':
case 'P':
{ /* a lovely hack--pretend we saw [\pX] instead */
char* oldregxend = RExC_end;
+ char* parse_start = RExC_parse;
if (RExC_parse[1] == '{') {
RExC_end = strchr(RExC_parse, '}');
RExC_end = oldregxend;
RExC_parse--;
+ Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
*flagp |= HASWIDTH|SIMPLE;
}
if (num > 9 && num >= RExC_npar)
goto defchar;
else {
+ char * parse_start = RExC_parse - 1; /* MJD */
while (isDIGIT(*RExC_parse))
RExC_parse++;
? (LOC ? REFFL : REFF)
: REF, num);
*flagp |= HASWIDTH;
+
+ /* override incorrect value set in reganode MJD */
+ Set_Node_Offset(ret, parse_start+1);
+ Set_Node_Cur_Length(ret); /* MJD */
RExC_parse--;
nextchar(pRExC_state);
}
register char *p;
char *oldp, *s;
STRLEN numlen;
+ char *parse_start = RExC_parse - 1;
RExC_parse++;
}
loopdone:
RExC_parse = p - 1;
+ Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
{
/* len is STRLEN which is unsigned, need to copy to signed */
bool need_class = 0;
SV *listsv;
register char *e;
+ char *parse_start = RExC_parse; /* MJD */
UV n;
bool dont_optimize_invert = FALSE;
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE(ptr, op);
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
+ "reg_node", __LINE__,
+ reg_name[op],
+ RExC_emit - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ RExC_emit - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
+ }
+
RExC_emit = ptr;
return(ret);
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE_ARG(ptr, op, arg);
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
+ "reganode",
+ RExC_emit - RExC_emit_start > RExC_offsets[0] ?
+ "Overwriting end of array!\n" : "OK",
+ RExC_emit - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Cur_Node_Offset;
+ }
+
RExC_emit = ptr;
return(ret);
src = RExC_emit;
RExC_emit += NODE_STEP_REGNODE + offset;
dst = RExC_emit;
- while (src > opnd)
+ while (src > opnd) {
StructCopy(--src, --dst, regnode);
+ if (RExC_offsets) { /* MJD 20010112 */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
+ "reg_insert",
+ dst - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ src - RExC_emit_start,
+ dst - RExC_emit_start,
+ RExC_offsets[0]));
+ Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
+ Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
+ }
+ }
+
place = opnd; /* Op node, where operand used to be. */
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
+ "reginsert",
+ place - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ place - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Node_Offset(place, RExC_parse);
+ }
src = NEXTOPER(place);
FILL_ADVANCE_NODE(place, op);
Zero(src, offset, regnode);
if (r->reganch & ROPT_EVAL_SEEN)
PerlIO_printf(Perl_debug_log, "with eval ");
PerlIO_printf(Perl_debug_log, "\n");
+ if (r->offsets) {
+ U32 i;
+ U32 len = r->offsets[0];
+ PerlIO_printf(Perl_debug_log, "Offsets: [%u]\n\t", r->offsets[0]);
+ for (i = 1; i <= len; i++)
+ PerlIO_printf(Perl_debug_log, "%u[%u] ",
+ r->offsets[i*2-1],
+ r->offsets[i*2]);
+ PerlIO_printf(Perl_debug_log, "\n");
+ }
#endif /* DEBUGGING */
}
if (r->precomp)
Safefree(r->precomp);
+ if (r->offsets) /* 20010421 MJD */
+ Safefree(r->offsets);
if (RX_MATCH_COPIED(r))
Safefree(r->subbeg);
if (r->substrs) {