#define PERL_IN_REGCOMP_C
#include "perl.h"
-#ifdef PERL_IN_XSUB_RE
-# if defined(PERL_CAPI) || defined(PERL_OBJECT)
-# include "XSUB.h"
-# endif
-#else
+#ifndef PERL_IN_XSUB_RE
# include "INTERN.h"
#endif
U16 flags16; /* are we folding, multilining? */
char *precomp; /* uncompiled string. */
regexp *rx;
+ char *start; /* Start of input for compile */
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
I32 whilem_seen; /* number of WHILEM in this expr */
+ regnode *emit_start; /* Start of emitted-code area */
regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
#define RExC_flags16 (pRExC_state->flags16)
#define RExC_precomp (pRExC_state->precomp)
#define RExC_rx (pRExC_state->rx)
+#define RExC_start (pRExC_state->start)
#define RExC_end (pRExC_state->end)
#define RExC_parse (pRExC_state->parse)
#define RExC_whilem_seen (pRExC_state->whilem_seen)
+#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
#define RExC_emit (pRExC_state->emit)
+#define RExC_emit_start (pRExC_state->emit_start)
#define RExC_naughty (pRExC_state->naughty)
#define RExC_sawback (pRExC_state->sawback)
#define RExC_seen (pRExC_state->seen)
* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
* op/pragma/warn/regcomp.
*/
-#define MARKER1 "HERE" /* marker as it appears in the description */
-#define MARKER2 " << HERE " /* marker as it appears within the regex */
+#define MARKER1 "<-- HERE" /* marker as it appears in the description */
+#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
-#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
+#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
/*
* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
#define FAIL(msg) \
STMT_START { \
char *ellipses = ""; \
- unsigned len = strlen(RExC_precomp); \
+ IV len = RExC_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
#define FAIL2(pat,msg) \
STMT_START { \
char *ellipses = ""; \
- unsigned len = strlen(RExC_precomp); \
+ IV len = RExC_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
*/
#define Simple_vFAIL(m) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL2(m,a1) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL3(m, a1, a2) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL4(m, a1, a2, a3) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL5(m, a1, a2, a3, a4) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN(loc,m) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END \
+#define vWARNdep(loc,m) \
+ STMT_START { \
+ IV offset = loc - RExC_precomp; \
+ int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
+ Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
+ m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ } STMT_END \
+
#define vWARN2(loc, m, a1) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
a1, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
#define vWARN3(loc, m, a1, a2) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
a1, a2, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
#define vWARN4(loc, m, a1, a2, a3) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
a1, a2, a3, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+/* used for the parse_flags section for (?c) -- japhy */
+#define vWARN5(loc, m, a1, a2, a3, a4) \
+ STMT_START { \
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
+ a1, a2, a3, a4, \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
+ } STMT_END
+
/* Allow for side effects in s */
-#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
+#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
+
+/* Macros for recording node offsets. 20001227 mjd@plover.com
+ * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
+ * element 2*n-1 of the array. Element #2n holds the byte length node #n.
+ * Element 0 holds the number n.
+ */
-static void clear_re(pTHXo_ void *r);
+#define MJD_OFFSET_DEBUG(x)
+/* #define MJD_OFFSET_DEBUG(x) fprintf x */
+
+
+# define Set_Node_Offset_To_R(node,byte) \
+ STMT_START { \
+ if (! SIZE_ONLY) { \
+ if((node) < 0) { \
+ Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
+ } else { \
+ RExC_offsets[2*(node)-1] = (byte); \
+ } \
+ } \
+ } STMT_END
+
+# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
+# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
+
+# define Set_Node_Length_To_R(node,len) \
+ STMT_START { \
+ if (! SIZE_ONLY) { \
+ MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
+ if((node) < 0) { \
+ Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
+ } else { \
+ RExC_offsets[2*(node)] = (len); \
+ } \
+ } \
+ } STMT_END
+
+# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
+# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
+# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
+
+/* Get offsets and lengths */
+#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
+#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
+
+static void clear_re(pTHX_ void *r);
/* Mark that we cannot extend a found fixed substring at this point.
Updata the longest found anchored substring and the longest found
int compat = 1;
if (uc >= 0x100 ||
- !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
&& (!(data->start_class->flags & ANYOF_FOLD)
|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+ )
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
int compat = 1;
if (uc >= 0x100 ||
- !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
- && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
+ && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
flags &= ~SCF_DO_STCLASS;
}
else if (strchr((char*)PL_varies,OP(scan))) {
- I32 mincount, maxcount, minnext, deltanext, fl;
+ I32 mincount, maxcount, minnext, deltanext, fl = 0;
I32 f = flags, pos_before = 0;
regnode *oscan = scan;
struct regnode_charclass_class this_class;
struct regnode_charclass_class *oclass = NULL;
+ I32 next_is_eval = 0;
switch (PL_regkind[(U8)OP(scan)]) {
case WHILEM: /* End of (?:...)* . */
scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
}
scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
+ next_is_eval = (OP(scan) == EVAL);
do_curly:
if (flags & SCF_DO_SUBSTR) {
if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
- && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
+ if (ckWARN(WARN_REGEXP)
+ /* ? quantifier ok, except for (?{ ... }) */
+ && (next_is_eval || !(mincount == 0 && maxcount == 1))
+ && (minnext == 0) && (deltanext == 0)
+ && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
{
vWARN(RExC_parse,
&& !deltanext && minnext == 1 ) {
/* Try to optimize to CURLYN. */
regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
- regnode *nxt1 = nxt, *nxt2;
+ regnode *nxt1 = nxt;
+#ifdef DEBUGGING
+ regnode *nxt2;
+#endif
/* Skip open. */
nxt = regnext(nxt);
&& !(PL_regkind[(U8)OP(nxt)] == EXACT
&& STR_LEN(nxt) == 1))
goto nogo;
+#ifdef DEBUGGING
nxt2 = nxt;
+#endif
nxt = regnext(nxt);
if (OP(nxt) != CLOSE)
goto nogo;
}
}
else if (strchr((char*)PL_simple,OP(scan))) {
- int value;
+ int value = 0;
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state,data);
else
RExC_utf8 = 0;
- RExC_precomp = savepvn(exp, xend - exp);
+ RExC_precomp = exp;
DEBUG_r(if (!PL_colorset) reginitcolors());
DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
PL_colors[4],PL_colors[5],PL_colors[0],
/* First pass: determine size, legality. */
RExC_parse = exp;
+ RExC_start = exp;
RExC_end = xend;
RExC_naughty = 0;
RExC_npar = 1;
REGC((U8)REG_MAGIC, (char*)RExC_emit);
#endif
if (reg(pRExC_state, 0, &flags) == NULL) {
- Safefree(RExC_precomp);
RExC_precomp = Nullch;
return(NULL);
}
#endif
r->refcnt = 1;
r->prelen = xend - exp;
- r->precomp = RExC_precomp;
+ r->precomp = savepvn(RExC_precomp, r->prelen);
r->subbeg = NULL;
r->reganch = pm->op_pmflags & PMf_COMPILETIME;
r->nparens = RExC_npar - 1; /* set early to validate backrefs */
r->startp = 0; /* Useful during FAIL. */
r->endp = 0; /* Useful during FAIL. */
+ Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
+ if (r->offsets) {
+ r->offsets[0] = RExC_size;
+ }
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "%s %"UVuf" bytes for offset annotations.\n",
+ r->offsets ? "Got" : "Couldn't get",
+ (UV)((2*RExC_size+1) * sizeof(U32))));
+
RExC_rx = r;
/* Second pass: emit code. */
+ RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
RExC_parse = exp;
RExC_end = xend;
RExC_naughty = 0;
RExC_npar = 1;
+ RExC_emit_start = r->program;
RExC_emit = r->program;
/* Store the count of eval-groups for security checks: */
RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class)) {
- SV *sv;
I32 n = add_data(pRExC_state, 1, "f");
New(1006, RExC_rx->data->data[n], 1,
r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
PL_regdata = r->data; /* for regprop() */
- DEBUG_r((sv = sv_newmortal(),
- regprop(sv, (regnode*)data.start_class),
- PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
- SvPVX(sv))));
+ DEBUG_r({ SV *sv = sv_newmortal();
+ regprop(sv, (regnode*)data.start_class);
+ PerlIO_printf(Perl_debug_log,
+ "synthetic stclass `%s'.\n",
+ SvPVX(sv));});
}
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
if (!(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class)) {
- SV *sv;
I32 n = add_data(pRExC_state, 1, "f");
New(1006, RExC_rx->data->data[n], 1,
struct regnode_charclass_class);
r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
- DEBUG_r((sv = sv_newmortal(),
- regprop(sv, (regnode*)data.start_class),
- PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
- SvPVX(sv))));
+ DEBUG_r({ SV* sv = sv_newmortal();
+ regprop(sv, (regnode*)data.start_class);
+ PerlIO_printf(Perl_debug_log,
+ "synthetic stclass `%s'.\n",
+ SvPVX(sv));});
}
}
r->reganch |= ROPT_LOOKBEHIND_SEEN;
if (RExC_seen & REG_SEEN_EVAL)
r->reganch |= ROPT_EVAL_SEEN;
- if (RExC_seen & REG_SEEN_SANY)
- r->reganch |= ROPT_SANY_SEEN;
+ if (RExC_seen & REG_SEEN_CANY)
+ r->reganch |= ROPT_CANY_SEEN;
Newz(1002, r->startp, RExC_npar, I32);
Newz(1002, r->endp, RExC_npar, I32);
PL_regdata = r->data; /* for regprop() */
register regnode *ender = 0;
register I32 parno = 0;
I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
+
+ /* for (?g), (?gc), and (?o) warnings; warning
+ about (?c) will warn about (?g) -- japhy */
+
+ I32 wastedflags = 0x00,
+ wasted_o = 0x01,
+ wasted_g = 0x02,
+ wasted_gc = 0x02 | 0x04,
+ wasted_c = 0x04;
+
+ char * parse_start = RExC_parse; /* MJD */
char *oregcomp_parse = RExC_parse;
char c;
*flagp = 0; /* Tentatively. */
+
/* Make an OPEN node, if parenthesized. */
if (paren) {
- if (*RExC_parse == '?') {
+ if (*RExC_parse == '?') { /* (?...) */
U16 posflags = 0, negflags = 0;
U16 *flagsp = &posflags;
int logical = 0;
paren = *RExC_parse++;
ret = NULL; /* For look-ahead/behind. */
switch (paren) {
- case '<':
+ case '<': /* (?<...) */
RExC_seen |= REG_SEEN_LOOKBEHIND;
if (*RExC_parse == '!')
paren = ',';
if (*RExC_parse != '=' && *RExC_parse != '!')
goto unknown;
RExC_parse++;
- case '=':
- case '!':
+ case '=': /* (?=...) */
+ case '!': /* (?!...) */
RExC_seen_zerolen++;
- case ':':
- case '>':
+ case ':': /* (?:...) */
+ case '>': /* (?>...) */
break;
- case '$':
- case '@':
+ case '$': /* (?$...) */
+ case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
- case '#':
+ case '#': /* (?#...) */
while (*RExC_parse && *RExC_parse != ')')
RExC_parse++;
if (*RExC_parse != ')')
nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
- case 'p':
- if (SIZE_ONLY)
- vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
+ case 'p': /* (?p...) */
+ if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
+ vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
/* FALL THROUGH*/
- case '?':
+ case '?': /* (??...) */
logical = 1;
paren = *RExC_parse++;
/* FALL THROUGH */
- case '{':
+ case '{': /* (?{...}) */
{
I32 count = 1, n = 0;
char c;
ENTER;
Perl_save_re_context(aTHX);
rop = sv_compile_2op(sv, &sop, "re", &av);
+ sop->op_private |= OPpREFCOUNTED;
+ /* re_dup will OpREFCNT_inc */
+ OpREFCNT_set(sop, 1);
LEAVE;
n = add_data(pRExC_state, 3, "nop");
/* No compiled RE interpolated, has runtime
components ===> unsafe. */
FAIL("Eval-group not allowed at runtime, use re 'eval'");
- if (PL_tainted)
+ if (PL_tainting && PL_tainted)
FAIL("Eval-group in insecure regular expression");
}
if (!SIZE_ONLY)
ret->flags = 2;
regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
+ /* deal with the length of this later - MJD */
return ret;
}
return reganode(pRExC_state, EVAL, n);
}
- case '(':
+ case '(': /* (?(?{...})...) and (?(?=...)...) */
{
- if (RExC_parse[0] == '?') {
+ if (RExC_parse[0] == '?') { /* (?(?...)) */
if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
|| RExC_parse[1] == '<'
|| RExC_parse[1] == '{') { /* Lookahead or eval. */
}
}
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ /* (?(1)...) */
parno = atoi(RExC_parse++);
while (isDIGIT(*RExC_parse))
RExC_parse++;
- ret = reganode(pRExC_state, GROUPP, parno);
+ ret = reganode(pRExC_state, GROUPP, parno);
+
if ((c = *nextchar(pRExC_state)) != ')')
vFAIL("Switch condition not recognized");
insert_if:
break;
default:
--RExC_parse;
- parse_flags:
+ parse_flags: /* (?i) */
while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
- if (*RExC_parse != 'o')
- pmflag(flagsp, *RExC_parse);
+ /* (?g), (?gc) and (?o) are useless here
+ and must be globally applied -- japhy */
+
+ if (*RExC_parse == 'o' || *RExC_parse == 'g') {
+ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+ I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
+ if (! (wastedflags & wflagbit) ) {
+ wastedflags |= wflagbit;
+ vWARN5(
+ RExC_parse + 1,
+ "Useless (%s%c) - %suse /%c modifier",
+ flagsp == &negflags ? "?-" : "?",
+ *RExC_parse,
+ flagsp == &negflags ? "don't " : "",
+ *RExC_parse
+ );
+ }
+ }
+ }
+ else if (*RExC_parse == 'c') {
+ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+ if (! (wastedflags & wasted_c) ) {
+ wastedflags |= wasted_gc;
+ vWARN3(
+ RExC_parse + 1,
+ "Useless (%sc) - %suse /gc modifier",
+ flagsp == &negflags ? "?-" : "?",
+ flagsp == &negflags ? "don't " : ""
+ );
+ }
+ }
+ }
+ else { pmflag(flagsp, *RExC_parse); }
+
++RExC_parse;
}
if (*RExC_parse == '-') {
flagsp = &negflags;
+ wastedflags = 0; /* reset so (?g-c) warns twice */
++RExC_parse;
goto parse_flags;
}
return NULL;
}
}
- else {
+ else { /* (...) */
parno = RExC_npar;
RExC_npar++;
ret = reganode(pRExC_state, OPEN, parno);
+ Set_Node_Length(ret, 1); /* MJD */
+ Set_Node_Offset(ret, RExC_parse); /* MJD */
open = 1;
}
}
- else
+ else /* ! paren */
ret = NULL;
/* Pick up the branches, linking them together. */
+ parse_start = RExC_parse; /* MJD */
br = regbranch(pRExC_state, &flags, 1);
+ /* branch_len = (paren != 0); */
+
if (br == NULL)
return(NULL);
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
reginsert(pRExC_state, BRANCHJ, br);
}
- else
+ else { /* MJD */
reginsert(pRExC_state, BRANCH, br);
+ Set_Node_Length(br, paren != 0);
+ Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
+ }
have_branch = 1;
if (SIZE_ONLY)
RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
RExC_extralen += 2; /* Account for LONGJMP. */
nextchar(pRExC_state);
br = regbranch(pRExC_state, &flags, 0);
+
if (br == NULL)
return(NULL);
regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
break;
case 1:
ender = reganode(pRExC_state, CLOSE, parno);
+ Set_Node_Offset(ender,RExC_parse+1); /* MJD */
+ Set_Node_Length(ender,1); /* MJD */
break;
case '<':
case ',':
else {
if (!SIZE_ONLY && RExC_extralen)
ret = reganode(pRExC_state, BRANCHJ,0);
- else
+ else {
ret = reg_node(pRExC_state, BRANCH);
+ Set_Node_Length(ret, 1);
+ }
}
if (!first && SIZE_ONLY)
char *maxpos;
I32 min;
I32 max = REG_INFTY;
+ char *parse_start;
ret = regatom(pRExC_state, &flags);
if (ret == NULL) {
op = *RExC_parse;
if (op == '{' && regcurly(RExC_parse)) {
+ parse_start = RExC_parse; /* MJD */
next = RExC_parse + 1;
maxpos = Nullch;
while (isDIGIT(*next) || *next == ',') {
if ((flags&SIMPLE)) {
RExC_naughty += 2 + RExC_naughty / 2;
reginsert(pRExC_state, CURLY, ret);
+ Set_Node_Offset(ret, parse_start+1); /* MJD */
+ Set_Node_Cur_Length(ret);
}
else {
regnode *w = reg_node(pRExC_state, WHILEM);
NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
}
reginsert(pRExC_state, CURLYX,ret);
+ /* MJD hk */
+ Set_Node_Offset(ret, parse_start+1);
+ Set_Node_Length(ret,
+ op == '{' ? (RExC_parse - parse_start) : 1);
+
if (!SIZE_ONLY && RExC_extralen)
NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
vFAIL("Regexp *+ operand could be empty");
#endif
+ parse_start = RExC_parse;
nextchar(pRExC_state);
*flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
{
register regnode *ret = 0;
I32 flags;
+ char *parse_start = 0;
*flagp = WORST; /* Tentatively. */
ret = reg_node(pRExC_state, SBOL);
else
ret = reg_node(pRExC_state, BOL);
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '$':
nextchar(pRExC_state);
ret = reg_node(pRExC_state, SEOL);
else
ret = reg_node(pRExC_state, EOL);
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '.':
nextchar(pRExC_state);
ret = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '[':
{
}
nextchar(pRExC_state);
*flagp |= HASWIDTH|SIMPLE;
+ Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
break;
}
case '(':
ret = reg_node(pRExC_state, SBOL);
*flagp |= SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'G':
ret = reg_node(pRExC_state, GPOS);
RExC_seen |= REG_SEEN_GPOS;
*flagp |= SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'Z':
ret = reg_node(pRExC_state, SEOL);
*flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'C':
- ret = reg_node(pRExC_state, SANY);
- RExC_seen |= REG_SEEN_SANY;
+ ret = reg_node(pRExC_state, CANY);
+ RExC_seen |= REG_SEEN_CANY;
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'X':
ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'w':
ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'W':
ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'b':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'B':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 's':
ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'S':
ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'd':
ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'D':
ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'p':
case 'P':
- { /* a lovely hack--pretend we saw [\pX] instead */
+ {
char* oldregxend = RExC_end;
+ char* parse_start = RExC_parse;
if (RExC_parse[1] == '{') {
+ /* a lovely hack--pretend we saw [\pX] instead */
RExC_end = strchr(RExC_parse, '}');
if (!RExC_end) {
+ U8 c = (U8)*RExC_parse;
RExC_parse += 2;
RExC_end = oldregxend;
- vFAIL("Missing right brace on \\p{}");
+ vFAIL2("Missing right brace on \\%c{}", c);
}
RExC_end++;
}
RExC_end = oldregxend;
RExC_parse--;
+ Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
*flagp |= HASWIDTH|SIMPLE;
}
if (num > 9 && num >= RExC_npar)
goto defchar;
else {
+ char * parse_start = RExC_parse - 1; /* MJD */
while (isDIGIT(*RExC_parse))
RExC_parse++;
? (LOC ? REFFL : REFF)
: REF, num);
*flagp |= HASWIDTH;
+
+ /* override incorrect value set in reganode MJD */
+ Set_Node_Offset(ret, parse_start+1);
+ Set_Node_Cur_Length(ret); /* MJD */
RExC_parse--;
nextchar(pRExC_state);
}
register char *p;
char *oldp, *s;
STRLEN numlen;
+ STRLEN ulen;
+ U8 tmpbuf[UTF8_MAXLEN*2+1];
+
+ parse_start = RExC_parse - 1;
RExC_parse++;
p++;
break;
case 'e':
-#ifdef ASCIIish
- ender = '\033';
-#else
- ender = '\047';
-#endif
+ ender = ASCII_TO_NATIVE('\033');
p++;
break;
case 'a':
-#ifdef ASCIIish
- ender = '\007';
-#else
- ender = '\057';
-#endif
+ ender = ASCII_TO_NATIVE('\007');
p++;
break;
case 'x':
vFAIL("Missing right brace on \\x{}");
}
else {
- numlen = 1; /* allow underscores */
- ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ numlen = e - p - 1;
+ ender = grok_hex(p + 1, &numlen, &flags, NULL);
if (ender > 0xff)
RExC_utf8 = 1;
/* numlen is generous */
}
}
else {
- numlen = 0; /* disallow underscores */
- ender = (UV)scan_hex(p, 2, &numlen);
+ I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+ numlen = 2;
+ ender = grok_hex(p, &numlen, &flags, NULL);
p += numlen;
}
break;
case '5': case '6': case '7': case '8':case '9':
if (*p == '0' ||
(isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
- numlen = 0; /* disallow underscores */
- ender = (UV)scan_oct(p, 3, &numlen);
+ I32 flags = 0;
+ numlen = 3;
+ ender = grok_oct(p, &numlen, &flags, NULL);
p += numlen;
}
else {
/* FALL THROUGH */
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
- vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
+ vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
goto normal_default;
}
break;
default:
normal_default:
if (UTF8_IS_START(*p) && UTF) {
- ender = utf8n_to_uvuni((U8*)p, RExC_end - p,
+ ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
&numlen, 0);
p += numlen;
}
if (RExC_flags16 & PMf_EXTENDED)
p = regwhite(p, RExC_end);
if (UTF && FOLD) {
- if (LOC)
- ender = toLOWER_LC_uvchr(UNI_TO_NATIVE(ender));
- else
- ender = toLOWER_uni(ender);
+ toLOWER_uni(ender, tmpbuf, &ulen);
+ ender = utf8_to_uvchr(tmpbuf, 0);
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- else if (ender >= 0x80 && UTF) {
+ else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
len += numlen;
}
break;
}
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- if (ender >= 0x80 && UTF) {
+ if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
len += numlen - 1;
}
loopdone:
RExC_parse = p - 1;
+ Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
{
/* len is STRLEN which is unsigned, need to copy to signed */
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
register UV value;
- register IV lastvalue = OOB_UNICODE;
+ register IV prevvalue = OOB_UNICODE;
register IV range = 0;
register regnode *ret;
STRLEN numlen;
IV namedclass;
- char *rangebegin;
+ char *rangebegin = 0;
bool need_class = 0;
- SV *listsv;
+ SV *listsv = Nullsv;
register char *e;
UV n;
- bool dont_optimize_invert = FALSE;
+ bool optimize_invert = TRUE;
ret = reganode(pRExC_state, ANYOF, 0);
if (!range)
rangebegin = RExC_parse;
if (UTF) {
- value = utf8n_to_uvuni((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, 0);
+ value = utf8n_to_uvchr((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
RExC_parse += numlen;
}
else
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
if (UTF) {
- value = utf8n_to_uvuni((U8*)RExC_parse,
+ value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, 0);
RExC_parse += numlen;
case 'p':
case 'P':
if (*RExC_parse == '{') {
+ U8 c = (U8)value;
e = strchr(RExC_parse++, '}');
if (!e)
- vFAIL("Missing right brace on \\p{}");
+ vFAIL2("Missing right brace on \\%c{}", c);
+ while (isSPACE(UCHARAT(RExC_parse)))
+ RExC_parse++;
+ if (e == RExC_parse)
+ vFAIL2("Empty \\%c{}", c);
n = e - RExC_parse;
+ while (isSPACE(UCHARAT(RExC_parse + n - 1)))
+ n--;
}
else {
e = RExC_parse;
n = 1;
}
if (!SIZE_ONLY) {
+ if (UCHARAT(RExC_parse) == '^') {
+ RExC_parse++;
+ n--;
+ value = value == 'p' ? 'P' : 'p'; /* toggle */
+ while (isSPACE(UCHARAT(RExC_parse))) {
+ RExC_parse++;
+ n--;
+ }
+ }
if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv,
- "+utf8::%.*s\n", (int)n, RExC_parse);
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", (int)n, RExC_parse);
else
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", (int)n, RExC_parse);
+ Perl_sv_catpvf(aTHX_ listsv,
+ "!utf8::%.*s\n", (int)n, RExC_parse);
}
RExC_parse = e + 1;
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
case 't': value = '\t'; break;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
-#ifdef ASCIIish
- case 'e': value = '\033'; break;
- case 'a': value = '\007'; break;
-#else
- case 'e': value = '\047'; break;
- case 'a': value = '\057'; break;
-#endif
+ case 'e': value = ASCII_TO_NATIVE('\033');break;
+ case 'a': value = ASCII_TO_NATIVE('\007');break;
case 'x':
if (*RExC_parse == '{') {
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
e = strchr(RExC_parse++, '}');
if (!e)
vFAIL("Missing right brace on \\x{}");
- numlen = 1; /* allow underscores */
- value = (UV)scan_hex(RExC_parse,
- e - RExC_parse,
- &numlen);
+
+ numlen = e - RExC_parse;
+ value = grok_hex(RExC_parse, &numlen, &flags, NULL);
RExC_parse = e + 1;
}
else {
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
+ I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+ numlen = 2;
+ value = grok_hex(RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
}
break;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_oct(--RExC_parse, 3, &numlen);
+ {
+ I32 flags = 0;
+ numlen = 3;
+ value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
break;
+ }
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
vWARN2(RExC_parse,
RExC_parse - rangebegin,
RExC_parse - rangebegin,
rangebegin);
- if (lastvalue < 256) {
- ANYOF_BITMAP_SET(ret, lastvalue);
+ if (prevvalue < 256) {
+ ANYOF_BITMAP_SET(ret, prevvalue);
ANYOF_BITMAP_SET(ret, '-');
}
else {
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "%04"UVxf"\n002D\n", (UV)lastvalue);
+ "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
}
}
}
if (!SIZE_ONLY) {
+ if (namedclass > OOB_NAMEDCLASS)
+ optimize_invert = FALSE;
/* Possible truncation here but in some 64-bit environments
* the compiler gets heartburn about switch on 64-bit values.
* A similar issue a little earlier when switching on value.
if (isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
break;
case ANYOF_NALNUM:
if (!isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
break;
case ANYOF_ALNUMC:
if (isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
break;
case ANYOF_NALNUMC:
if (!isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
break;
case ANYOF_ALPHA:
if (isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
break;
case ANYOF_NALPHA:
if (!isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
break;
case ANYOF_ASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ASCII);
else {
-#ifdef ASCIIish
+#ifndef EBCDIC
for (value = 0; value < 128; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
+ for (value = 0; value < 256; value++) {
if (isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ ANYOF_BITMAP_SET(ret, value);
+ }
#endif /* EBCDIC */
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
break;
case ANYOF_NASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_NASCII);
else {
-#ifdef ASCIIish
+#ifndef EBCDIC
for (value = 128; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
+ for (value = 0; value < 256; value++) {
if (!isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ ANYOF_BITMAP_SET(ret, value);
+ }
#endif /* EBCDIC */
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
break;
case ANYOF_BLANK:
if (isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
break;
case ANYOF_NBLANK:
if (!isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
break;
case ANYOF_CNTRL:
if (isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
break;
case ANYOF_NCNTRL:
if (!isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
break;
case ANYOF_DIGIT:
for (value = '0'; value <= '9'; value++)
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
break;
case ANYOF_NDIGIT:
for (value = '9' + 1; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
break;
case ANYOF_GRAPH:
if (isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
break;
case ANYOF_NGRAPH:
if (!isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
break;
case ANYOF_LOWER:
if (isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
break;
case ANYOF_NLOWER:
if (!isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
break;
case ANYOF_PRINT:
if (isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
break;
case ANYOF_NPRINT:
if (!isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
break;
case ANYOF_PSXSPC:
if (isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
break;
case ANYOF_NPSXSPC:
if (!isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
break;
case ANYOF_PUNCT:
if (isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
break;
case ANYOF_NPUNCT:
if (!isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
break;
case ANYOF_SPACE:
if (isSPACE(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
break;
case ANYOF_NSPACE:
if (!isSPACE(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
break;
case ANYOF_UPPER:
if (isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
break;
case ANYOF_NUPPER:
if (!isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
break;
case ANYOF_XDIGIT:
if (isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
break;
case ANYOF_NXDIGIT:
if (!isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
- dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
break;
default:
} /* end of namedclass \blah */
if (range) {
- if (lastvalue > value) /* b-a */ {
+ if (prevvalue > value) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
RExC_parse - rangebegin,
RExC_parse - rangebegin,
rangebegin);
+ range = 0; /* not a valid range */
}
- range = 0; /* not a true range */
}
else {
- lastvalue = value; /* save the beginning of the range */
+ prevvalue = value; /* save the beginning of the range */
if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
RExC_parse[1] != ']') {
RExC_parse++;
/* now is the next time */
if (!SIZE_ONLY) {
- if (lastvalue < 256 && value < 256) {
-#ifndef ASCIIish /* EBCDIC, for example. */
- if ((isLOWER(lastvalue) && isLOWER(value)) ||
- (isUPPER(lastvalue) && isUPPER(value)))
+ IV i;
+
+ if (prevvalue < 256) {
+ IV ceilvalue = value < 256 ? value : 255;
+
+#ifdef EBCDIC
+ if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
+ (isUPPER(prevvalue) && isUPPER(ceilvalue)))
{
- IV i;
- if (isLOWER(lastvalue)) {
- for (i = lastvalue; i <= value; i++)
+ if (isLOWER(prevvalue)) {
+ for (i = prevvalue; i <= ceilvalue; i++)
if (isLOWER(i))
ANYOF_BITMAP_SET(ret, i);
} else {
- for (i = lastvalue; i <= value; i++)
+ for (i = prevvalue; i <= ceilvalue; i++)
if (isUPPER(i))
ANYOF_BITMAP_SET(ret, i);
}
}
else
#endif
- for ( ; lastvalue <= value; lastvalue++)
- ANYOF_BITMAP_SET(ret, lastvalue);
- } else {
+ for (i = prevvalue; i <= ceilvalue; i++)
+ ANYOF_BITMAP_SET(ret, i);
+ }
+ if (value > 255) {
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
- if (lastvalue < value)
+ if (prevvalue < value)
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)lastvalue, (UV)value);
- else
+ (UV)prevvalue, (UV)value);
+ else if (prevvalue == value)
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
(UV)value);
}
}
if (need_class) {
+ ANYOF_FLAGS(ret) |= ANYOF_LARGE;
if (SIZE_ONLY)
RExC_size += ANYOF_CLASS_ADD_SKIP;
else
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
if (!SIZE_ONLY &&
- (ANYOF_FLAGS(ret) &
/* If the only flag is folding (plus possibly inversion). */
- (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
+ ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
+ ) {
for (value = 0; value < 256; ++value) {
if (ANYOF_BITMAP_TEST(ret, value)) {
IV fold = PL_fold[value];
}
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && !dont_optimize_invert &&
+ if (!SIZE_ONLY && optimize_invert &&
/* If the only flag is inversion. */
(ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE(ptr, op);
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
+ "reg_node", __LINE__,
+ reg_name[op],
+ RExC_emit - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ RExC_emit - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
+ }
+
RExC_emit = ptr;
return(ret);
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE_ARG(ptr, op, arg);
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
+ "reganode",
+ RExC_emit - RExC_emit_start > RExC_offsets[0] ?
+ "Overwriting end of array!\n" : "OK",
+ RExC_emit - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Cur_Node_Offset;
+ }
+
RExC_emit = ptr;
return(ret);
STATIC void
S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
{
- *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvuni_to_utf8((U8*)s, uv) - (U8*)s);
+ *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
}
/*
src = RExC_emit;
RExC_emit += NODE_STEP_REGNODE + offset;
dst = RExC_emit;
- while (src > opnd)
+ while (src > opnd) {
StructCopy(--src, --dst, regnode);
+ if (RExC_offsets) { /* MJD 20010112 */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
+ "reg_insert",
+ dst - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ src - RExC_emit_start,
+ dst - RExC_emit_start,
+ RExC_offsets[0]));
+ Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
+ Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
+ }
+ }
+
place = opnd; /* Op node, where operand used to be. */
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
+ "reginsert",
+ place - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ place - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Node_Offset(place, RExC_parse);
+ }
src = NEXTOPER(place);
FILL_ADVANCE_NODE(place, op);
Zero(src, offset, regnode);
}
+#ifdef DEBUGGING
+
STATIC regnode *
S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
{
-#ifdef DEBUGGING
register U8 op = EXACT; /* Arbitrary non-END op. */
register regnode *next;
node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
}
else if (op == ANYOF) {
+ /* arglen 1 + class block */
+ node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
+ ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
node = NEXTOPER(node);
- node += ANYOF_SKIP;
}
else if (PL_regkind[(U8)op] == EXACT) {
/* Literal string, where present. */
else if (op == WHILEM)
l--;
}
-#endif /* DEBUGGING */
return node;
}
+#endif /* DEBUGGING */
+
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
if (r->reganch & ROPT_EVAL_SEEN)
PerlIO_printf(Perl_debug_log, "with eval ");
PerlIO_printf(Perl_debug_log, "\n");
+ if (r->offsets) {
+ U32 i;
+ U32 len = r->offsets[0];
+ PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
+ for (i = 1; i <= len; i++)
+ PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
+ (UV)r->offsets[i*2-1],
+ (UV)r->offsets[i*2]);
+ PerlIO_printf(Perl_debug_log, "\n");
+ }
#endif /* DEBUGGING */
}
+#ifdef DEBUGGING
+
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
- if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
+ if (isCNTRL(c) || c == 255 || !isPRINT(c))
Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
else if (c == '-' || c == ']' || c == '\\' || c == '^')
Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
Perl_sv_catpvf(aTHX_ sv, "%c", c);
}
+#endif /* DEBUGGING */
+
/*
- regprop - printable representation of opcode
*/
U8 s[UTF8_MAXLEN+1];
for (i = 0; i <= 256; i++) { /* just the first 256 */
- U8 *e = uvuni_to_utf8(s, i);
+ U8 *e = uvchr_to_utf8(s, i);
- if (i < 256 && swash_fetch(sw, s)) {
+ if (i < 256 && swash_fetch(sw, s, TRUE)) {
if (rangestart == -1)
rangestart = i;
} else if (rangestart != -1) {
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
- for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+ for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
}
else {
- for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+ for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
sv_catpv(sv, "-");
- for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++)
+ for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
put_byte(sv, *p);
}
rangestart = -1;
if (r->precomp)
Safefree(r->precomp);
+ if (r->offsets) /* 20010421 MJD */
+ Safefree(r->offsets);
if (RX_MATCH_COPIED(r))
Safefree(r->subbeg);
if (r->substrs) {
SV** old_curpad;
while (--n >= 0) {
+ /* If you add a ->what type here, update the comment in regcomp.h */
switch (r->data->what[n]) {
case 's':
SvREFCNT_dec((SV*)r->data->data[n]);
}
else
PL_curpad = NULL;
- op_free((OP_4tree*)r->data->data[n]);
+
+ if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
+ op_free((OP_4tree*)r->data->data[n]);
+ }
+
PL_comppad = old_comppad;
PL_curpad = old_curpad;
SvREFCNT_dec((SV*)new_comppad);
SAVEVPTR(PL_regendp); /* Ditto for endp. */
SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
SAVEPPTR(PL_regtill); /* How far we are required to go. */
- SAVEI8(PL_regprev); /* char before regbol, \n if none */
SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
PL_reg_start_tmp = 0;
SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
SAVEVPTR(PL_reg_re); /* from regexec.c */
SAVEPPTR(PL_reg_ganch); /* from regexec.c */
SAVESPTR(PL_reg_sv); /* from regexec.c */
+ SAVEI8(PL_reg_match_utf8); /* from regexec.c */
SAVEVPTR(PL_reg_magic); /* from regexec.c */
SAVEI32(PL_reg_oldpos); /* from regexec.c */
SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
SAVEVPTR(PL_reg_curpm); /* from regexec.c */
SAVEI32(PL_regnpar); /* () count. */
+ SAVEI32(PL_regsize); /* from regexec.c */
#ifdef DEBUGGING
SAVEPPTR(PL_reg_starttry); /* from regexec.c */
#endif
}
-#ifdef PERL_OBJECT
-#include "XSUB.h"
-#undef this
-#define this pPerl
-#endif
-
static void
-clear_re(pTHXo_ void *r)
+clear_re(pTHX_ void *r)
{
ReREFCNT_dec((regexp *)r);
}
+