char *parse; /* Input-scan pointer. */
I32 whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
+ regnode *emit_bound; /* First regnode outside of the allocated space */
regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
regnode **open_parens; /* pointers to open parens */
regnode **close_parens; /* pointers to close parens */
regnode *opend; /* END node in program */
- I32 utf8;
+ I32 utf8; /* whether the pattern is utf8 or not */
+ I32 orig_utf8; /* whether the pattern was originally in utf8 */
+ /* XXX use this for future optimisation of case
+ * where pattern must be upgraded to utf8. */
HV *charnames; /* cache of named sequences */
HV *paren_names; /* Paren names */
#endif
#define RExC_emit (pRExC_state->emit)
#define RExC_emit_start (pRExC_state->emit_start)
+#define RExC_emit_bound (pRExC_state->emit_bound)
#define RExC_naughty (pRExC_state->naughty)
#define RExC_sawback (pRExC_state->sawback)
#define RExC_seen (pRExC_state->seen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
+#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define RExC_charnames (pRExC_state->charnames)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
- STRLEN chars=0;
+ STRLEN chars = 0;
+ bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
trie->minlen= 0;
continue;
}
- if (trie->bitmap) {
- TRIE_BITMAP_SET(trie,*uc);
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
- }
+ if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
+ regardless of encoding */
+
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
TRIE_STORE_REVCHAR;
}
+ if ( set_bit ) {
+ /* store the codepoint in the bitmap, and if its ascii
+ also store its folded equivelent. */
+ TRIE_BITMAP_SET(trie,uvc);
+ if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+ set_bit = 0; /* We've done our bit :-) */
+ }
} else {
SV** svpp;
if ( !widecharmap )
if (exp == NULL)
FAIL("NULL regexp argument");
- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
- RExC_precomp = exp;
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RExC_utf8,
- dsv, RExC_precomp, (xend - exp), 60);
+ dsv, exp, (xend - exp), 60);
PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
PL_colors[4],PL_colors[5],s);
});
+
+redo_first_pass:
+ RExC_precomp = exp;
RExC_flags = pm->op_pmflags;
RExC_sawback = 0;
RExC_precomp = NULL;
return(NULL);
}
+ if (RExC_utf8 && !RExC_orig_utf8) {
+ /* It's possible to write a regexp in ascii that represents unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ XXX: somehow figure out how to make this less expensive...
+ -- dmq */
+ STRLEN len = xend-exp;
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
+ xend = exp + len;
+ RExC_orig_utf8 = RExC_utf8;
+ SAVEFREEPV(exp);
+ goto redo_first_pass;
+ }
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
"Required size %"IVdf" nodes\n"
if (RExC_whilem_seen > 15)
RExC_whilem_seen = 15;
-#ifdef DEBUGGING
- /* Make room for a sentinel value at the end of the program */
- RExC_size++;
-#endif
-
/* Allocate space and zero-initialize. Note, the two step process
of zeroing when in debug mode, thus anything assigned has to
happen after that */
RExC_npar = 1;
RExC_emit_start = ri->program;
RExC_emit = ri->program;
-#ifdef DEBUGGING
- /* put a sentinal on the end of the program so we can check for
- overwrites */
- ri->program[RExC_size].type = 255;
-#endif
+ RExC_emit_bound = ri->program + RExC_size + 1;
+
/* Store the count of eval-groups for security checks: */
RExC_rx->seen_evals = RExC_seen_evals;
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
SV* sv_dat=HeVAL(he_str);
I32 *nums=(I32*)SvPVX(sv_dat);
for ( i=0; i<SvIVX(sv_dat); i++ ) {
- if ((I32)(rx->nparens) >= nums[i]
- && rx->startp[nums[i]] != -1
- && rx->endp[nums[i]] != -1)
+ if ((I32)(rx->nparens) >= nums[i]
+ && rx->startp[nums[i]] != -1
+ && rx->endp[nums[i]] != -1)
{
ret = CALLREG_NUMBUF(rx,nums[i],NULL);
if (!retarray)
PerlIO_printf(Perl_debug_log,"%16s",""); \
\
if (SIZE_ONLY) \
- num=RExC_size; \
+ num = RExC_size + 1; \
else \
num=REG_NODE_NUM(RExC_emit); \
if (RExC_lastnum!=num) \
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("reg ");
-
*flagp = 0; /* Tentatively. */
} /* named and numeric backreferences */
/* NOT REACHED */
- case 'p': /* (?p...) */
- if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
- vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
- /* FALL THROUGH*/
case '?': /* (??...) */
is_logical = 1;
if (*RExC_parse != '{') {
}
else
REGTAIL(pRExC_state, ret, ender);
+ RExC_size++; /* XXX WHY do we need this?!!
+ For large programs it seems to be required
+ but I can't figure out why. -- dmq*/
return ret;
}
else {
I32 flags = 0, c = 0;
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("brnc");
+
if (first)
ret = NULL;
else {
return ret;
/****** !SIZE_ONLY AFTER HERE *********/
- if( stored == 1 && value < 256
+ if( stored == 1 && (value < 128 || (value < 256 && !UTF))
&& !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
) {
/* optimize single char class to an EXACT node
RExC_size += 1;
return(ret);
}
-#ifdef DEBUGGING
- if (OP(RExC_emit) == 255)
- Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
- reg_name[op], OP(RExC_emit));
-#endif
+ if (RExC_emit >= RExC_emit_bound)
+ Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
+
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE(ptr, op);
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
"reg_node", __LINE__,
- reg_name[op],
+ PL_reg_name[op],
(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
? "Overwriting end of array!\n" : "OK",
(UV)(RExC_emit - RExC_emit_start),
*/
return(ret);
}
-#ifdef DEBUGGING
- if (OP(RExC_emit) == 255)
- Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
-#endif
+ if (RExC_emit >= RExC_emit_bound)
+ Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
+
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE_ARG(ptr, op, arg);
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
"reganode",
__LINE__,
- reg_name[op],
+ PL_reg_name[op],
(UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
"Overwriting end of array!\n" : "OK",
(UV)(RExC_emit - RExC_emit_start),
const int offset = regarglen[(U8)op];
const int size = NODE_STEP_REGNODE + offset;
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
- DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
+ DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
if (SIZE_ONLY) {
RExC_size += size;
return;
dst = RExC_emit;
if (RExC_open_parens) {
int paren;
- DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
+ /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
for ( paren=0 ; paren < RExC_npar ; paren++ ) {
if ( RExC_open_parens[paren] >= opnd ) {
- DEBUG_PARSE_FMT("open"," - %d",size);
+ /*DEBUG_PARSE_FMT("open"," - %d",size);*/
RExC_open_parens[paren] += size;
} else {
- DEBUG_PARSE_FMT("open"," - %s","ok");
+ /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
}
if ( RExC_close_parens[paren] >= opnd ) {
- DEBUG_PARSE_FMT("close"," - %d",size);
+ /*DEBUG_PARSE_FMT("close"," - %d",size);*/
RExC_close_parens[paren] += size;
} else {
- DEBUG_PARSE_FMT("close"," - %s","ok");
+ /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
}
}
}
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
"reg_insert",
__LINE__,
- reg_name[op],
+ PL_reg_name[op],
(UV)(dst - RExC_emit_start) > RExC_offsets[0]
? "Overwriting end of array!\n" : "OK",
(UV)(src - RExC_emit_start),
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
"reginsert",
__LINE__,
- reg_name[op],
+ PL_reg_name[op],
(UV)(place - RExC_emit_start) > RExC_offsets[0]
? "Overwriting end of array!\n" : "OK",
(UV)(place - RExC_emit_start),
PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
(temp == NULL ? "->" : ""),
- (temp == NULL ? reg_name[OP(val)] : "")
+ (temp == NULL ? PL_reg_name[OP(val)] : "")
);
});
if (temp == NULL)
PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
SvPV_nolen_const(mysv),
REG_NODE_NUM(scan),
- reg_name[exact]);
+ PL_reg_name[exact]);
});
if (temp == NULL)
break;
/* It would be nice to FAIL() here, but this may be called from
regexec.c, and it would be hard to supply pRExC_state. */
Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
- sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
+ sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
k = PL_regkind[OP(o)];
const reg_trie_data * const trie
= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
- Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
+ Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
DEBUG_TRIE_COMPILE_r(
Perl_sv_catpvf(aTHX_ sv,
"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",