* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
* This causes the main functions to be compiled under new names and with
* debugging support added, which makes "use re 'debug'" work.
-
*/
/* NOTE: this is derived from Henry Spencer's regexp code, and should not
#define RF_tainted 1 /* tainted information used? */
#define RF_warned 2 /* warned about big count? */
#define RF_evaled 4 /* Did an EVAL with setting? */
-#define RF_utf8 8 /* String contains multibyte chars? */
+#define RF_utf8 8 /* Pattern contains multibyte chars? */
#define UTF ((PL_reg_flags & RF_utf8) != 0)
}
/* These are needed since we do not localize EVAL nodes: */
-# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
- " Setting an EVAL scope, savestack=%"IVdf"\n", \
- (IV)PL_savestack_ix)); cp = PL_savestack_ix
-
-# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
- PerlIO_printf(Perl_debug_log, \
- " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
- (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
+#define REGCP_SET(cp) \
+ DEBUG_STATE_r( \
+ PerlIO_printf(Perl_debug_log, \
+ " Setting an EVAL scope, savestack=%"IVdf"\n", \
+ (IV)PL_savestack_ix)); \
+ cp = PL_savestack_ix
+
+#define REGCP_UNWIND(cp) \
+ DEBUG_STATE_r( \
+ if (cp != PL_savestack_ix) \
+ PerlIO_printf(Perl_debug_log, \
+ " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
+ (IV)(cp), (IV)PL_savestack_ix)); \
+ regcpblow(cp)
STATIC char *
S_regcppop(pTHX_ const regexp *rex)
#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
-#define TRYPAREN(paren, n, input, where) { \
- if (paren) { \
- if (n) { \
- PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
- PL_regendp[paren] = input - PL_bostr; \
- } \
- else \
- PL_regendp[paren] = -1; \
- } \
- REGMATCH(next, where); \
- if (result) \
- sayYES; \
- if (paren && n) \
- PL_regendp[paren] = -1; \
-}
-
-
/*
* pregexec and friends
*/
const I32 multiline = prog->reganch & PMf_MULTILINE;
#ifdef DEBUGGING
const char * const i_strpos = strpos;
- SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
#endif
GET_RE_DEBUG_FLAGS_DECL;
RX_MATCH_UTF8_set(prog,do_utf8);
if (prog->reganch & ROPT_UTF8) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "UTF-8 regex...\n"));
PL_reg_flags |= RF_utf8;
}
-
- DEBUG_EXECUTE_r({
- const char *s = PL_reg_match_utf8 ?
- sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
- strpos;
- const int len = PL_reg_match_utf8 ?
- (int)strlen(s) : strend - strpos;
- if (!PL_colorset)
- reginitcolors();
- if (PL_reg_match_utf8)
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "UTF-8 target...\n"));
- PerlIO_printf(Perl_debug_log,
- "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
- PL_colors[4], PL_colors[5], PL_colors[0],
- prog->precomp,
- PL_colors[1],
- (strlen(prog->precomp) > 60 ? "..." : ""),
- PL_colors[0],
- (int)(len > 60 ? 60 : len),
- s, PL_colors[1],
- (len > 60 ? "..." : "")
+ DEBUG_EXECUTE_r(
+ debug_start_match(prog, do_utf8, strpos, strend,
+ "Guessing start of match for");
);
- });
/* CHR_DIST() would be more correct here but it makes things slow. */
if (prog->minlen > strend - strpos) {
/* Update the count-of-usability, remove useless subpatterns,
unshift s. */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
+ DEBUG_EXECUTE_r({
+ RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
+ PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
(s ? "Found" : "Did not find"),
- (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
- PL_colors[0],
- (int)(SvCUR(check) - (SvTAIL(check)!=0)),
- SvPVX_const(check),
- PL_colors[1], (SvTAIL(check) ? "$" : ""),
- (s ? " at offset " : "...\n") ) );
+ (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
+ ? "anchored" : "floating"),
+ quoted,
+ RE_SV_TAIL(check),
+ (s ? " at offset " : "...\n") );
+ });
if (!s)
goto fail_finish;
must,
multiline ? FBMrf_MULTILINE : 0
);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "%s anchored substr \"%s%.*s%s\"%s",
+ DEBUG_EXECUTE_r({
+ RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
+ PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
(s ? "Found" : "Contradicts"),
- PL_colors[0],
- (int)(SvCUR(must)
- - (SvTAIL(must)!=0)),
- SvPVX_const(must),
- PL_colors[1], (SvTAIL(must) ? "$" : "")));
+ quoted, RE_SV_TAIL(must));
+ });
+
+
if (!s) {
if (last1 >= last2) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
(unsigned char*)last + SvCUR(must)
- (SvTAIL(must)!=0),
must, multiline ? FBMrf_MULTILINE : 0);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
+ DEBUG_EXECUTE_r({
+ RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
+ PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
(s ? "Found" : "Contradicts"),
- PL_colors[0],
- (int)(SvCUR(must) - (SvTAIL(must)!=0)),
- SvPVX_const(must),
- PL_colors[1], (SvTAIL(must) ? "$" : "")));
+ quoted, RE_SV_TAIL(must));
+ });
if (!s) {
if (last1 == last) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
/* Last resort... */
/* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
- if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
+ if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
/* minlen == 0 is possible if regstclass is \b or \B,
and the fixed substr is ''$.
Since minlen is already taken into account, s+1 is before strend;
return NULL;
}
+
+
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
+foldlen, foldbuf, uniflags) STMT_START { \
+ switch (trie_type) { \
+ case trie_utf8_fold: \
+ if ( foldlen>0 ) { \
+ uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ uscan += len; \
+ len=0; \
+ } else { \
+ uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
+ foldlen -= UNISKIP( uvc ); \
+ uscan = foldbuf + UNISKIP( uvc ); \
+ } \
+ break; \
+ case trie_utf8: \
+ uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
+ break; \
+ case trie_plain: \
+ uvc = (UV)*uc; \
+ len = 1; \
+ } \
+ \
+ if (uvc < 256) { \
+ charid = trie->charmap[ uvc ]; \
+ } \
+ else { \
+ charid = 0; \
+ if (trie->widecharmap) { \
+ SV** const svpp = hv_fetch(trie->widecharmap, \
+ (char*)&uvc, sizeof(UV), 0); \
+ if (svpp) \
+ charid = (U16)SvIV(*svpp); \
+ } \
+ } \
+} STMT_END
+
+#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
+ if ( (CoNd) \
+ && (ln == len || \
+ ibcmp_utf8(s, NULL, 0, do_utf8, \
+ m, NULL, ln, (bool)UTF)) \
+ && (!reginfo || regtry(reginfo, s)) ) \
+ goto got_it; \
+ else { \
+ U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
+ uvchr_to_utf8(tmpbuf, c); \
+ f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
+ if ( f != c \
+ && (f == c1 || f == c2) \
+ && (ln == foldlen || \
+ !ibcmp_utf8((char *) foldbuf, \
+ NULL, foldlen, do_utf8, \
+ m, \
+ NULL, ln, (bool)UTF)) \
+ && (!reginfo || regtry(reginfo, s)) ) \
+ goto got_it; \
+ } \
+ s += len
+
+#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
+STMT_START { \
+ while (s <= e) { \
+ if ( (CoNd) \
+ && (ln == 1 || !(OP(c) == EXACTF \
+ ? ibcmp(s, m, ln) \
+ : ibcmp_locale(s, m, ln))) \
+ && (!reginfo || regtry(reginfo, s)) ) \
+ goto got_it; \
+ s++; \
+ } \
+} STMT_END
+
+#define REXEC_FBC_UTF8_SCAN(CoDe) \
+STMT_START { \
+ while (s + (uskip = UTF8SKIP(s)) <= strend) { \
+ CoDe \
+ s += uskip; \
+ } \
+} STMT_END
+
+#define REXEC_FBC_SCAN(CoDe) \
+STMT_START { \
+ while (s < strend) { \
+ CoDe \
+ s++; \
+ } \
+} STMT_END
+
+#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
+REXEC_FBC_UTF8_SCAN( \
+ if (CoNd) { \
+ if (tmp && (!reginfo || regtry(reginfo, s))) \
+ goto got_it; \
+ else \
+ tmp = doevery; \
+ } \
+ else \
+ tmp = 1; \
+)
+
+#define REXEC_FBC_CLASS_SCAN(CoNd) \
+REXEC_FBC_SCAN( \
+ if (CoNd) { \
+ if (tmp && (!reginfo || regtry(reginfo, s))) \
+ goto got_it; \
+ else \
+ tmp = doevery; \
+ } \
+ else \
+ tmp = 1; \
+)
+
+#define REXEC_FBC_TRYIT \
+if ((!reginfo || regtry(reginfo, s))) \
+ goto got_it
+
+#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
+ if (do_utf8) { \
+ UtFpReLoAd; \
+ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
+ } \
+ else { \
+ REXEC_FBC_CLASS_SCAN(CoNd); \
+ } \
+ break
+
+#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
+ PL_reg_flags |= RF_tainted; \
+ if (do_utf8) { \
+ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
+ } \
+ else { \
+ REXEC_FBC_CLASS_SCAN(CoNd); \
+ } \
+ break
+
+#define DUMP_EXEC_POS(li,s,doutf8) \
+ dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
+
/* We know what class REx starts with. Try to find this position... */
/* if reginfo is NULL, its a dryrun */
/* annoyingly all the vars in this routine have different names from their counterparts
switch (OP(c)) {
case ANYOF:
if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+ REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
!UTF8_IS_INVARIANT((U8)s[0]) ?
reginclass(prog, c, (U8*)s, 0, do_utf8) :
- REGINCLASS(prog, c, (U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
+ REGINCLASS(prog, c, (U8*)s));
}
else {
while (s < strend) {
}
break;
case CANY:
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (tmp && (!reginfo || regtry(reginfo, s)))
goto got_it;
else
tmp = doevery;
- s++;
- }
+ );
break;
case EXACTF:
m = STRING(c);
while (s <= e) {
c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
uniflags);
- if ( c == c1
- && (ln == len ||
- ibcmp_utf8(s, NULL, 0, do_utf8,
- m, NULL, ln, (bool)UTF))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- else {
- U8 foldbuf[UTF8_MAXBYTES_CASE+1];
- uvchr_to_utf8(tmpbuf, c);
- f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
- if ( f != c
- && (f == c1 || f == c2)
- && (ln == foldlen ||
- !ibcmp_utf8((char *) foldbuf,
- NULL, foldlen, do_utf8,
- m,
- NULL, ln, (bool)UTF))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- }
- s += len;
+ REXEC_FBC_EXACTISH_CHECK(c == c1);
}
}
else {
c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
- if ( (c == c1 || c == c2)
- && (ln == len ||
- ibcmp_utf8(s, NULL, 0, do_utf8,
- m, NULL, ln, (bool)UTF))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- else {
- U8 foldbuf[UTF8_MAXBYTES_CASE+1];
- uvchr_to_utf8(tmpbuf, c);
- f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
- if ( f != c
- && (f == c1 || f == c2)
- && (ln == foldlen ||
- !ibcmp_utf8((char *) foldbuf,
- NULL, foldlen, do_utf8,
- m,
- NULL, ln, (bool)UTF))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- }
- s += len;
+ REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
}
}
}
else {
if (c1 == c2)
- while (s <= e) {
- if ( *(U8*)s == c1
- && (ln == 1 || !(OP(c) == EXACTF
- ? ibcmp(s, m, ln)
- : ibcmp_locale(s, m, ln)))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- s++;
- }
+ REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
else
- while (s <= e) {
- if ( (*(U8*)s == c1 || *(U8*)s == c2)
- && (ln == 1 || !(OP(c) == EXACTF
- ? ibcmp(s, m, ln)
- : ibcmp_locale(s, m, ln)))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- s++;
- }
+ REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
}
break;
case BOUNDL:
tmp = ((OP(c) == BOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
LOAD_UTF8_CHARCLASS_ALNUM();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
+ REXEC_FBC_UTF8_SCAN(
if (tmp == !(OP(c) == BOUND ?
(bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
isALNUM_LC_utf8((U8*)s)))
{
tmp = !tmp;
- if ((!reginfo || regtry(reginfo, s)))
- goto got_it;
- }
- s += uskip;
+ REXEC_FBC_TRYIT;
}
+ );
}
else {
tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (tmp ==
!(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
tmp = !tmp;
- if ((!reginfo || regtry(reginfo, s)))
- goto got_it;
- }
- s++;
+ REXEC_FBC_TRYIT;
}
+ );
}
if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
goto got_it;
tmp = ((OP(c) == NBOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
LOAD_UTF8_CHARCLASS_ALNUM();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
+ REXEC_FBC_UTF8_SCAN(
if (tmp == !(OP(c) == NBOUND ?
(bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
isALNUM_LC_utf8((U8*)s)))
tmp = !tmp;
- else if ((!reginfo || regtry(reginfo, s)))
- goto got_it;
- s += uskip;
- }
+ else REXEC_FBC_TRYIT;
+ );
}
else {
tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
tmp = ((OP(c) == NBOUND ?
isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (tmp ==
!(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
tmp = !tmp;
- else if ((!reginfo || regtry(reginfo, s)))
- goto got_it;
- s++;
- }
+ else REXEC_FBC_TRYIT;
+ );
}
if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
goto got_it;
break;
case ALNUM:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_ALNUM();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isALNUM(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_ALNUM(),
+ swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
+ isALNUM(*s)
+ );
case ALNUML:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (isALNUM_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isALNUM_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ isALNUM_LC_utf8((U8*)s),
+ isALNUM_LC(*s)
+ );
case NALNUM:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_ALNUM();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isALNUM(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_ALNUM(),
+ !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
+ !isALNUM(*s)
+ );
case NALNUML:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!isALNUM_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isALNUM_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ !isALNUM_LC_utf8((U8*)s),
+ !isALNUM_LC(*s)
+ );
case SPACE:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_SPACE();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isSPACE(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_SPACE(),
+ *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
+ isSPACE(*s)
+ );
case SPACEL:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isSPACE_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
- case NSPACE:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_SPACE();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isSPACE(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
- case NSPACEL:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isSPACE_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
- case DIGIT:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_DIGIT();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isDIGIT(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
- case DIGITL:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (isDIGIT_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isDIGIT_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
- case NDIGIT:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_DIGIT();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isDIGIT(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
- case NDIGITL:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!isDIGIT_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isDIGIT_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ *s == ' ' || isSPACE_LC_utf8((U8*)s),
+ isSPACE_LC(*s)
+ );
+ case NSPACE:
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_SPACE(),
+ !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
+ !isSPACE(*s)
+ );
+ case NSPACEL:
+ REXEC_FBC_CSCAN_TAINT(
+ !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
+ !isSPACE_LC(*s)
+ );
+ case DIGIT:
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_DIGIT(),
+ swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
+ isDIGIT(*s)
+ );
+ case DIGITL:
+ REXEC_FBC_CSCAN_TAINT(
+ isDIGIT_LC_utf8((U8*)s),
+ isDIGIT_LC(*s)
+ );
+ case NDIGIT:
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_DIGIT(),
+ !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
+ !isDIGIT(*s)
+ );
+ case NDIGITL:
+ REXEC_FBC_CSCAN_TAINT(
+ !isDIGIT_LC_utf8((U8*)s),
+ !isDIGIT_LC(*s)
+ );
+ case TRIEC:
case TRIE:
- /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
{
const enum { trie_plain, trie_utf8, trie_utf8_fold }
trie_type = do_utf8 ?
reg_trie_data *trie=aho->trie;
const char *last_start = strend - trie->minlen;
+#ifdef DEBUGGING
const char *real_start = s;
+#endif
STRLEN maxlen = trie->maxlen;
SV *sv_points;
U8 **points; /* map of where we were in the input string
- when reading a given string. For ASCII this
+ when reading a given char. For ASCII this
is unnecessary overhead as the relationship
is always 1:1, but for unicode, especially
case folded unicode this is not true. */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ U8 *bitmap=NULL;
+
GET_RE_DEBUG_FLAGS_DECL;
SvPOK_on(sv_points);
sv_2mortal(sv_points);
points=(U8**)SvPV_nolen(sv_points );
-
- if (trie->bitmap && trie_type != trie_utf8_fold) {
- while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
- s++;
- }
+ if ( trie_type != trie_utf8_fold && (trie->bitmap || OP(c)==TRIEC) ) {
+ if (trie->bitmap)
+ bitmap=(U8*)trie->bitmap;
+ else
+ bitmap=(U8*)ANYOF_BITMAP(c);
}
-
+ /* this is the Aho-Corasick algorithm modified a touch
+ to include special handling for long "unknown char"
+ sequences. The basic idea being that we use AC as long
+ as we are dealing with a possible matching char, when
+ we encounter an unknown char (and we have not encountered
+ an accepting state) we scan forward until we find a legal
+ starting char.
+ AC matching is basically that of trie matching, except
+ that when we encounter a failing transition, we fall back
+ to the current states "fail state", and try the current char
+ again, a process we repeat until we reach the root state,
+ state 1, or a legal transition. If we fail on the root state
+ then we can either terminate if we have reached an accepting
+ state previously, or restart the entire process from the beginning
+ if we have not.
+
+ */
while (s <= last_start) {
const U32 uniflags = UTF8_ALLOW_DEFAULT;
U8 *uc = (U8*)s;
STRLEN foldlen = 0;
U8 *uscan = (U8*)NULL;
U8 *leftmost = NULL;
-
+#ifdef DEBUGGING
+ U32 accepted_word= 0;
+#endif
U32 pointpos = 0;
while ( state && uc <= (U8*)strend ) {
int failed=0;
- if (aho->states[ state ].wordnum) {
- U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
- if (!leftmost || lpos < leftmost)
- leftmost= lpos;
- if (base==0) break;
- }
- points[pointpos++ % maxlen]= uc;
- switch (trie_type) {
- case trie_utf8_fold:
- if ( foldlen>0 ) {
- uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
- foldlen -= len;
- uscan += len;
- len=0;
- } else {
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
- uvc = to_uni_fold( uvc, foldbuf, &foldlen );
- foldlen -= UNISKIP( uvc );
- uscan = foldbuf + UNISKIP( uvc );
- }
- break;
- case trie_utf8:
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
- &len, uniflags );
- break;
- case trie_plain:
- uvc = (UV)*uc;
- len = 1;
- }
+ U32 word = aho->states[ state ].wordnum;
- if (uvc < 256) {
- charid = trie->charmap[ uvc ];
+ if( state==1 && bitmap ) {
+ DEBUG_TRIE_EXECUTE_r(
+ if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ dump_exec_pos( (char *)uc, c, strend, real_start,
+ (char*)uc, do_utf8 );
+ PerlIO_printf( Perl_debug_log,
+ " Scanning for legal start char...\n");
+ }
+ );
+ while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ uc++;
+ }
+ s= (char *)uc;
+ if (uc >(U8*)last_start) break;
}
- else {
- charid = 0;
- if (trie->widecharmap) {
- SV** const svpp = hv_fetch(trie->widecharmap,
- (char*)&uvc, sizeof(UV), 0);
- if (svpp)
- charid = (U16)SvIV(*svpp);
+
+ if ( word ) {
+ U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
+ if (!leftmost || lpos < leftmost) {
+ DEBUG_r(accepted_word=word);
+ leftmost= lpos;
}
+ if (base==0) break;
+
}
- DEBUG_TRIE_EXECUTE_r(
+ points[pointpos++ % maxlen]= uc;
+ REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
+ uvc, charid, foldlen, foldbuf, uniflags);
+ DEBUG_TRIE_EXECUTE_r({
+ dump_exec_pos( (char *)uc, c, strend, real_start,
+ s, do_utf8 );
PerlIO_printf(Perl_debug_log,
- "Pos: %d Charid:%3x CV:%4"UVxf" ",
- (int)((const char*)uc - real_start), charid, uvc)
- );
- uc += len;
+ " Charid:%3u CP:%4"UVxf" ",
+ charid, uvc);
+ });
do {
- U32 word = aho->states[ state ].wordnum;
+#ifdef DEBUGGING
+ word = aho->states[ state ].wordnum;
+#endif
base = aho->states[ state ].trans.base;
- DEBUG_TRIE_EXECUTE_r(
+ DEBUG_TRIE_EXECUTE_r({
+ if (failed)
+ dump_exec_pos( (char *)uc, c, strend, real_start,
+ s, do_utf8 );
PerlIO_printf( Perl_debug_log,
- "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
- failed ? "Fail transition to " : "",
- state, base, uvc, word)
- );
+ "%sState: %4"UVxf", word=%"UVxf,
+ failed ? " Fail transition to " : "",
+ (UV)state, (UV)word);
+ });
if ( base ) {
U32 tmp;
if (charid &&
&& (tmp=trie->trans[base + charid - 1 -
trie->uniquecharcount ].next))
{
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log," - legal\n"));
state = tmp;
break;
}
else {
- failed++;
- if ( state == 1 )
- break;
- else
- state = aho->fail[state];
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log," - fail\n"));
+ failed = 1;
+ state = aho->fail[state];
}
}
else {
/* we must be accepting here */
- failed++;
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log," - accepting\n"));
+ failed = 1;
break;
}
} while(state);
+ uc += len;
if (failed) {
if (leftmost)
break;
- else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
- while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
- uc++;
- }
- }
+ if (!state) state = 1;
}
}
if ( aho->states[ state ].wordnum ) {
U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
- if (!leftmost || lpos < leftmost)
+ if (!leftmost || lpos < leftmost) {
+ DEBUG_r(accepted_word=aho->states[ state ].wordnum);
leftmost = lpos;
+ }
}
- DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,
- "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
- "All done: ",
- state, base, uvc)
- );
if (leftmost) {
s = (char*)leftmost;
+ DEBUG_TRIE_EXECUTE_r({
+ PerlIO_printf(
+ Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
+ (UV)accepted_word, s - real_start
+ );
+ });
if (!reginfo || regtry(reginfo, s)) {
FREETMPS;
LEAVE;
goto got_it;
}
s = HOPc(s,1);
+ DEBUG_TRIE_EXECUTE_r({
+ PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
+ });
} else {
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log,"No match.\n"));
break;
}
}
SV* const oreplsv = GvSV(PL_replgv);
const bool do_utf8 = DO_UTF8(sv);
I32 multiline;
-#ifdef DEBUGGING
- SV* dsv0;
- SV* dsv1;
-#endif
+
regmatch_info reginfo; /* create some info to pass to regtry etc */
GET_RE_DEBUG_FLAGS_DECL;
multiline = prog->reganch & PMf_MULTILINE;
reginfo.prog = prog;
-#ifdef DEBUGGING
- dsv0 = PERL_DEBUG_PAD_ZERO(0);
- dsv1 = PERL_DEBUG_PAD_ZERO(1);
-#endif
-
RX_MATCH_UTF8_set(prog, do_utf8);
minlen = prog->minlen;
}
}
- DEBUG_EXECUTE_r({
- const char * const s0 = UTF
- ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
- UNI_DISPLAY_REGEX)
- : prog->precomp;
- const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
- const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
- UNI_DISPLAY_REGEX) : startpos;
- const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
- if (!PL_colorset)
- reginitcolors();
- PerlIO_printf(Perl_debug_log,
- "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
- PL_colors[4], PL_colors[5], PL_colors[0],
- len0, len0, s0,
- PL_colors[1],
- len0 > 60 ? "..." : "",
- PL_colors[0],
- (int)(len1 > 60 ? 60 : len1),
- s1, PL_colors[1],
- (len1 > 60 ? "..." : "")
+ DEBUG_EXECUTE_r(
+ debug_start_match(prog, do_utf8, startpos, strend,
+ "Matching");
);
- });
/* Simplest case: anchored match need be tried only once. */
/* [unless only anchor is BOL and multiline is set] */
ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
if (do_utf8) {
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
if (regtry(®info, s)) goto got_it;
while (s < strend && *s == ch)
s += UTF8SKIP(s);
}
- s += UTF8SKIP(s);
- }
+ );
}
else {
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
if (regtry(®info, s)) goto got_it;
while (s < strend && *s == ch)
s++;
}
- s++;
- }
+ );
}
DEBUG_EXECUTE_r(if (!did_match)
PerlIO_printf(Perl_debug_log,
}
}
}
- DEBUG_EXECUTE_r(if (!did_match)
- PerlIO_printf(Perl_debug_log,
- "Did not find %s substr \"%s%.*s%s\"%s...\n",
+ DEBUG_EXECUTE_r(if (!did_match) {
+ RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
+ PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
((must == prog->anchored_substr || must == prog->anchored_utf8)
? "anchored" : "floating"),
- PL_colors[0],
- (int)(SvCUR(must) - (SvTAIL(must)!=0)),
- SvPVX_const(must),
- PL_colors[1], (SvTAIL(must) ? "$" : ""))
- );
+ quoted, RE_SV_TAIL(must));
+ });
goto phooey;
}
- else if ((c = prog->regstclass)) {
+ else if ( (c = prog->regstclass) ) {
if (minlen) {
const OPCODE op = OP(prog->regstclass);
/* don't bother with what can't match */
- if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
+ if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
strend = HOPc(strend, -(minlen - 1));
}
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
- const char *s0;
- const char *s1;
- int len0;
- int len1;
-
regprop(prog, prop, c);
- s0 = UTF ?
- pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
- UNI_DISPLAY_REGEX) :
- SvPVX_const(prop);
- len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
- s1 = UTF ?
- sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
- len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
- PerlIO_printf(Perl_debug_log,
- "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
- len0, len0, s0,
- len1, len1, s1, (int)(strend - s));
+ {
+ RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
+ s,strend-s,60);
+ PerlIO_printf(Perl_debug_log,
+ "Matching stclass %.*s against %s (%d chars)\n",
+ (int)SvCUR(prop), SvPVX_const(prop),
+ quoted, (int)(strend - s));
+ }
});
if (find_byclass(prog, c, s, strend, ®info))
goto got_it;
#define sayNO_SILENT goto do_no
#define saySAME(x) if (x) goto yes; else goto no
-#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
-#define POSCACHE_SEEN 1 /* we know what we're caching */
-#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
-
-#define CACHEsayYES STMT_START { \
- if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
- if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
- PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
- PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
- } \
- else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
- PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
- } \
- else { \
- /* cache records failure, but this is success */ \
- DEBUG_r( \
- PerlIO_printf(Perl_debug_log, \
- "%*s (remove success from failure cache)\n", \
- REPORT_CODE_OFF+PL_regindent*2, "") \
- ); \
- PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
- } \
- } \
- sayYES; \
-} STMT_END
-
#define CACHEsayNO STMT_START { \
- if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
- if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
- PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
- PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
- } \
- else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
- PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
- } \
- else { \
- /* cache records success, but this is failure */ \
- DEBUG_r( \
- PerlIO_printf(Perl_debug_log, \
- "%*s (remove failure from success cache)\n", \
- REPORT_CODE_OFF+PL_regindent*2, "") \
- ); \
- PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
- } \
- } \
+ if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
+ PL_reg_poscache[st->u.whilem.cache_offset] |= \
+ (1<<st->u.whilem.cache_bit); \
sayNO; \
} STMT_END
+
/* this is used to determine how far from the left messages like
'failed...' are printed. Currently 29 makes these messages line
up with the opcode they refer to. Earlier perls used 25 which
*/
/* *** every FOO_fail should = FOO+1 */
-#define TRIE_next (REGNODE_MAX+1)
-#define TRIE_next_fail (REGNODE_MAX+2)
-#define EVAL_A (REGNODE_MAX+3)
-#define EVAL_A_fail (REGNODE_MAX+4)
-#define resume_CURLYX (REGNODE_MAX+5)
-#define resume_WHILEM1 (REGNODE_MAX+6)
-#define resume_WHILEM2 (REGNODE_MAX+7)
-#define resume_WHILEM3 (REGNODE_MAX+8)
-#define resume_WHILEM4 (REGNODE_MAX+9)
-#define resume_WHILEM5 (REGNODE_MAX+10)
-#define resume_WHILEM6 (REGNODE_MAX+11)
-#define BRANCH_next (REGNODE_MAX+12)
-#define BRANCH_next_fail (REGNODE_MAX+13)
-#define CURLYM_A (REGNODE_MAX+14)
-#define CURLYM_A_fail (REGNODE_MAX+15)
-#define CURLYM_B (REGNODE_MAX+16)
-#define CURLYM_B_fail (REGNODE_MAX+17)
-#define IFMATCH_A (REGNODE_MAX+18)
-#define IFMATCH_A_fail (REGNODE_MAX+19)
-#define resume_PLUS1 (REGNODE_MAX+20)
-#define resume_PLUS2 (REGNODE_MAX+21)
-#define resume_PLUS3 (REGNODE_MAX+22)
-#define resume_PLUS4 (REGNODE_MAX+23)
-
+#define TRIE_next (REGNODE_MAX+1)
+#define TRIE_next_fail (REGNODE_MAX+2)
+#define EVAL_A (REGNODE_MAX+3)
+#define EVAL_A_fail (REGNODE_MAX+4)
+#define resume_CURLYX (REGNODE_MAX+5)
+#define resume_WHILEM1 (REGNODE_MAX+6)
+#define resume_WHILEM2 (REGNODE_MAX+7)
+#define resume_WHILEM3 (REGNODE_MAX+8)
+#define resume_WHILEM4 (REGNODE_MAX+9)
+#define resume_WHILEM5 (REGNODE_MAX+10)
+#define resume_WHILEM6 (REGNODE_MAX+11)
+#define BRANCH_next (REGNODE_MAX+12)
+#define BRANCH_next_fail (REGNODE_MAX+13)
+#define CURLYM_A (REGNODE_MAX+14)
+#define CURLYM_A_fail (REGNODE_MAX+15)
+#define CURLYM_B (REGNODE_MAX+16)
+#define CURLYM_B_fail (REGNODE_MAX+17)
+#define IFMATCH_A (REGNODE_MAX+18)
+#define IFMATCH_A_fail (REGNODE_MAX+19)
+#define CURLY_B_min_known (REGNODE_MAX+20)
+#define CURLY_B_min_known_fail (REGNODE_MAX+21)
+#define CURLY_B_min (REGNODE_MAX+22)
+#define CURLY_B_min_fail (REGNODE_MAX+23)
+#define CURLY_B_max (REGNODE_MAX+24)
+#define CURLY_B_max_fail (REGNODE_MAX+25)
#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
-#ifdef DEBUGGING
-STATIC void
-S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
+#ifdef DEBUGGING
+STATIC void
+S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
+ const char *start, const char *end, const char *blurb)
+{
+ const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
+ if (!PL_colorset)
+ reginitcolors();
+ {
+ RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
+ prog->precomp, prog->prelen, 60);
+
+ RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
+ start, end - start, 60);
+
+ PerlIO_printf(Perl_debug_log,
+ "%s%s REx%s %s against %s\n",
+ PL_colors[4], blurb, PL_colors[5], s0, s1);
+
+ if (do_utf8||utf8_pat)
+ PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
+ !do_utf8 ? "pattern" : !utf8_pat ? "string" :
+ "pattern and string"
+ );
+ }
+}
+
+STATIC void
+S_dump_exec_pos(pTHX_ const char *locinput,
+ const regnode *scan,
+ const char *loc_regeol,
+ const char *loc_bostr,
+ const char *loc_reg_starttry,
+ const bool do_utf8)
{
- const int docolor = *PL_colors[0];
+ const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
- int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
+ int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
/* The part of the string before starttry has one color
(pref0_len chars), between starttry and current
position another one (pref_len - pref0_len chars),
after the current position the third one.
We assume that pref0_len <= pref_len, otherwise we
decrease pref0_len. */
- int pref_len = (locinput - PL_bostr) > (5 + taill) - l
- ? (5 + taill) - l : locinput - PL_bostr;
+ int pref_len = (locinput - loc_bostr) > (5 + taill) - l
+ ? (5 + taill) - l : locinput - loc_bostr;
int pref0_len;
while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
pref_len++;
- pref0_len = pref_len - (locinput - PL_reg_starttry);
- if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
- l = ( PL_regeol - locinput > (5 + taill) - pref_len
- ? (5 + taill) - pref_len : PL_regeol - locinput);
+ pref0_len = pref_len - (locinput - loc_reg_starttry);
+ if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
+ l = ( loc_regeol - locinput > (5 + taill) - pref_len
+ ? (5 + taill) - pref_len : loc_regeol - locinput);
while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
l--;
if (pref0_len < 0)
if (pref0_len > pref_len)
pref0_len = pref_len;
{
- const char * const s0 =
- do_utf8 && OP(scan) != CANY ?
- pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
- pref0_len, 60, UNI_DISPLAY_REGEX) :
- locinput - pref_len;
- const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
- const char * const s1 = do_utf8 && OP(scan) != CANY ?
- pv_uni_display(PERL_DEBUG_PAD(1),
- (U8*)(locinput - pref_len + pref0_len),
- pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
- locinput - pref_len + pref0_len;
- const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
- const char * const s2 = do_utf8 && OP(scan) != CANY ?
- pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
- PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
- locinput;
- const int len2 = do_utf8 ? (int)strlen(s2) : l;
- PerlIO_printf(Perl_debug_log,
- "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
- (IV)(locinput - PL_bostr),
- PL_colors[4],
+ const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
+
+ RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
+ (locinput - pref_len),pref0_len, pref0_len, 4, 5);
+
+ RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
+ (locinput - pref_len + pref0_len),
+ pref_len - pref0_len, pref_len - pref0_len, 2, 3);
+
+ RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
+ locinput, loc_regeol - locinput, l, 0, 1);
+
+ PerlIO_printf(Perl_debug_log,
+ "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
+ (IV)(locinput - loc_bostr),
len0, s0,
- PL_colors[5],
- PL_colors[2],
len1, s1,
- PL_colors[3],
(docolor ? "" : "> <"),
- PL_colors[0],
len2, s2,
- PL_colors[1],
15 - l - pref_len + 1,
"");
}
}
+
#endif
STATIC I32 /* 0 failure, 1 success */
/* these variables are NOT saved during a recusive RFEGMATCH: */
register I32 nextchr; /* is always set to UCHARAT(locinput) */
- bool result; /* return value of S_regmatch */
+ bool result = 0; /* return value of S_regmatch */
int depth = 0; /* depth of recursion */
regmatch_state *yes_state = NULL; /* state to pop to on success of
subpattern */
U32 state_num;
+ I32 parenfloor = 0;
+
#ifdef DEBUGGING
GET_RE_DEBUG_FLAGS_DECL;
PL_regindent++;
st->sw = 0;
st->logical = 0;
st->cc = NULL;
+
/* Note that nextchr is a byte even in UTF */
nextchr = UCHARAT(locinput);
scan = prog;
DEBUG_EXECUTE_r( {
SV * const prop = sv_newmortal();
- dump_exec_pos( locinput, scan, do_utf8 );
+ DUMP_EXEC_POS( locinput, scan, do_utf8 );
regprop(rex, prop, scan);
PerlIO_printf(Perl_debug_log,
#undef ST
#define ST st->u.trie
-
+ case TRIEC:
+ /* In this case the charclass data is available inline so
+ we can fail fast without a lot of extra overhead.
+ */
+ if (scan->flags == EXACT || !do_utf8) {
+ if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s %sfailed to match trie start class...%s\n",
+ REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
+ );
+ sayNO_SILENT;
+ /* NOTREACHED */
+ }
+ }
+ /* FALL THROUGH */
case TRIE:
{
/* what type of TRIE am I? (utf8 makes this contextual) */
= (reg_trie_data*)rex->data->data[ ARG( scan ) ];
U32 state = trie->startstate;
- U8 *uc = ( U8* )locinput;
- U16 charid = 0;
- U32 base = 0;
- UV uvc = 0;
- STRLEN len = 0;
- STRLEN foldlen = 0;
- U8 *uscan = (U8*)NULL;
- STRLEN bufflen=0;
- SV *sv_accept_buff = NULL;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-
- ST.accepted = 0; /* how many accepting states we have seen */
- ST.B = next;
-#ifdef DEBUGGING
- ST.me = scan;
-#endif
-
if (trie->bitmap && trie_type != trie_utf8_fold &&
!TRIE_BITMAP_TEST(trie,*locinput)
) {
} else {
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
- "%*s %sfailed to match start class...%s\n",
+ "%*s %sfailed to match trie start class...%s\n",
REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
);
sayNO_SILENT;
}
}
+ {
+ U8 *uc = ( U8* )locinput;
+
+ STRLEN len = 0;
+ STRLEN foldlen = 0;
+ U8 *uscan = (U8*)NULL;
+ STRLEN bufflen=0;
+ SV *sv_accept_buff = NULL;
+ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+
+ ST.accepted = 0; /* how many accepting states we have seen */
+ ST.B = next;
+ ST.jump = trie->jump;
+
+#ifdef DEBUGGING
+ ST.me = scan;
+#endif
+
+
+
/*
traverse the TRIE keeping track of all accepting states
we transition through until we get to a failing node.
*/
while ( state && uc <= (U8*)PL_regeol ) {
-
- if (trie->states[ state ].wordnum) {
- if (!ST.accepted ) {
+ U32 base = trie->states[ state ].trans.base;
+ UV uvc;
+ U16 charid;
+ /* We use charid to hold the wordnum as we don't use it
+ for charid until after we have done the wordnum logic.
+ We define an alias just so that the wordnum logic reads
+ more naturally. */
+
+#define got_wordnum charid
+ got_wordnum = trie->states[ state ].wordnum;
+
+ if ( got_wordnum ) {
+ if ( ! ST.accepted ) {
ENTER;
SAVETMPS;
bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
sv_accept_buff=newSV(bufflen *
sizeof(reg_trie_accepted) - 1);
- SvCUR_set(sv_accept_buff,
- sizeof(reg_trie_accepted));
+ SvCUR_set(sv_accept_buff, 0);
SvPOK_on(sv_accept_buff);
sv_2mortal(sv_accept_buff);
SAVETMPS;
ST.accept_buff =
(reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
}
- else {
+ do {
if (ST.accepted >= bufflen) {
bufflen *= 2;
ST.accept_buff =(reg_trie_accepted*)
}
SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
+ sizeof(reg_trie_accepted));
- }
- ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
- ST.accept_buff[ST.accepted].endpos = uc;
- ++ST.accepted;
- }
- base = trie->states[ state ].trans.base;
+
+ ST.accept_buff[ST.accepted].wordnum = got_wordnum;
+ ST.accept_buff[ST.accepted].endpos = uc;
+ ++ST.accepted;
+ } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
+ }
+#undef got_wordnum
DEBUG_TRIE_EXECUTE_r({
- dump_exec_pos( (char *)uc, scan, do_utf8 );
+ DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
PerlIO_printf( Perl_debug_log,
- "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
+ "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2+PL_regindent * 2, "", PL_colors[4],
- (UV)state, (UV)base, (UV)ST.accepted );
+ (UV)state, (UV)ST.accepted );
});
if ( base ) {
- switch (trie_type) {
- case trie_utf8_fold:
- if ( foldlen>0 ) {
- uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
- foldlen -= len;
- uscan += len;
- len=0;
- } else {
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
- uvc = to_uni_fold( uvc, foldbuf, &foldlen );
- foldlen -= UNISKIP( uvc );
- uscan = foldbuf + UNISKIP( uvc );
- }
- break;
- case trie_utf8:
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
- &len, uniflags );
- break;
- case trie_plain:
- uvc = (UV)*uc;
- len = 1;
- }
-
- if (uvc < 256) {
- charid = trie->charmap[ uvc ];
- }
- else {
- charid = 0;
- if (trie->widecharmap) {
- SV** const svpp = hv_fetch(trie->widecharmap,
- (char*)&uvc, sizeof(UV), 0);
- if (svpp)
- charid = (U16)SvIV(*svpp);
- }
- }
+ REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
+ uvc, charid, foldlen, foldbuf, uniflags);
if (charid &&
(base + charid > trie->uniquecharcount )
}
DEBUG_TRIE_EXECUTE_r(
PerlIO_printf( Perl_debug_log,
- "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
+ "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
charid, uvc, (UV)state, PL_colors[5] );
);
}
REPORT_CODE_OFF + PL_regindent * 2, "",
PL_colors[4], (IV)ST.accepted, PL_colors[5] );
);
- }
+ }}
/* FALL THROUGH */
LEAVE;
locinput = PL_reginput;
nextchr = UCHARAT(locinput);
- scan = ST.B;
+
+ if ( !ST.jump )
+ scan = ST.B;
+ else
+ scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
+
continue; /* execute rest of RE */
}
best = ST.accepted;
}
PL_reginput = (char *)ST.accept_buff[ best ].endpos;
+ if ( !ST.jump ) {
+ PUSH_STATE_GOTO(TRIE_next, ST.B);
+ /* NOTREACHED */
+ } else {
+ PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
+ /* NOTREACHED */
+ }
+ /* NOTREACHED */
}
- PUSH_STATE_GOTO(TRIE_next, ST.B);
/* NOTREACHED */
#undef ST
}
/* run the pattern returned from (??{...}) */
-
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "Entering embedded \"%s%.60s%s%s\"\n",
- PL_colors[0],
- re->precomp,
- PL_colors[1],
- (strlen(re->precomp) > 60 ? "..." : ""))
+ debug_start_match(re, do_utf8, locinput, PL_regeol,
+ "Matching embedded");
);
ST.cp = regcppush(0); /* Save *all* the positions. */
case CURLYX: {
/* No need to save/restore up to this paren */
- I32 parenfloor = scan->flags;
-
+ parenfloor = scan->flags;
+
/* Dave says:
CURLYX and WHILEM are always paired: they're the moral
PL_reg_leftiter = PL_reg_maxiter;
}
if (PL_reg_leftiter-- == 0) {
- const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
+ const I32 size = (PL_reg_maxiter + 7)/8;
if (PL_reg_poscache) {
if ((I32)PL_reg_poscache_size < size) {
Renew(PL_reg_poscache, size, char);
if (PL_reg_leftiter < 0) {
st->u.whilem.cache_offset = locinput - PL_bostr;
- st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
+ st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
+ st->u.whilem.cache_offset * (scan->flags>>4);
st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
st->u.whilem.cache_offset /= 8;
"%*s already tried at this position...\n",
REPORT_CODE_OFF+PL_regindent*2, "")
);
- if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
- /* cache records success */
- sayYES;
- else
- /* cache records failure */
- sayNO_SILENT;
+ sayNO; /* cache records failure */
}
}
}
st->cc = st->u.whilem.savecc;
if (result) {
regcpblow(st->u.whilem.cp);
- CACHEsayYES; /* All done. */
+ sayYES; /* All done. */
}
REGCP_UNWIND(st->u.whilem.lastcp);
regcppop(rex);
/*** all unsaved local vars undefined at this point */
if (result) {
regcpblow(st->u.whilem.cp);
- CACHEsayYES;
+ sayYES;
}
REGCP_UNWIND(st->u.whilem.lastcp);
regcppop(rex);
/*** all unsaved local vars undefined at this point */
if (result) {
regcpblow(st->u.whilem.cp);
- CACHEsayYES;
+ sayYES;
}
REGCP_UNWIND(st->u.whilem.lastcp);
regcppop(rex); /* Restore some previous $<digit>s? */
/*** all unsaved local vars undefined at this point */
st->cc = st->u.whilem.savecc;
if (result)
- CACHEsayYES;
+ sayYES;
if (st->cc->u.curlyx.outercc)
st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
st->cc->u.curlyx.cur = n - 1;
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
"%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
- (int)(REPORT_CODE_OFF+PL_regindent*2), "",
+ (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
(IV) ST.count, (IV)ST.alen)
);
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
"%*s CURLYM trying tail with matches=%"IVdf"...\n",
- (int)(REPORT_CODE_OFF+PL_regindent*2),
+ (int)(REPORT_CODE_OFF+(PL_regindent*2)),
"", (IV)ST.count)
);
if (ST.c1 != CHRTEST_VOID
locinput = HOPc(locinput, -ST.alen);
goto curlym_do_B; /* try to match B */
+#undef ST
+#define ST st->u.curly
- case CURLYN:
- st->u.plus.paren = scan->flags; /* Which paren to set */
- if (st->u.plus.paren > PL_regsize)
- PL_regsize = st->u.plus.paren;
- if (st->u.plus.paren > (I32)*PL_reglastparen)
- *PL_reglastparen = st->u.plus.paren;
- st->ln = ARG1(scan); /* min to match */
- n = ARG2(scan); /* max to match */
- scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
- goto repeat;
- case CURLY:
- st->u.plus.paren = 0;
- st->ln = ARG1(scan); /* min to match */
- n = ARG2(scan); /* max to match */
- scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
- goto repeat;
- case STAR:
- st->ln = 0;
- n = REG_INFTY;
+#define CURLY_SETPAREN(paren, success) \
+ if (paren) { \
+ if (success) { \
+ PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
+ PL_regendp[paren] = locinput - PL_bostr; \
+ } \
+ else \
+ PL_regendp[paren] = -1; \
+ }
+
+ case STAR: /* /A*B/ where A is width 1 */
+ ST.paren = 0;
+ ST.min = 0;
+ ST.max = REG_INFTY;
scan = NEXTOPER(scan);
- st->u.plus.paren = 0;
goto repeat;
- case PLUS:
- st->ln = 1;
- n = REG_INFTY;
+ case PLUS: /* /A+B/ where A is width 1 */
+ ST.paren = 0;
+ ST.min = 1;
+ ST.max = REG_INFTY;
scan = NEXTOPER(scan);
- st->u.plus.paren = 0;
+ goto repeat;
+ case CURLYN: /* /(A){m,n}B/ where A is width 1 */
+ ST.paren = scan->flags; /* Which paren to set */
+ if (ST.paren > PL_regsize)
+ PL_regsize = ST.paren;
+ if (ST.paren > (I32)*PL_reglastparen)
+ *PL_reglastparen = ST.paren;
+ ST.min = ARG1(scan); /* min to match */
+ ST.max = ARG2(scan); /* max to match */
+ scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+ goto repeat;
+ case CURLY: /* /A{m,n}B/ where A is width 1 */
+ ST.paren = 0;
+ ST.min = ARG1(scan); /* min to match */
+ ST.max = ARG2(scan); /* max to match */
+ scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
repeat:
/*
* Lookahead to avoid useless match attempts
* when we know what character comes next.
- */
-
- /*
+ *
* Used to only do .*x and .*?x, but now it allows
* for )'s, ('s and (?{ ... })'s to be in the way
* of the quantifier and the EXACT-like node. -- japhy
*/
+ if (ST.min > ST.max) /* XXX make this a compile-time check? */
+ sayNO;
if (HAS_TEXT(next) || JUMPABLE(next)) {
U8 *s;
regnode *text_node = next;
FIND_NEXT_IMPT(text_node);
if (! HAS_TEXT(text_node))
- st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
+ ST.c1 = ST.c2 = CHRTEST_VOID;
else {
if (PL_regkind[OP(text_node)] == REF) {
- st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
+ ST.c1 = ST.c2 = CHRTEST_VOID;
goto assume_ok_easy;
}
else
s = (U8*)STRING(text_node);
if (!UTF) {
- st->u.plus.c2 = st->u.plus.c1 = *s;
+ ST.c2 = ST.c1 = *s;
if (OP(text_node) == EXACTF || OP(text_node) == REFF)
- st->u.plus.c2 = PL_fold[st->u.plus.c1];
+ ST.c2 = PL_fold[ST.c1];
else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
- st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
+ ST.c2 = PL_fold_locale[ST.c1];
}
else { /* UTF */
if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
-
- st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
- uniflags);
- st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
- uniflags);
+#ifdef EBCDIC
+ ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
+ ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
+#else
+ ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
+ uniflags);
+ ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
+ uniflags);
+#endif
}
else {
- st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
+ ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
uniflags);
}
}
}
}
else
- st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
+ ST.c1 = ST.c2 = CHRTEST_VOID;
assume_ok_easy:
+
+ ST.A = scan;
+ ST.B = next;
PL_reginput = locinput;
if (st->minmod) {
st->minmod = 0;
- if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
+ if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
sayNO;
+ ST.count = ST.min;
locinput = PL_reginput;
- REGCP_SET(st->u.plus.lastcp);
- if (st->u.plus.c1 != CHRTEST_VOID) {
- st->u.plus.old = locinput;
- st->u.plus.count = 0;
-
- if (n == REG_INFTY) {
- st->u.plus.e = PL_regeol - 1;
- if (do_utf8)
- while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
- st->u.plus.e--;
- }
- else if (do_utf8) {
- int m = n - st->ln;
- for (st->u.plus.e = locinput;
- m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
- st->u.plus.e += UTF8SKIP(st->u.plus.e);
+ REGCP_SET(ST.cp);
+ if (ST.c1 == CHRTEST_VOID)
+ goto curly_try_B_min;
+
+ ST.oldloc = locinput;
+
+ /* set ST.maxpos to the furthest point along the
+ * string that could possibly match */
+ if (ST.max == REG_INFTY) {
+ ST.maxpos = PL_regeol - 1;
+ if (do_utf8)
+ while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
+ ST.maxpos--;
+ }
+ else if (do_utf8) {
+ int m = ST.max - ST.min;
+ for (ST.maxpos = locinput;
+ m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
+ ST.maxpos += UTF8SKIP(ST.maxpos);
+ }
+ else {
+ ST.maxpos = locinput + ST.max - ST.min;
+ if (ST.maxpos >= PL_regeol)
+ ST.maxpos = PL_regeol - 1;
+ }
+ goto curly_try_B_min_known;
+
+ }
+ else {
+ ST.count = regrepeat(rex, ST.A, ST.max);
+ locinput = PL_reginput;
+ if (ST.count < ST.min)
+ sayNO;
+ if ((ST.count > ST.min)
+ && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
+ {
+ /* A{m,n} must come at the end of the string, there's
+ * no point in backing off ... */
+ ST.min = ST.count;
+ /* ...except that $ and \Z can match before *and* after
+ newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
+ We may back off by one in this case. */
+ if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
+ ST.min--;
+ }
+ REGCP_SET(ST.cp);
+ goto curly_try_B_max;
+ }
+ /* NOTREACHED */
+
+
+ case CURLY_B_min_known_fail:
+ /* failed to find B in a non-greedy match where c1,c2 valid */
+ if (ST.paren && ST.count)
+ PL_regendp[ST.paren] = -1;
+
+ PL_reginput = locinput; /* Could be reset... */
+ REGCP_UNWIND(ST.cp);
+ /* Couldn't or didn't -- move forward. */
+ ST.oldloc = locinput;
+ if (do_utf8)
+ locinput += UTF8SKIP(locinput);
+ else
+ locinput++;
+ ST.count++;
+ curly_try_B_min_known:
+ /* find the next place where 'B' could work, then call B */
+ {
+ int n;
+ if (do_utf8) {
+ n = (ST.oldloc == locinput) ? 0 : 1;
+ if (ST.c1 == ST.c2) {
+ STRLEN len;
+ /* set n to utf8_distance(oldloc, locinput) */
+ while (locinput <= ST.maxpos &&
+ utf8n_to_uvchr((U8*)locinput,
+ UTF8_MAXBYTES, &len,
+ uniflags) != (UV)ST.c1) {
+ locinput += len;
+ n++;
+ }
}
else {
- st->u.plus.e = locinput + n - st->ln;
- if (st->u.plus.e >= PL_regeol)
- st->u.plus.e = PL_regeol - 1;
- }
- while (1) {
- /* Find place 'next' could work */
- if (!do_utf8) {
- if (st->u.plus.c1 == st->u.plus.c2) {
- while (locinput <= st->u.plus.e &&
- UCHARAT(locinput) != st->u.plus.c1)
- locinput++;
- } else {
- while (locinput <= st->u.plus.e
- && UCHARAT(locinput) != st->u.plus.c1
- && UCHARAT(locinput) != st->u.plus.c2)
- locinput++;
- }
- st->u.plus.count = locinput - st->u.plus.old;
- }
- else {
- if (st->u.plus.c1 == st->u.plus.c2) {
- STRLEN len;
- /* count initialised to
- * utf8_distance(old, locinput) */
- while (locinput <= st->u.plus.e &&
- utf8n_to_uvchr((U8*)locinput,
- UTF8_MAXBYTES, &len,
- uniflags) != (UV)st->u.plus.c1) {
- locinput += len;
- st->u.plus.count++;
- }
- } else {
- /* count initialised to
- * utf8_distance(old, locinput) */
- while (locinput <= st->u.plus.e) {
- STRLEN len;
- const UV c = utf8n_to_uvchr((U8*)locinput,
- UTF8_MAXBYTES, &len,
- uniflags);
- if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
- break;
- locinput += len;
- st->u.plus.count++;
- }
- }
- }
- if (locinput > st->u.plus.e)
- sayNO;
- /* PL_reginput == old now */
- if (locinput != st->u.plus.old) {
- st->ln = 1; /* Did some */
- if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
- sayNO;
+ /* set n to utf8_distance(oldloc, locinput) */
+ while (locinput <= ST.maxpos) {
+ STRLEN len;
+ const UV c = utf8n_to_uvchr((U8*)locinput,
+ UTF8_MAXBYTES, &len,
+ uniflags);
+ if (c == (UV)ST.c1 || c == (UV)ST.c2)
+ break;
+ locinput += len;
+ n++;
}
- /* PL_reginput == locinput now */
- PL_reginput = locinput; /* Could be reset... */
- TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
- /*** all unsaved local vars undefined at this point */
-
- REGCP_UNWIND(st->u.plus.lastcp);
- /* Couldn't or didn't -- move forward. */
- st->u.plus.old = locinput;
- if (do_utf8)
- locinput += UTF8SKIP(locinput);
- else
- locinput++;
- st->u.plus.count = 1;
}
}
- else
- while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
- UV c;
- if (st->u.plus.c1 != CHRTEST_VOID) {
- if (do_utf8)
- c = utf8n_to_uvchr((U8*)PL_reginput,
- UTF8_MAXBYTES, 0,
- uniflags);
- else
- c = UCHARAT(PL_reginput);
- /* If it could work, try it. */
- if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
- TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
- /*** all unsaved local vars undefined at this point */
- REGCP_UNWIND(st->u.plus.lastcp);
- }
- }
- /* If it could work, try it. */
- else if (st->u.plus.c1 == CHRTEST_VOID) {
- TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
- /*** all unsaved local vars undefined at this point */
- REGCP_UNWIND(st->u.plus.lastcp);
+ else {
+ if (ST.c1 == ST.c2) {
+ while (locinput <= ST.maxpos &&
+ UCHARAT(locinput) != ST.c1)
+ locinput++;
}
- /* Couldn't or didn't -- move forward. */
- PL_reginput = locinput;
- if (regrepeat(rex, scan, 1)) {
- st->ln++;
- locinput = PL_reginput;
+ else {
+ while (locinput <= ST.maxpos
+ && UCHARAT(locinput) != ST.c1
+ && UCHARAT(locinput) != ST.c2)
+ locinput++;
}
- else
+ n = locinput - ST.oldloc;
+ }
+ if (locinput > ST.maxpos)
+ sayNO;
+ /* PL_reginput == oldloc now */
+ if (n) {
+ ST.count += n;
+ if (regrepeat(rex, ST.A, n) < n)
sayNO;
}
+ PL_reginput = locinput;
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
}
- else {
- n = regrepeat(rex, scan, n);
+ /* NOTREACHED */
+
+
+ case CURLY_B_min_fail:
+ /* failed to find B in a non-greedy match where c1,c2 invalid */
+ if (ST.paren && ST.count)
+ PL_regendp[ST.paren] = -1;
+
+ REGCP_UNWIND(ST.cp);
+ /* failed -- move forward one */
+ PL_reginput = locinput;
+ if (regrepeat(rex, ST.A, 1)) {
+ ST.count++;
locinput = PL_reginput;
- if ((st->ln < n) && (PL_regkind[OP(next)] == EOL) &&
- (OP(next) != MEOL || OP(next) == SEOL || OP(next) == EOS))
- {
- st->ln = n; /* why back off? */
- /* ...because $ and \Z can match before *and* after
- newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
- We should back off by one in this case. */
- if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
- st->ln--;
- }
- REGCP_SET(st->u.plus.lastcp);
+ if (ST.count <= ST.max || (ST.max == REG_INFTY &&
+ ST.count > 0)) /* count overflow ? */
{
- while (n >= st->ln) {
- UV c = 0;
- if (st->u.plus.c1 != CHRTEST_VOID) {
- if (do_utf8)
- c = utf8n_to_uvchr((U8*)PL_reginput,
- UTF8_MAXBYTES, 0,
- uniflags);
- else
- c = UCHARAT(PL_reginput);
- }
- /* If it could work, try it. */
- if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
- TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
- /*** all unsaved local vars undefined at this point */
- REGCP_UNWIND(st->u.plus.lastcp);
- }
- /* Couldn't or didn't -- back up. */
- n--;
- PL_reginput = locinput = HOPc(locinput, -1);
- }
+ curly_try_B_min:
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_min, ST.B);
}
}
sayNO;
- break;
+ /* NOTREACHED */
+
+
+ curly_try_B_max:
+ /* a successful greedy match: now try to match B */
+ {
+ UV c = 0;
+ if (ST.c1 != CHRTEST_VOID)
+ c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
+ UTF8_MAXBYTES, 0, uniflags)
+ : (UV) UCHARAT(PL_reginput);
+ /* If it could work, try it. */
+ if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_max, ST.B);
+ /* NOTREACHED */
+ }
+ }
+ /* FALL THROUGH */
+ case CURLY_B_max_fail:
+ /* failed to find B in a greedy match */
+ if (ST.paren && ST.count)
+ PL_regendp[ST.paren] = -1;
+
+ REGCP_UNWIND(ST.cp);
+ /* back up. */
+ if (--ST.count < ST.min)
+ sayNO;
+ PL_reginput = locinput = HOPc(locinput, -1);
+ goto curly_try_B_max;
+
+#undef ST
+
+
case END:
if (locinput < reginfo->till) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
regmatch_state *newst;
depth++;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_STATE_r(PerlIO_printf(Perl_debug_log,
"PUSH STATE(%d)\n", depth));
st->locinput = locinput;
newst = st+1;
newst->sw = 0;
newst->logical = 0;
+
locinput = PL_reginput;
nextchr = UCHARAT(locinput);
st = newst;
st->minmod = 0;
st->sw = 0;
st->logical = 0;
+
#ifdef DEBUGGING
PL_regindent++;
#endif
st = SLAB_LAST(PL_regmatch_slab);
}
depth -= (st - yes_state);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
- depth+1, depth+(st - yes_state)));
+ DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%"UVuf"..%"UVuf")\n",
+ (UV)(depth+1), (UV)(depth+(st - yes_state))));
st = yes_state;
yes_state = st->u.yes.prev_yes_state;
PL_regmatch_state = st;
case CURLYM_B:
case BRANCH_next:
case TRIE_next:
+ case CURLY_B_max:
default:
Perl_croak(aTHX_ "unexpected yes resume state");
}
* will disappear when REGFMATCH goes */
if (depth) {
/* restore previous state and re-enter */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
+ DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
depth--;
st--;
if (st < SLAB_FIRST(PL_regmatch_slab)) {
goto resume_point_WHILEM5;
case resume_WHILEM6:
goto resume_point_WHILEM6;
- case resume_PLUS1:
- goto resume_point_PLUS1;
- case resume_PLUS2:
- goto resume_point_PLUS2;
- case resume_PLUS3:
- goto resume_point_PLUS3;
- case resume_PLUS4:
- goto resume_point_PLUS4;
case TRIE_next:
case CURLYM_A:
case EVAL_A:
case IFMATCH_A:
case BRANCH_next:
+ case CURLY_B_max:
+ case CURLY_B_min:
+ case CURLY_B_min_known:
break;
default:
no:
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
- "%*s %sfailed...%s\n",
- REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
+ "%*s %sfailed...%s\n",
+ REPORT_CODE_OFF+PL_regindent*2, "",
+ PL_colors[4], PL_colors[5])
);
no_final:
do_no:
if (depth) {
/* there's a previous state to backtrack to */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
+ DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
depth--;
st--;
if (st < SLAB_FIRST(PL_regmatch_slab)) {
case CURLYM_A:
case CURLYM_B:
case IFMATCH_A:
+ case CURLY_B_max:
+ case CURLY_B_min:
+ case CURLY_B_min_known:
if (yes_state == st)
yes_state = st->u.yes.prev_yes_state;
state_num = st->resume_state + 1; /* failure = success + 1 */
goto reenter_switch;
- case resume_PLUS1:
- goto resume_point_PLUS1;
- case resume_PLUS2:
- goto resume_point_PLUS2;
- case resume_PLUS3:
- goto resume_point_PLUS3;
- case resume_PLUS4:
- goto resume_point_PLUS4;
default:
Perl_croak(aTHX_ "regexp resume memory corruption");
}
c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
/* see [perl #37836] for UTF8_ALLOW_ANYUV */
- if (len == (STRLEN)-1)
+ if (len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
}