* it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
* This causes the main functions to be compiled under new names and with
* debugging support added, which makes "use re 'debug'" work.
-
*/
/* NOTE: this is derived from Henry Spencer's regexp code, and should not
#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
-#define TRYPAREN(paren, n, input, where) { \
- if (paren) { \
- if (n) { \
- PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
- PL_regendp[paren] = input - PL_bostr; \
- } \
- else \
- PL_regendp[paren] = -1; \
- } \
- REGMATCH(next, where); \
- if (result) \
- sayYES; \
- if (paren && n) \
- PL_regendp[paren] = -1; \
-}
-
-
/*
* pregexec and friends
*/
const I32 multiline = prog->reganch & PMf_MULTILINE;
#ifdef DEBUGGING
const char * const i_strpos = strpos;
- SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
#endif
GET_RE_DEBUG_FLAGS_DECL;
}
DEBUG_EXECUTE_r({
- const char *s = PL_reg_match_utf8 ?
- sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
- strpos;
- const int len = PL_reg_match_utf8 ?
- (int)strlen(s) : strend - strpos;
+ RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
+ PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
+
if (!PL_colorset)
reginitcolors();
if (PL_reg_match_utf8)
/* annoyingly all the vars in this routine have different names from their counterparts
in regmatch. /grrr */
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
+foldlen, foldbuf, uniflags) STMT_START { \
+ switch (trie_type) { \
+ case trie_utf8_fold: \
+ if ( foldlen>0 ) { \
+ uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ uscan += len; \
+ len=0; \
+ } else { \
+ uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
+ foldlen -= UNISKIP( uvc ); \
+ uscan = foldbuf + UNISKIP( uvc ); \
+ } \
+ break; \
+ case trie_utf8: \
+ uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
+ break; \
+ case trie_plain: \
+ uvc = (UV)*uc; \
+ len = 1; \
+ } \
+ \
+ if (uvc < 256) { \
+ charid = trie->charmap[ uvc ]; \
+ } \
+ else { \
+ charid = 0; \
+ if (trie->widecharmap) { \
+ SV** const svpp = hv_fetch(trie->widecharmap, \
+ (char*)&uvc, sizeof(UV), 0); \
+ if (svpp) \
+ charid = (U16)SvIV(*svpp); \
+ } \
+ } \
+} STMT_END
+
+#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
+ if ( (CoNd) \
+ && (ln == len || \
+ ibcmp_utf8(s, NULL, 0, do_utf8, \
+ m, NULL, ln, (bool)UTF)) \
+ && (!reginfo || regtry(reginfo, s)) ) \
+ goto got_it; \
+ else { \
+ U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
+ uvchr_to_utf8(tmpbuf, c); \
+ f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
+ if ( f != c \
+ && (f == c1 || f == c2) \
+ && (ln == foldlen || \
+ !ibcmp_utf8((char *) foldbuf, \
+ NULL, foldlen, do_utf8, \
+ m, \
+ NULL, ln, (bool)UTF)) \
+ && (!reginfo || regtry(reginfo, s)) ) \
+ goto got_it; \
+ } \
+ s += len
+
+#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
+STMT_START { \
+ while (s <= e) { \
+ if ( (CoNd) \
+ && (ln == 1 || !(OP(c) == EXACTF \
+ ? ibcmp(s, m, ln) \
+ : ibcmp_locale(s, m, ln))) \
+ && (!reginfo || regtry(reginfo, s)) ) \
+ goto got_it; \
+ s++; \
+ } \
+} STMT_END
+
+#define REXEC_FBC_UTF8_SCAN(CoDe) \
+STMT_START { \
+ while (s + (uskip = UTF8SKIP(s)) <= strend) { \
+ CoDe \
+ s += uskip; \
+ } \
+} STMT_END
+
+#define REXEC_FBC_SCAN(CoDe) \
+STMT_START { \
+ while (s < strend) { \
+ CoDe \
+ s++; \
+ } \
+} STMT_END
+
+#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
+REXEC_FBC_UTF8_SCAN( \
+ if (CoNd) { \
+ if (tmp && (!reginfo || regtry(reginfo, s))) \
+ goto got_it; \
+ else \
+ tmp = doevery; \
+ } \
+ else \
+ tmp = 1; \
+)
+
+#define REXEC_FBC_CLASS_SCAN(CoNd) \
+REXEC_FBC_SCAN( \
+ if (CoNd) { \
+ if (tmp && (!reginfo || regtry(reginfo, s))) \
+ goto got_it; \
+ else \
+ tmp = doevery; \
+ } \
+ else \
+ tmp = 1; \
+)
+
+#define REXEC_FBC_TRYIT \
+if ((!reginfo || regtry(reginfo, s))) \
+ goto got_it
+
+#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
+ if (do_utf8) { \
+ UtFpReLoAd; \
+ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
+ } \
+ else { \
+ REXEC_FBC_CLASS_SCAN(CoNd); \
+ } \
+ break
+
+#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
+ PL_reg_flags |= RF_tainted; \
+ if (do_utf8) { \
+ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
+ } \
+ else { \
+ REXEC_FBC_CLASS_SCAN(CoNd); \
+ } \
+ break
+
STATIC char *
S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
const char *strend, const regmatch_info *reginfo)
switch (OP(c)) {
case ANYOF:
if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+ REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
!UTF8_IS_INVARIANT((U8)s[0]) ?
reginclass(prog, c, (U8*)s, 0, do_utf8) :
- REGINCLASS(prog, c, (U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
+ REGINCLASS(prog, c, (U8*)s));
}
else {
while (s < strend) {
}
break;
case CANY:
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (tmp && (!reginfo || regtry(reginfo, s)))
goto got_it;
else
tmp = doevery;
- s++;
- }
+ );
break;
case EXACTF:
m = STRING(c);
while (s <= e) {
c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
uniflags);
- if ( c == c1
- && (ln == len ||
- ibcmp_utf8(s, NULL, 0, do_utf8,
- m, NULL, ln, (bool)UTF))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- else {
- U8 foldbuf[UTF8_MAXBYTES_CASE+1];
- uvchr_to_utf8(tmpbuf, c);
- f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
- if ( f != c
- && (f == c1 || f == c2)
- && (ln == foldlen ||
- !ibcmp_utf8((char *) foldbuf,
- NULL, foldlen, do_utf8,
- m,
- NULL, ln, (bool)UTF))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- }
- s += len;
+ REXEC_FBC_EXACTISH_CHECK(c == c1);
}
}
else {
c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
- if ( (c == c1 || c == c2)
- && (ln == len ||
- ibcmp_utf8(s, NULL, 0, do_utf8,
- m, NULL, ln, (bool)UTF))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- else {
- U8 foldbuf[UTF8_MAXBYTES_CASE+1];
- uvchr_to_utf8(tmpbuf, c);
- f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
- if ( f != c
- && (f == c1 || f == c2)
- && (ln == foldlen ||
- !ibcmp_utf8((char *) foldbuf,
- NULL, foldlen, do_utf8,
- m,
- NULL, ln, (bool)UTF))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- }
- s += len;
+ REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
}
}
}
else {
if (c1 == c2)
- while (s <= e) {
- if ( *(U8*)s == c1
- && (ln == 1 || !(OP(c) == EXACTF
- ? ibcmp(s, m, ln)
- : ibcmp_locale(s, m, ln)))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- s++;
- }
+ REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
else
- while (s <= e) {
- if ( (*(U8*)s == c1 || *(U8*)s == c2)
- && (ln == 1 || !(OP(c) == EXACTF
- ? ibcmp(s, m, ln)
- : ibcmp_locale(s, m, ln)))
- && (!reginfo || regtry(reginfo, s)) )
- goto got_it;
- s++;
- }
+ REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
}
break;
case BOUNDL:
tmp = ((OP(c) == BOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
LOAD_UTF8_CHARCLASS_ALNUM();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
+ REXEC_FBC_UTF8_SCAN(
if (tmp == !(OP(c) == BOUND ?
(bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
isALNUM_LC_utf8((U8*)s)))
{
tmp = !tmp;
- if ((!reginfo || regtry(reginfo, s)))
- goto got_it;
- }
- s += uskip;
+ REXEC_FBC_TRYIT;
}
+ );
}
else {
tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (tmp ==
!(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
tmp = !tmp;
- if ((!reginfo || regtry(reginfo, s)))
- goto got_it;
- }
- s++;
+ REXEC_FBC_TRYIT;
}
+ );
}
if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
goto got_it;
tmp = ((OP(c) == NBOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
LOAD_UTF8_CHARCLASS_ALNUM();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
+ REXEC_FBC_UTF8_SCAN(
if (tmp == !(OP(c) == NBOUND ?
(bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
isALNUM_LC_utf8((U8*)s)))
tmp = !tmp;
- else if ((!reginfo || regtry(reginfo, s)))
- goto got_it;
- s += uskip;
- }
+ else REXEC_FBC_TRYIT;
+ );
}
else {
tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
tmp = ((OP(c) == NBOUND ?
isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (tmp ==
!(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
tmp = !tmp;
- else if ((!reginfo || regtry(reginfo, s)))
- goto got_it;
- s++;
- }
+ else REXEC_FBC_TRYIT;
+ );
}
if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
goto got_it;
break;
case ALNUM:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_ALNUM();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isALNUM(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_ALNUM(),
+ swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
+ isALNUM(*s)
+ );
case ALNUML:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (isALNUM_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isALNUM_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ isALNUM_LC_utf8((U8*)s),
+ isALNUM_LC(*s)
+ );
case NALNUM:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_ALNUM();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isALNUM(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_ALNUM(),
+ !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
+ !isALNUM(*s)
+ );
case NALNUML:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!isALNUM_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isALNUM_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ !isALNUM_LC_utf8((U8*)s),
+ !isALNUM_LC(*s)
+ );
case SPACE:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_SPACE();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isSPACE(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_SPACE(),
+ *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
+ isSPACE(*s)
+ );
case SPACEL:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isSPACE_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ *s == ' ' || isSPACE_LC_utf8((U8*)s),
+ isSPACE_LC(*s)
+ );
case NSPACE:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_SPACE();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isSPACE(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_SPACE(),
+ !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
+ !isSPACE(*s)
+ );
case NSPACEL:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isSPACE_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
+ !isSPACE_LC(*s)
+ );
case DIGIT:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_DIGIT();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isDIGIT(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_DIGIT(),
+ swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
+ isDIGIT(*s)
+ );
case DIGITL:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (isDIGIT_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (isDIGIT_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ isDIGIT_LC_utf8((U8*)s),
+ isDIGIT_LC(*s)
+ );
case NDIGIT:
- if (do_utf8) {
- LOAD_UTF8_CHARCLASS_DIGIT();
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isDIGIT(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_PRELOAD(
+ LOAD_UTF8_CHARCLASS_DIGIT(),
+ !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
+ !isDIGIT(*s)
+ );
case NDIGITL:
- PL_reg_flags |= RF_tainted;
- if (do_utf8) {
- while (s + (uskip = UTF8SKIP(s)) <= strend) {
- if (!isDIGIT_LC_utf8((U8*)s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += uskip;
- }
- }
- else {
- while (s < strend) {
- if (!isDIGIT_LC(*s)) {
- if (tmp && (!reginfo || regtry(reginfo, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s++;
- }
- }
- break;
+ REXEC_FBC_CSCAN_TAINT(
+ !isDIGIT_LC_utf8((U8*)s),
+ !isDIGIT_LC(*s)
+ );
case TRIE:
/*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
{
if (base==0) break;
}
points[pointpos++ % maxlen]= uc;
- switch (trie_type) {
- case trie_utf8_fold:
- if ( foldlen>0 ) {
- uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
- foldlen -= len;
- uscan += len;
- len=0;
- } else {
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
- uvc = to_uni_fold( uvc, foldbuf, &foldlen );
- foldlen -= UNISKIP( uvc );
- uscan = foldbuf + UNISKIP( uvc );
- }
- break;
- case trie_utf8:
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
- &len, uniflags );
- break;
- case trie_plain:
- uvc = (UV)*uc;
- len = 1;
- }
-
- if (uvc < 256) {
- charid = trie->charmap[ uvc ];
- }
- else {
- charid = 0;
- if (trie->widecharmap) {
- SV** const svpp = hv_fetch(trie->widecharmap,
- (char*)&uvc, sizeof(UV), 0);
- if (svpp)
- charid = (U16)SvIV(*svpp);
- }
- }
+ REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
+ uvc, charid, foldlen, foldbuf, uniflags);
DEBUG_TRIE_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
"Pos: %d Charid:%3x CV:%4"UVxf" ",
SV* const oreplsv = GvSV(PL_replgv);
const bool do_utf8 = DO_UTF8(sv);
I32 multiline;
-#ifdef DEBUGGING
- SV* dsv0;
- SV* dsv1;
-#endif
+
regmatch_info reginfo; /* create some info to pass to regtry etc */
GET_RE_DEBUG_FLAGS_DECL;
multiline = prog->reganch & PMf_MULTILINE;
reginfo.prog = prog;
-#ifdef DEBUGGING
- dsv0 = PERL_DEBUG_PAD_ZERO(0);
- dsv1 = PERL_DEBUG_PAD_ZERO(1);
-#endif
-
RX_MATCH_UTF8_set(prog, do_utf8);
minlen = prog->minlen;
}
DEBUG_EXECUTE_r({
- const char * const s0 = UTF
- ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
- UNI_DISPLAY_REGEX)
- : prog->precomp;
- const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
- const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
- UNI_DISPLAY_REGEX) : startpos;
- const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
+ RE_PV_DISPLAY_DECL(s0, len0, UTF,
+ PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
+ RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
+ PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
+
if (!PL_colorset)
reginitcolors();
PerlIO_printf(Perl_debug_log,
ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
if (do_utf8) {
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
if (regtry(&reginfo, s)) goto got_it;
while (s < strend && *s == ch)
s += UTF8SKIP(s);
}
- s += UTF8SKIP(s);
- }
+ );
}
else {
- while (s < strend) {
+ REXEC_FBC_SCAN(
if (*s == ch) {
DEBUG_EXECUTE_r( did_match = 1 );
if (regtry(&reginfo, s)) goto got_it;
while (s < strend && *s == ch)
s++;
}
- s++;
- }
+ );
}
DEBUG_EXECUTE_r(if (!did_match)
PerlIO_printf(Perl_debug_log,
}
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
- const char *s0;
- const char *s1;
- int len0;
- int len1;
-
regprop(prog, prop, c);
- s0 = UTF ?
- pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
- UNI_DISPLAY_REGEX) :
- SvPVX_const(prop);
- len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
- s1 = UTF ?
- sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
- len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
- PerlIO_printf(Perl_debug_log,
- "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
- len0, len0, s0,
- len1, len1, s1, (int)(strend - s));
+ {
+ RE_PV_DISPLAY_DECL(s0,len0,UTF,
+ PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
+ RE_PV_DISPLAY_DECL(s1,len1,UTF,
+ PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
+ PerlIO_printf(Perl_debug_log,
+ "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
+ len0, len0, s0,
+ len1, len1, s1, (int)(strend - s));
+ }
});
if (find_byclass(prog, c, s, strend, &reginfo))
goto got_it;
*/
/* *** every FOO_fail should = FOO+1 */
-#define resume_TRIE1 (REGNODE_MAX+1)
-#define resume_TRIE2 (REGNODE_MAX+2)
-#define EVAL_A (REGNODE_MAX+3)
-#define EVAL_A_fail (REGNODE_MAX+4)
-#define resume_CURLYX (REGNODE_MAX+5)
-#define resume_WHILEM1 (REGNODE_MAX+6)
-#define resume_WHILEM2 (REGNODE_MAX+7)
-#define resume_WHILEM3 (REGNODE_MAX+8)
-#define resume_WHILEM4 (REGNODE_MAX+9)
-#define resume_WHILEM5 (REGNODE_MAX+10)
-#define resume_WHILEM6 (REGNODE_MAX+11)
-#define BRANCH_next (REGNODE_MAX+12)
-#define BRANCH_next_fail (REGNODE_MAX+13)
-#define CURLYM_A (REGNODE_MAX+14)
-#define CURLYM_A_fail (REGNODE_MAX+15)
-#define CURLYM_B (REGNODE_MAX+16)
-#define CURLYM_B_fail (REGNODE_MAX+17)
-#define IFMATCH_A (REGNODE_MAX+18)
-#define IFMATCH_A_fail (REGNODE_MAX+19)
-#define resume_PLUS1 (REGNODE_MAX+20)
-#define resume_PLUS2 (REGNODE_MAX+21)
-#define resume_PLUS3 (REGNODE_MAX+22)
-#define resume_PLUS4 (REGNODE_MAX+23)
-
+#define TRIE_next (REGNODE_MAX+1)
+#define TRIE_next_fail (REGNODE_MAX+2)
+#define EVAL_A (REGNODE_MAX+3)
+#define EVAL_A_fail (REGNODE_MAX+4)
+#define resume_CURLYX (REGNODE_MAX+5)
+#define resume_WHILEM1 (REGNODE_MAX+6)
+#define resume_WHILEM2 (REGNODE_MAX+7)
+#define resume_WHILEM3 (REGNODE_MAX+8)
+#define resume_WHILEM4 (REGNODE_MAX+9)
+#define resume_WHILEM5 (REGNODE_MAX+10)
+#define resume_WHILEM6 (REGNODE_MAX+11)
+#define BRANCH_next (REGNODE_MAX+12)
+#define BRANCH_next_fail (REGNODE_MAX+13)
+#define CURLYM_A (REGNODE_MAX+14)
+#define CURLYM_A_fail (REGNODE_MAX+15)
+#define CURLYM_B (REGNODE_MAX+16)
+#define CURLYM_B_fail (REGNODE_MAX+17)
+#define IFMATCH_A (REGNODE_MAX+18)
+#define IFMATCH_A_fail (REGNODE_MAX+19)
+#define CURLY_B_min_known (REGNODE_MAX+20)
+#define CURLY_B_min_known_fail (REGNODE_MAX+21)
+#define CURLY_B_min (REGNODE_MAX+22)
+#define CURLY_B_min_fail (REGNODE_MAX+23)
+#define CURLY_B_max (REGNODE_MAX+24)
+#define CURLY_B_max_fail (REGNODE_MAX+25)
#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
-#ifdef DEBUGGING
-STATIC void
+#ifdef DEBUGGING
+
+STATIC void
S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
{
const int docolor = *PL_colors[0];
if (pref0_len > pref_len)
pref0_len = pref_len;
{
- const char * const s0 =
- do_utf8 && OP(scan) != CANY ?
- pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
- pref0_len, 60, UNI_DISPLAY_REGEX) :
- locinput - pref_len;
- const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
- const char * const s1 = do_utf8 && OP(scan) != CANY ?
- pv_uni_display(PERL_DEBUG_PAD(1),
- (U8*)(locinput - pref_len + pref0_len),
- pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
- locinput - pref_len + pref0_len;
- const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
- const char * const s2 = do_utf8 && OP(scan) != CANY ?
- pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
- PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
- locinput;
- const int len2 = do_utf8 ? (int)strlen(s2) : l;
- PerlIO_printf(Perl_debug_log,
+ const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
+
+ RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
+ (locinput - pref_len),pref0_len, 60);
+
+ RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
+ (locinput - pref_len + pref0_len),
+ pref_len - pref0_len, 60);
+
+ RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
+ locinput, PL_regeol - locinput, 60);
+
+ PerlIO_printf(Perl_debug_log,
"%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
(IV)(locinput - PL_bostr),
PL_colors[4],
"");
}
}
+
#endif
STATIC I32 /* 0 failure, 1 success */
else
nextchr = UCHARAT(++locinput);
break;
+
+#undef ST
+#define ST st->u.trie
+
case TRIE:
{
/* what type of TRIE am I? (utf8 makes this contextual) */
reg_trie_data * const trie
= (reg_trie_data*)rex->data->data[ ARG( scan ) ];
U32 state = trie->startstate;
+
+ U8 *uc = ( U8* )locinput;
+ U16 charid = 0;
+ U32 base = 0;
+ UV uvc = 0;
+ STRLEN len = 0;
+ STRLEN foldlen = 0;
+ U8 *uscan = (U8*)NULL;
+ STRLEN bufflen=0;
+ SV *sv_accept_buff = NULL;
+ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+
+ ST.accepted = 0; /* how many accepting states we have seen */
+ ST.B = next;
+#ifdef DEBUGGING
+ ST.me = scan;
+#endif
if (trie->bitmap && trie_type != trie_utf8_fold &&
!TRIE_BITMAP_TEST(trie,*locinput)
sayNO_SILENT;
}
}
- {
+
/*
traverse the TRIE keeping track of all accepting states
we transition through until we get to a failing node.
*/
- U8 *uc = ( U8* )locinput;
- U16 charid = 0;
- U32 base = 0;
- UV uvc = 0;
- STRLEN len = 0;
- STRLEN foldlen = 0;
- U8 *uscan = (U8*)NULL;
- STRLEN bufflen=0;
- SV *sv_accept_buff = NULL;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-
- st->u.trie.accepted = 0; /* how many accepting states we have seen */
- result = 0;
-
while ( state && uc <= (U8*)PL_regeol ) {
if (trie->states[ state ].wordnum) {
- if (!st->u.trie.accepted ) {
+ if (!ST.accepted ) {
ENTER;
SAVETMPS;
bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
sizeof(reg_trie_accepted));
SvPOK_on(sv_accept_buff);
sv_2mortal(sv_accept_buff);
- st->u.trie.accept_buff =
+ SAVETMPS;
+ ST.accept_buff =
(reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
}
else {
- if (st->u.trie.accepted >= bufflen) {
+ if (ST.accepted >= bufflen) {
bufflen *= 2;
- st->u.trie.accept_buff =(reg_trie_accepted*)
+ ST.accept_buff =(reg_trie_accepted*)
SvGROW(sv_accept_buff,
bufflen * sizeof(reg_trie_accepted));
}
SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
+ sizeof(reg_trie_accepted));
}
- st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
- st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
- ++st->u.trie.accepted;
+ ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
+ ST.accept_buff[ST.accepted].endpos = uc;
+ ++ST.accepted;
}
base = trie->states[ state ].trans.base;
PerlIO_printf( Perl_debug_log,
"%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
2+PL_regindent * 2, "", PL_colors[4],
- (UV)state, (UV)base, (UV)st->u.trie.accepted );
+ (UV)state, (UV)base, (UV)ST.accepted );
});
if ( base ) {
- switch (trie_type) {
- case trie_utf8_fold:
- if ( foldlen>0 ) {
- uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
- foldlen -= len;
- uscan += len;
- len=0;
- } else {
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
- uvc = to_uni_fold( uvc, foldbuf, &foldlen );
- foldlen -= UNISKIP( uvc );
- uscan = foldbuf + UNISKIP( uvc );
- }
- break;
- case trie_utf8:
- uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
- &len, uniflags );
- break;
- case trie_plain:
- uvc = (UV)*uc;
- len = 1;
- }
-
- if (uvc < 256) {
- charid = trie->charmap[ uvc ];
- }
- else {
- charid = 0;
- if (trie->widecharmap) {
- SV** const svpp = hv_fetch(trie->widecharmap,
- (char*)&uvc, sizeof(UV), 0);
- if (svpp)
- charid = (U16)SvIV(*svpp);
- }
- }
+ REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
+ uvc, charid, foldlen, foldbuf, uniflags);
if (charid &&
(base + charid > trie->uniquecharcount )
charid, uvc, (UV)state, PL_colors[5] );
);
}
- if (!st->u.trie.accepted )
+ if (!ST.accepted )
sayNO;
+ DEBUG_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sgot %"IVdf" possible matches%s\n",
+ REPORT_CODE_OFF + PL_regindent * 2, "",
+ PL_colors[4], (IV)ST.accepted, PL_colors[5] );
+ );
+ }
+
+ /* FALL THROUGH */
+
+ case TRIE_next_fail: /* we failed - try next alternative */
+
+ if ( ST.accepted == 1 ) {
+ /* only one choice left - just continue */
+ DEBUG_EXECUTE_r({
+ reg_trie_data * const trie
+ = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
+ SV ** const tmp = RX_DEBUG(reginfo->prog)
+ ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
+ : NULL;
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sonly one match left: #%d <%s>%s\n",
+ REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
+ ST.accept_buff[ 0 ].wordnum,
+ tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
+ PL_colors[5] );
+ });
+ PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
+ /* in this case we free tmps/leave before we call regmatch
+ as we won't be using accept_buff again. */
+ FREETMPS;
+ LEAVE;
+ locinput = PL_reginput;
+ nextchr = UCHARAT(locinput);
+ scan = ST.B;
+ continue; /* execute rest of RE */
+ }
+
+ if (!ST.accepted-- ) {
+ FREETMPS;
+ LEAVE;
+ sayNO;
+ }
+
/*
- There was at least one accepting state that we
- transitioned through. Presumably the number of accepting
- states is going to be low, typically one or two. So we
- simply scan through to find the one with lowest wordnum.
- Once we find it, we swap the last state into its place
- and decrement the size. We then try to match the rest of
- the pattern at the point where the word ends, if we
- succeed then we end the loop, otherwise the loop
- eventually terminates once all of the accepting states
- have been tried.
- */
+ There are at least two accepting states left. Presumably
+ the number of accepting states is going to be low,
+ typically two. So we simply scan through to find the one
+ with lowest wordnum. Once we find it, we swap the last
+ state into its place and decrement the size. We then try to
+ match the rest of the pattern at the point where the word
+ ends. If we succeed, control just continues along the
+ regex; if we fail we return here to try the next accepting
+ state
+ */
- if ( st->u.trie.accepted == 1 ) {
- DEBUG_EXECUTE_r({
- SV ** const tmp = RX_DEBUG(reginfo->prog)
- ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
- : NULL;
+ {
+ U32 best = 0;
+ U32 cur;
+ for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
+ DEBUG_TRIE_EXECUTE_r(
PerlIO_printf( Perl_debug_log,
- "%*s %sonly one match : #%d <%s>%s\n",
- REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
- st->u.trie.accept_buff[ 0 ].wordnum,
- tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
- PL_colors[5] );
- });
- PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
- /* in this case we free tmps/leave before we call regmatch
- as we wont be using accept_buff again. */
- FREETMPS;
- LEAVE;
- /* do we need this? why dont we just do a break? */
- REGMATCH(scan + NEXT_OFF(scan), TRIE1);
- /*** all unsaved local vars undefined at this point */
- } else {
- DEBUG_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
- REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
- PL_colors[5] );
- );
- while ( !result && st->u.trie.accepted-- ) {
- U32 best = 0;
- U32 cur;
- for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
- DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,
- "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
- REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
- (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
- st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
- );
-
- if (st->u.trie.accept_buff[cur].wordnum <
- st->u.trie.accept_buff[best].wordnum)
- best = cur;
- }
- DEBUG_EXECUTE_r({
- reg_trie_data * const trie = (reg_trie_data*)
- rex->data->data[ARG(scan)];
- SV ** const tmp = RX_DEBUG(reginfo->prog)
- ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
- : NULL;
- PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
- REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
- st->u.trie.accept_buff[best].wordnum,
- tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
- PL_colors[5] );
- });
- if ( best<st->u.trie.accepted ) {
- reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
- st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
- st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
- best = st->u.trie.accepted;
- }
- PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
-
- /*
- as far as I can tell we only need the SAVETMPS/FREETMPS
- for re's with EVAL in them but I'm leaving them in for
- all until I can be sure.
- */
- SAVETMPS;
- REGMATCH(scan + NEXT_OFF(scan), TRIE2);
- /*** all unsaved local vars undefined at this point */
- FREETMPS;
- }
- FREETMPS;
- LEAVE;
+ "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
+ REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
+ (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
+ ST.accept_buff[ cur ].wordnum, PL_colors[5] );
+ );
+
+ if (ST.accept_buff[cur].wordnum <
+ ST.accept_buff[best].wordnum)
+ best = cur;
}
-
- if (result) {
- sayYES;
- } else {
- sayNO;
+
+ DEBUG_EXECUTE_r({
+ reg_trie_data * const trie
+ = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
+ SV ** const tmp = RX_DEBUG(reginfo->prog)
+ ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
+ : NULL;
+ PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
+ REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
+ ST.accept_buff[best].wordnum,
+ tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
+ PL_colors[5] );
+ });
+
+ if ( best<ST.accepted ) {
+ reg_trie_accepted tmp = ST.accept_buff[ best ];
+ ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
+ ST.accept_buff[ ST.accepted ] = tmp;
+ best = ST.accepted;
}
- }}
- /* unreached codepoint */
+ PL_reginput = (char *)ST.accept_buff[ best ].endpos;
+ }
+ PUSH_STATE_GOTO(TRIE_next, ST.B);
+ /* NOTREACHED */
+
+#undef ST
+
case EXACT: {
char *s = STRING(scan);
st->ln = STR_LEN(scan);
locinput = HOPc(locinput, -ST.alen);
goto curlym_do_B; /* try to match B */
+#undef ST
+#define ST st->u.curly
- case CURLYN:
- st->u.plus.paren = scan->flags; /* Which paren to set */
- if (st->u.plus.paren > PL_regsize)
- PL_regsize = st->u.plus.paren;
- if (st->u.plus.paren > (I32)*PL_reglastparen)
- *PL_reglastparen = st->u.plus.paren;
- st->ln = ARG1(scan); /* min to match */
- n = ARG2(scan); /* max to match */
- scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
- goto repeat;
- case CURLY:
- st->u.plus.paren = 0;
- st->ln = ARG1(scan); /* min to match */
- n = ARG2(scan); /* max to match */
- scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
- goto repeat;
- case STAR:
- st->ln = 0;
- n = REG_INFTY;
+/* CURLY_SETPAREN(paren, success): update the capture offsets for a
+ * quantified width-1 group just before B is tried.
+ *   paren   - capture group number; 0 means "no group", nothing is done.
+ *   success - true if A matched at least once: the group is then set to
+ *             the single character ending at locinput; otherwise the
+ *             group's end offset is reset to -1.
+ * Reads locinput from the enclosing scope.  Wrapped in
+ * STMT_START/STMT_END so the expansion is exactly one statement and can
+ * never capture a following "else". */
+#define CURLY_SETPAREN(paren, success) STMT_START { \
+    if (paren) { \
+        if (success) { \
+            PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
+            PL_regendp[paren] = locinput - PL_bostr; \
+        } \
+        else \
+            PL_regendp[paren] = -1; \
+    } \
+} STMT_END
+
+ case STAR: /* /A*B/ where A is width 1 */
+ ST.paren = 0;
+ ST.min = 0;
+ ST.max = REG_INFTY;
scan = NEXTOPER(scan);
- st->u.plus.paren = 0;
goto repeat;
- case PLUS:
- st->ln = 1;
- n = REG_INFTY;
+ case PLUS: /* /A+B/ where A is width 1 */
+ ST.paren = 0;
+ ST.min = 1;
+ ST.max = REG_INFTY;
scan = NEXTOPER(scan);
- st->u.plus.paren = 0;
+ goto repeat;
+ case CURLYN: /* /(A){m,n}B/ where A is width 1 */
+ ST.paren = scan->flags; /* Which paren to set */
+ if (ST.paren > PL_regsize)
+ PL_regsize = ST.paren;
+ if (ST.paren > (I32)*PL_reglastparen)
+ *PL_reglastparen = ST.paren;
+ ST.min = ARG1(scan); /* min to match */
+ ST.max = ARG2(scan); /* max to match */
+ scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
+ goto repeat;
+ case CURLY: /* /A{m,n}B/ where A is width 1 */
+ ST.paren = 0;
+ ST.min = ARG1(scan); /* min to match */
+ ST.max = ARG2(scan); /* max to match */
+ scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
repeat:
/*
* Lookahead to avoid useless match attempts
* when we know what character comes next.
- */
-
- /*
+ *
* Used to only do .*x and .*?x, but now it allows
* for )'s, ('s and (?{ ... })'s to be in the way
* of the quantifier and the EXACT-like node. -- japhy
*/
+ if (ST.min > ST.max) /* XXX make this a compile-time check? */
+ sayNO;
if (HAS_TEXT(next) || JUMPABLE(next)) {
U8 *s;
regnode *text_node = next;
FIND_NEXT_IMPT(text_node);
if (! HAS_TEXT(text_node))
- st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
+ ST.c1 = ST.c2 = CHRTEST_VOID;
else {
if (PL_regkind[OP(text_node)] == REF) {
- st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
+ ST.c1 = ST.c2 = CHRTEST_VOID;
goto assume_ok_easy;
}
else
s = (U8*)STRING(text_node);
if (!UTF) {
- st->u.plus.c2 = st->u.plus.c1 = *s;
+ ST.c2 = ST.c1 = *s;
if (OP(text_node) == EXACTF || OP(text_node) == REFF)
- st->u.plus.c2 = PL_fold[st->u.plus.c1];
+ ST.c2 = PL_fold[ST.c1];
else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
- st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
+ ST.c2 = PL_fold_locale[ST.c1];
}
else { /* UTF */
if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
- st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
+ ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
uniflags);
- st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
+ ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
uniflags);
}
else {
- st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
+ ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
uniflags);
}
}
}
}
else
- st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
+ ST.c1 = ST.c2 = CHRTEST_VOID;
assume_ok_easy:
+
+ ST.A = scan;
+ ST.B = next;
PL_reginput = locinput;
if (st->minmod) {
st->minmod = 0;
- if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
+ if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
sayNO;
+ ST.count = ST.min;
locinput = PL_reginput;
- REGCP_SET(st->u.plus.lastcp);
- if (st->u.plus.c1 != CHRTEST_VOID) {
- st->u.plus.old = locinput;
- st->u.plus.count = 0;
-
- if (n == REG_INFTY) {
- st->u.plus.e = PL_regeol - 1;
- if (do_utf8)
- while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
- st->u.plus.e--;
- }
- else if (do_utf8) {
- int m = n - st->ln;
- for (st->u.plus.e = locinput;
- m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
- st->u.plus.e += UTF8SKIP(st->u.plus.e);
+ REGCP_SET(ST.cp);
+ if (ST.c1 == CHRTEST_VOID)
+ goto curly_try_B_min;
+
+ ST.oldloc = locinput;
+
+ /* set ST.maxpos to the furthest point along the
+ * string that could possibly match */
+ if (ST.max == REG_INFTY) {
+ ST.maxpos = PL_regeol - 1;
+ if (do_utf8)
+ while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
+ ST.maxpos--;
+ }
+ else if (do_utf8) {
+ int m = ST.max - ST.min;
+ for (ST.maxpos = locinput;
+ m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
+ ST.maxpos += UTF8SKIP(ST.maxpos);
+ }
+ else {
+ ST.maxpos = locinput + ST.max - ST.min;
+ if (ST.maxpos >= PL_regeol)
+ ST.maxpos = PL_regeol - 1;
+ }
+ goto curly_try_B_min_known;
+
+ }
+ else {
+ ST.count = regrepeat(rex, ST.A, ST.max);
+ locinput = PL_reginput;
+ if (ST.count < ST.min)
+ sayNO;
+ if ((ST.count > ST.min)
+ && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
+ {
+ /* A{m,n} must come at the end of the string, there's
+ * no point in backing off ... */
+ ST.min = ST.count;
+ /* ...except that $ and \Z can match before *and* after
+ newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
+ We may back off by one in this case. */
+ if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
+ ST.min--;
+ }
+ REGCP_SET(ST.cp);
+ goto curly_try_B_max;
+ }
+ /* NOTREACHED */
+
+
+ case CURLY_B_min_known_fail:
+	    /* failed to match B in a non-greedy match where c1 and c2 are valid (not CHRTEST_VOID) */
+ if (ST.paren && ST.count)
+ PL_regendp[ST.paren] = -1;
+
+ PL_reginput = locinput; /* Could be reset... */
+ REGCP_UNWIND(ST.cp);
+ /* Couldn't or didn't -- move forward. */
+ ST.oldloc = locinput;
+ if (do_utf8)
+ locinput += UTF8SKIP(locinput);
+ else
+ locinput++;
+ ST.count++;
+ curly_try_B_min_known:
+ /* find the next place where 'B' could work, then call B */
+ {
+ int n;
+ if (do_utf8) {
+ n = (ST.oldloc == locinput) ? 0 : 1;
+ if (ST.c1 == ST.c2) {
+ STRLEN len;
+ /* set n to utf8_distance(oldloc, locinput) */
+ while (locinput <= ST.maxpos &&
+ utf8n_to_uvchr((U8*)locinput,
+ UTF8_MAXBYTES, &len,
+ uniflags) != (UV)ST.c1) {
+ locinput += len;
+ n++;
+ }
}
else {
- st->u.plus.e = locinput + n - st->ln;
- if (st->u.plus.e >= PL_regeol)
- st->u.plus.e = PL_regeol - 1;
- }
- while (1) {
- /* Find place 'next' could work */
- if (!do_utf8) {
- if (st->u.plus.c1 == st->u.plus.c2) {
- while (locinput <= st->u.plus.e &&
- UCHARAT(locinput) != st->u.plus.c1)
- locinput++;
- } else {
- while (locinput <= st->u.plus.e
- && UCHARAT(locinput) != st->u.plus.c1
- && UCHARAT(locinput) != st->u.plus.c2)
- locinput++;
- }
- st->u.plus.count = locinput - st->u.plus.old;
- }
- else {
- if (st->u.plus.c1 == st->u.plus.c2) {
- STRLEN len;
- /* count initialised to
- * utf8_distance(old, locinput) */
- while (locinput <= st->u.plus.e &&
- utf8n_to_uvchr((U8*)locinput,
- UTF8_MAXBYTES, &len,
- uniflags) != (UV)st->u.plus.c1) {
- locinput += len;
- st->u.plus.count++;
- }
- } else {
- /* count initialised to
- * utf8_distance(old, locinput) */
- while (locinput <= st->u.plus.e) {
- STRLEN len;
- const UV c = utf8n_to_uvchr((U8*)locinput,
- UTF8_MAXBYTES, &len,
- uniflags);
- if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
- break;
- locinput += len;
- st->u.plus.count++;
- }
- }
- }
- if (locinput > st->u.plus.e)
- sayNO;
- /* PL_reginput == old now */
- if (locinput != st->u.plus.old) {
- st->ln = 1; /* Did some */
- if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
- sayNO;
+ /* set n to utf8_distance(oldloc, locinput) */
+ while (locinput <= ST.maxpos) {
+ STRLEN len;
+ const UV c = utf8n_to_uvchr((U8*)locinput,
+ UTF8_MAXBYTES, &len,
+ uniflags);
+ if (c == (UV)ST.c1 || c == (UV)ST.c2)
+ break;
+ locinput += len;
+ n++;
}
- /* PL_reginput == locinput now */
- PL_reginput = locinput; /* Could be reset... */
- TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
- /*** all unsaved local vars undefined at this point */
-
- REGCP_UNWIND(st->u.plus.lastcp);
- /* Couldn't or didn't -- move forward. */
- st->u.plus.old = locinput;
- if (do_utf8)
- locinput += UTF8SKIP(locinput);
- else
- locinput++;
- st->u.plus.count = 1;
}
}
- else
- while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
- UV c;
- if (st->u.plus.c1 != CHRTEST_VOID) {
- if (do_utf8)
- c = utf8n_to_uvchr((U8*)PL_reginput,
- UTF8_MAXBYTES, 0,
- uniflags);
- else
- c = UCHARAT(PL_reginput);
- /* If it could work, try it. */
- if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
- TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
- /*** all unsaved local vars undefined at this point */
- REGCP_UNWIND(st->u.plus.lastcp);
- }
- }
- /* If it could work, try it. */
- else if (st->u.plus.c1 == CHRTEST_VOID) {
- TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
- /*** all unsaved local vars undefined at this point */
- REGCP_UNWIND(st->u.plus.lastcp);
+ else {
+ if (ST.c1 == ST.c2) {
+ while (locinput <= ST.maxpos &&
+ UCHARAT(locinput) != ST.c1)
+ locinput++;
}
- /* Couldn't or didn't -- move forward. */
- PL_reginput = locinput;
- if (regrepeat(rex, scan, 1)) {
- st->ln++;
- locinput = PL_reginput;
+ else {
+ while (locinput <= ST.maxpos
+ && UCHARAT(locinput) != ST.c1
+ && UCHARAT(locinput) != ST.c2)
+ locinput++;
}
- else
+ n = locinput - ST.oldloc;
+ }
+ if (locinput > ST.maxpos)
+ sayNO;
+ /* PL_reginput == oldloc now */
+ if (n) {
+ ST.count += n;
+ if (regrepeat(rex, ST.A, n) < n)
sayNO;
}
+ PL_reginput = locinput;
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
}
- else {
- n = regrepeat(rex, scan, n);
+ /* NOTREACHED */
+
+
+ case CURLY_B_min_fail:
+	    /* failed to match B in a non-greedy match where c1 and c2 are invalid (CHRTEST_VOID) */
+ if (ST.paren && ST.count)
+ PL_regendp[ST.paren] = -1;
+
+ REGCP_UNWIND(ST.cp);
+ /* failed -- move forward one */
+ PL_reginput = locinput;
+ if (regrepeat(rex, ST.A, 1)) {
+ ST.count++;
locinput = PL_reginput;
- if ((st->ln < n) && (PL_regkind[OP(next)] == EOL) &&
- (OP(next) != MEOL || OP(next) == SEOL || OP(next) == EOS))
+ if (ST.count <= ST.max || (ST.max == REG_INFTY &&
+ ST.count > 0)) /* count overflow ? */
{
- st->ln = n; /* why back off? */
- /* ...because $ and \Z can match before *and* after
- newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
- We should back off by one in this case. */
- if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
- st->ln--;
- }
- REGCP_SET(st->u.plus.lastcp);
- {
- while (n >= st->ln) {
- UV c = 0;
- if (st->u.plus.c1 != CHRTEST_VOID) {
- if (do_utf8)
- c = utf8n_to_uvchr((U8*)PL_reginput,
- UTF8_MAXBYTES, 0,
- uniflags);
- else
- c = UCHARAT(PL_reginput);
- }
- /* If it could work, try it. */
- if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
- TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
- /*** all unsaved local vars undefined at this point */
- REGCP_UNWIND(st->u.plus.lastcp);
- }
- /* Couldn't or didn't -- back up. */
- n--;
- PL_reginput = locinput = HOPc(locinput, -1);
- }
+ curly_try_B_min:
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_min, ST.B);
}
}
sayNO;
- break;
+ /* NOTREACHED */
+
+
+ curly_try_B_max:
+ /* a successful greedy match: now try to match B */
+ {
+ UV c = 0;
+ if (ST.c1 != CHRTEST_VOID)
+ c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
+ UTF8_MAXBYTES, 0, uniflags)
+ : (UV) UCHARAT(PL_reginput);
+ /* If it could work, try it. */
+ if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
+ CURLY_SETPAREN(ST.paren, ST.count);
+ PUSH_STATE_GOTO(CURLY_B_max, ST.B);
+ /* NOTREACHED */
+ }
+ }
+ /* FALL THROUGH */
+ case CURLY_B_max_fail:
+ /* failed to find B in a greedy match */
+ if (ST.paren && ST.count)
+ PL_regendp[ST.paren] = -1;
+
+ REGCP_UNWIND(ST.cp);
+ /* back up. */
+ if (--ST.count < ST.min)
+ sayNO;
+ PL_reginput = locinput = HOPc(locinput, -1);
+ goto curly_try_B_max;
+
+#undef ST
+
+
case END:
if (locinput < reginfo->till) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
switch (st->resume_state) {
case IFMATCH_A:
case CURLYM_A:
- case BRANCH_next:
case EVAL_A:
state_num = st->resume_state;
goto reenter_switch;
case CURLYM_B:
+ case BRANCH_next:
+ case TRIE_next:
+ case CURLY_B_max:
default:
Perl_croak(aTHX_ "unexpected yes resume state");
}
nextchr = UCHARAT(locinput);
switch (st->resume_state) {
- case resume_TRIE1:
- goto resume_point_TRIE1;
- case resume_TRIE2:
- goto resume_point_TRIE2;
case resume_CURLYX:
goto resume_point_CURLYX;
case resume_WHILEM1:
goto resume_point_WHILEM5;
case resume_WHILEM6:
goto resume_point_WHILEM6;
- case resume_PLUS1:
- goto resume_point_PLUS1;
- case resume_PLUS2:
- goto resume_point_PLUS2;
- case resume_PLUS3:
- goto resume_point_PLUS3;
- case resume_PLUS4:
- goto resume_point_PLUS4;
+ case TRIE_next:
case CURLYM_A:
case CURLYM_B:
case EVAL_A:
case IFMATCH_A:
case BRANCH_next:
+ case CURLY_B_max:
+ case CURLY_B_min:
+ case CURLY_B_min_known:
break;
default:
nextchr = UCHARAT(locinput);
switch (st->resume_state) {
- case resume_TRIE1:
- goto resume_point_TRIE1;
- case resume_TRIE2:
- goto resume_point_TRIE2;
-
case resume_CURLYX:
goto resume_point_CURLYX;
case resume_WHILEM1:
case resume_WHILEM6:
goto resume_point_WHILEM6;
+ case TRIE_next:
case EVAL_A:
case BRANCH_next:
case CURLYM_A:
case CURLYM_B:
case IFMATCH_A:
+ case CURLY_B_max:
+ case CURLY_B_min:
+ case CURLY_B_min_known:
if (yes_state == st)
yes_state = st->u.yes.prev_yes_state;
state_num = st->resume_state + 1; /* failure = success + 1 */
goto reenter_switch;
- case resume_PLUS1:
- goto resume_point_PLUS1;
- case resume_PLUS2:
- goto resume_point_PLUS2;
- case resume_PLUS3:
- goto resume_point_PLUS3;
- case resume_PLUS4:
- goto resume_point_PLUS4;
default:
Perl_croak(aTHX_ "regexp resume memory corruption");
}