#define RF_tainted 1 /* tainted information used? */
#define RF_warned 2 /* warned about big count? */
#define RF_evaled 4 /* Did an EVAL with setting? */
-#define RF_utf8 8 /* String contains multibyte chars? */
+#define RF_utf8 8 /* Pattern contains multibyte chars? */
#define UTF ((PL_reg_flags & RF_utf8) != 0)
}
/* These are needed since we do not localize EVAL nodes: */
-# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
- " Setting an EVAL scope, savestack=%"IVdf"\n", \
- (IV)PL_savestack_ix)); cp = PL_savestack_ix
-
-# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
- PerlIO_printf(Perl_debug_log, \
- " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
- (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
+#define REGCP_SET(cp) \
+ DEBUG_STATE_r( \
+ PerlIO_printf(Perl_debug_log, \
+ " Setting an EVAL scope, savestack=%"IVdf"\n", \
+ (IV)PL_savestack_ix)); \
+ cp = PL_savestack_ix
+
+#define REGCP_UNWIND(cp) \
+ DEBUG_STATE_r( \
+ if (cp != PL_savestack_ix) \
+ PerlIO_printf(Perl_debug_log, \
+ " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
+ (IV)(cp), (IV)PL_savestack_ix)); \
+ regcpblow(cp)
STATIC char *
S_regcppop(pTHX_ const regexp *rex)
RX_MATCH_UTF8_set(prog,do_utf8);
if (prog->reganch & ROPT_UTF8) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "UTF-8 regex...\n"));
PL_reg_flags |= RF_utf8;
}
-
- DEBUG_EXECUTE_r({
- RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
- PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
-
- if (!PL_colorset)
- reginitcolors();
- if (PL_reg_match_utf8)
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "UTF-8 target...\n"));
- PerlIO_printf(Perl_debug_log,
- "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
- PL_colors[4], PL_colors[5], PL_colors[0],
- prog->precomp,
- PL_colors[1],
- (strlen(prog->precomp) > 60 ? "..." : ""),
- PL_colors[0],
- (int)(len > 60 ? 60 : len),
- s, PL_colors[1],
- (len > 60 ? "..." : "")
+ DEBUG_EXECUTE_r(
+ debug_start_match(prog, do_utf8, strpos, strend,
+ "Guessing start of match for");
);
- });
/* CHR_DIST() would be more correct here but it makes things slow. */
if (prog->minlen > strend - strpos) {
/* Update the count-of-usability, remove useless subpatterns,
unshift s. */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
+ DEBUG_EXECUTE_r({
+ RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
+ PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
(s ? "Found" : "Did not find"),
- (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
- PL_colors[0],
- (int)(SvCUR(check) - (SvTAIL(check)!=0)),
- SvPVX_const(check),
- PL_colors[1], (SvTAIL(check) ? "$" : ""),
- (s ? " at offset " : "...\n") ) );
+ (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
+ ? "anchored" : "floating"),
+ quoted,
+ RE_SV_TAIL(check),
+ (s ? " at offset " : "...\n") );
+ });
if (!s)
goto fail_finish;
must,
multiline ? FBMrf_MULTILINE : 0
);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "%s anchored substr \"%s%.*s%s\"%s",
+ DEBUG_EXECUTE_r({
+ RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
+ PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
(s ? "Found" : "Contradicts"),
- PL_colors[0],
- (int)(SvCUR(must)
- - (SvTAIL(must)!=0)),
- SvPVX_const(must),
- PL_colors[1], (SvTAIL(must) ? "$" : "")));
+ quoted, RE_SV_TAIL(must));
+ });
+
+
if (!s) {
if (last1 >= last2) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
(unsigned char*)last + SvCUR(must)
- (SvTAIL(must)!=0),
must, multiline ? FBMrf_MULTILINE : 0);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
+ DEBUG_EXECUTE_r({
+ RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
+ PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
(s ? "Found" : "Contradicts"),
- PL_colors[0],
- (int)(SvCUR(must) - (SvTAIL(must)!=0)),
- SvPVX_const(must),
- PL_colors[1], (SvTAIL(must) ? "$" : "")));
+ quoted, RE_SV_TAIL(must));
+ });
if (!s) {
if (last1 == last) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
/* Last resort... */
/* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
- if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
+ if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
/* minlen == 0 is possible if regstclass is \b or \B,
and the fixed substr is ''$.
Since minlen is already taken into account, s+1 is before strend;
return NULL;
}
-/* We know what class REx starts with. Try to find this position... */
-/* if reginfo is NULL, its a dryrun */
-/* annoyingly all the vars in this routine have different names from their counterparts
- in regmatch. /grrr */
+
#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
foldlen, foldbuf, uniflags) STMT_START { \
} \
break
+#define DUMP_EXEC_POS(li,s,doutf8) \
+ dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
+
+/* We know what class REx starts with. Try to find this position... */
+/* if reginfo is NULL, its a dryrun */
+/* annoyingly all the vars in this routine have different names from their counterparts
+ in regmatch. /grrr */
+
STATIC char *
S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
const char *strend, const regmatch_info *reginfo)
!isDIGIT_LC_utf8((U8*)s),
!isDIGIT_LC(*s)
);
+ case TRIEC:
case TRIE:
- /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
{
const enum { trie_plain, trie_utf8, trie_utf8_fold }
trie_type = do_utf8 ?
reg_trie_data *trie=aho->trie;
const char *last_start = strend - trie->minlen;
+#ifdef DEBUGGING
const char *real_start = s;
+#endif
STRLEN maxlen = trie->maxlen;
SV *sv_points;
U8 **points; /* map of where we were in the input string
- when reading a given string. For ASCII this
+ when reading a given char. For ASCII this
is unnecessary overhead as the relationship
is always 1:1, but for unicode, especially
case folded unicode this is not true. */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ U8 *bitmap=NULL;
+
GET_RE_DEBUG_FLAGS_DECL;
SvPOK_on(sv_points);
sv_2mortal(sv_points);
points=(U8**)SvPV_nolen(sv_points );
-
- if (trie->bitmap && trie_type != trie_utf8_fold) {
- while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
- s++;
- }
+ if ( trie_type != trie_utf8_fold && (trie->bitmap || OP(c)==TRIEC) ) {
+ if (trie->bitmap)
+ bitmap=(U8*)trie->bitmap;
+ else
+ bitmap=(U8*)ANYOF_BITMAP(c);
}
-
+ /* this is the Aho-Corasick algorithm modified a touch
+ to include special handling for long "unknown char"
+ sequences. The basic idea being that we use AC as long
+ as we are dealing with a possible matching char, when
+ we encounter an unknown char (and we have not encountered
+ an accepting state) we scan forward until we find a legal
+ starting char.
+ AC matching is basically that of trie matching, except
+ that when we encounter a failing transition, we fall back
+ to the current states "fail state", and try the current char
+ again, a process we repeat until we reach the root state,
+ state 1, or a legal transition. If we fail on the root state
+ then we can either terminate if we have reached an accepting
+ state previously, or restart the entire process from the beginning
+ if we have not.
+
+ */
while (s <= last_start) {
const U32 uniflags = UTF8_ALLOW_DEFAULT;
U8 *uc = (U8*)s;
STRLEN foldlen = 0;
U8 *uscan = (U8*)NULL;
U8 *leftmost = NULL;
-
+#ifdef DEBUGGING
+ U32 accepted_word= 0;
+#endif
U32 pointpos = 0;
while ( state && uc <= (U8*)strend ) {
int failed=0;
- if (aho->states[ state ].wordnum) {
- U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
- if (!leftmost || lpos < leftmost)
+ U32 word = aho->states[ state ].wordnum;
+
+ if( state==1 && bitmap ) {
+ DEBUG_TRIE_EXECUTE_r(
+ if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ dump_exec_pos( (char *)uc, c, strend, real_start,
+ (char*)uc, do_utf8 );
+ PerlIO_printf( Perl_debug_log,
+ " Scanning for legal start char...\n");
+ }
+ );
+ while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ uc++;
+ }
+ s= (char *)uc;
+ if (uc >(U8*)last_start) break;
+ }
+
+ if ( word ) {
+ U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
+ if (!leftmost || lpos < leftmost) {
+ DEBUG_r(accepted_word=word);
leftmost= lpos;
+ }
if (base==0) break;
+
}
points[pointpos++ % maxlen]= uc;
REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
uvc, charid, foldlen, foldbuf, uniflags);
- DEBUG_TRIE_EXECUTE_r(
+ DEBUG_TRIE_EXECUTE_r({
+ dump_exec_pos( (char *)uc, c, strend, real_start,
+ s, do_utf8 );
PerlIO_printf(Perl_debug_log,
- "Pos: %d Charid:%3x CV:%4"UVxf" ",
- (int)((const char*)uc - real_start), charid, uvc)
- );
- uc += len;
+ " Charid:%3u CP:%4"UVxf" ",
+ charid, uvc);
+ });
do {
- U32 word = aho->states[ state ].wordnum;
+#ifdef DEBUGGING
+ word = aho->states[ state ].wordnum;
+#endif
base = aho->states[ state ].trans.base;
- DEBUG_TRIE_EXECUTE_r(
+ DEBUG_TRIE_EXECUTE_r({
+ if (failed)
+ dump_exec_pos( (char *)uc, c, strend, real_start,
+ s, do_utf8 );
PerlIO_printf( Perl_debug_log,
- "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
- failed ? "Fail transition to " : "",
- state, base, uvc, word)
- );
+ "%sState: %4"UVxf", word=%"UVxf,
+ failed ? " Fail transition to " : "",
+ (UV)state, (UV)word);
+ });
if ( base ) {
U32 tmp;
if (charid &&
&& (tmp=trie->trans[base + charid - 1 -
trie->uniquecharcount ].next))
{
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log," - legal\n"));
state = tmp;
break;
}
else {
- failed++;
- if ( state == 1 )
- break;
- else
- state = aho->fail[state];
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log," - fail\n"));
+ failed = 1;
+ state = aho->fail[state];
}
}
else {
/* we must be accepting here */
- failed++;
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log," - accepting\n"));
+ failed = 1;
break;
}
} while(state);
+ uc += len;
if (failed) {
if (leftmost)
break;
- else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
- while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
- uc++;
- }
- }
+ if (!state) state = 1;
}
}
if ( aho->states[ state ].wordnum ) {
U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
- if (!leftmost || lpos < leftmost)
+ if (!leftmost || lpos < leftmost) {
+ DEBUG_r(accepted_word=aho->states[ state ].wordnum);
leftmost = lpos;
+ }
}
- DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,
- "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
- "All done: ",
- state, base, uvc)
- );
if (leftmost) {
s = (char*)leftmost;
+ DEBUG_TRIE_EXECUTE_r({
+ PerlIO_printf(
+ Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
+ (UV)accepted_word, s - real_start
+ );
+ });
if (!reginfo || regtry(reginfo, s)) {
FREETMPS;
LEAVE;
goto got_it;
}
s = HOPc(s,1);
+ DEBUG_TRIE_EXECUTE_r({
+ PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
+ });
} else {
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log,"No match.\n"));
break;
}
}
}
}
- DEBUG_EXECUTE_r({
- RE_PV_DISPLAY_DECL(s0, len0, UTF,
- PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
- RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
- PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
-
- if (!PL_colorset)
- reginitcolors();
- PerlIO_printf(Perl_debug_log,
- "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
- PL_colors[4], PL_colors[5], PL_colors[0],
- len0, len0, s0,
- PL_colors[1],
- len0 > 60 ? "..." : "",
- PL_colors[0],
- (int)(len1 > 60 ? 60 : len1),
- s1, PL_colors[1],
- (len1 > 60 ? "..." : "")
+ DEBUG_EXECUTE_r(
+ debug_start_match(prog, do_utf8, startpos, strend,
+ "Matching");
);
- });
/* Simplest case: anchored match need be tried only once. */
/* [unless only anchor is BOL and multiline is set] */
}
}
}
- DEBUG_EXECUTE_r(if (!did_match)
- PerlIO_printf(Perl_debug_log,
- "Did not find %s substr \"%s%.*s%s\"%s...\n",
+ DEBUG_EXECUTE_r(if (!did_match) {
+ RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
+ PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
((must == prog->anchored_substr || must == prog->anchored_utf8)
? "anchored" : "floating"),
- PL_colors[0],
- (int)(SvCUR(must) - (SvTAIL(must)!=0)),
- SvPVX_const(must),
- PL_colors[1], (SvTAIL(must) ? "$" : ""))
- );
+ quoted, RE_SV_TAIL(must));
+ });
goto phooey;
}
- else if ((c = prog->regstclass)) {
+ else if ( (c = prog->regstclass) ) {
if (minlen) {
const OPCODE op = OP(prog->regstclass);
/* don't bother with what can't match */
- if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
+ if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
strend = HOPc(strend, -(minlen - 1));
}
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
regprop(prog, prop, c);
{
- RE_PV_DISPLAY_DECL(s0,len0,UTF,
- PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
- RE_PV_DISPLAY_DECL(s1,len1,UTF,
- PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
+ RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
+ s,strend-s,60);
PerlIO_printf(Perl_debug_log,
- "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
- len0, len0, s0,
- len1, len1, s1, (int)(strend - s));
+ "Matching stclass %.*s against %s (%d chars)\n",
+ (int)SvCUR(prop), SvPVX_const(prop),
+ quoted, (int)(strend - s));
}
});
if (find_byclass(prog, c, s, strend, ®info))
#define sayNO_SILENT goto do_no
#define saySAME(x) if (x) goto yes; else goto no
-#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
-#define POSCACHE_SEEN 1 /* we know what we're caching */
-#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
-
-#define CACHEsayYES STMT_START { \
- if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
- if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
- PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
- PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
- } \
- else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
- PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
- } \
- else { \
- /* cache records failure, but this is success */ \
- DEBUG_r( \
- PerlIO_printf(Perl_debug_log, \
- "%*s (remove success from failure cache)\n", \
- REPORT_CODE_OFF+PL_regindent*2, "") \
- ); \
- PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
- } \
- } \
- sayYES; \
-} STMT_END
-
#define CACHEsayNO STMT_START { \
- if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
- if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
- PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
- PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
- } \
- else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
- PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
- } \
- else { \
- /* cache records success, but this is failure */ \
- DEBUG_r( \
- PerlIO_printf(Perl_debug_log, \
- "%*s (remove failure from success cache)\n", \
- REPORT_CODE_OFF+PL_regindent*2, "") \
- ); \
- PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
- } \
- } \
+ if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
+ PL_reg_poscache[st->u.whilem.cache_offset] |= \
+ (1<<st->u.whilem.cache_bit); \
sayNO; \
} STMT_END
+
/* this is used to determine how far from the left messages like
'failed...' are printed. Currently 29 makes these messages line
up with the opcode they refer to. Earlier perls used 25 which
#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
#ifdef DEBUGGING
+STATIC void
+S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
+ const char *start, const char *end, const char *blurb)
+{
+ const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
+ if (!PL_colorset)
+ reginitcolors();
+ {
+ RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
+ prog->precomp, prog->prelen, 60);
+
+ RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
+ start, end - start, 60);
+
+ PerlIO_printf(Perl_debug_log,
+ "%s%s REx%s %s against %s\n",
+ PL_colors[4], blurb, PL_colors[5], s0, s1);
+
+ if (do_utf8||utf8_pat)
+ PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
+ !do_utf8 ? "pattern" : !utf8_pat ? "string" :
+ "pattern and string"
+ );
+ }
+}
STATIC void
-S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
+S_dump_exec_pos(pTHX_ const char *locinput,
+ const regnode *scan,
+ const char *loc_regeol,
+ const char *loc_bostr,
+ const char *loc_reg_starttry,
+ const bool do_utf8)
{
- const int docolor = *PL_colors[0];
+ const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
- int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
+ int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
/* The part of the string before starttry has one color
(pref0_len chars), between starttry and current
position another one (pref_len - pref0_len chars),
after the current position the third one.
We assume that pref0_len <= pref_len, otherwise we
decrease pref0_len. */
- int pref_len = (locinput - PL_bostr) > (5 + taill) - l
- ? (5 + taill) - l : locinput - PL_bostr;
+ int pref_len = (locinput - loc_bostr) > (5 + taill) - l
+ ? (5 + taill) - l : locinput - loc_bostr;
int pref0_len;
while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
pref_len++;
- pref0_len = pref_len - (locinput - PL_reg_starttry);
- if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
- l = ( PL_regeol - locinput > (5 + taill) - pref_len
- ? (5 + taill) - pref_len : PL_regeol - locinput);
+ pref0_len = pref_len - (locinput - loc_reg_starttry);
+ if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
+ l = ( loc_regeol - locinput > (5 + taill) - pref_len
+ ? (5 + taill) - pref_len : loc_regeol - locinput);
while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
l--;
if (pref0_len < 0)
{
const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
- RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
- (locinput - pref_len),pref0_len, 60);
+ RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
+ (locinput - pref_len),pref0_len, pref0_len, 4, 5);
- RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
+ RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
(locinput - pref_len + pref0_len),
- pref_len - pref0_len, 60);
+ pref_len - pref0_len, pref_len - pref0_len, 2, 3);
- RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
- locinput, PL_regeol - locinput, 60);
+ RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
+ locinput, loc_regeol - locinput, l, 0, 1);
PerlIO_printf(Perl_debug_log,
- "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
- (IV)(locinput - PL_bostr),
- PL_colors[4],
+ "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
+ (IV)(locinput - loc_bostr),
len0, s0,
- PL_colors[5],
- PL_colors[2],
len1, s1,
- PL_colors[3],
(docolor ? "" : "> <"),
- PL_colors[0],
len2, s2,
- PL_colors[1],
15 - l - pref_len + 1,
"");
}
/* these variables are NOT saved during a recusive RFEGMATCH: */
register I32 nextchr; /* is always set to UCHARAT(locinput) */
- bool result; /* return value of S_regmatch */
+ bool result = 0; /* return value of S_regmatch */
int depth = 0; /* depth of recursion */
regmatch_state *yes_state = NULL; /* state to pop to on success of
subpattern */
U32 state_num;
+ I32 parenfloor = 0;
+
#ifdef DEBUGGING
GET_RE_DEBUG_FLAGS_DECL;
PL_regindent++;
st->sw = 0;
st->logical = 0;
st->cc = NULL;
+
/* Note that nextchr is a byte even in UTF */
nextchr = UCHARAT(locinput);
scan = prog;
DEBUG_EXECUTE_r( {
SV * const prop = sv_newmortal();
- dump_exec_pos( locinput, scan, do_utf8 );
+ DUMP_EXEC_POS( locinput, scan, do_utf8 );
regprop(rex, prop, scan);
PerlIO_printf(Perl_debug_log,
#undef ST
#define ST st->u.trie
-
+ case TRIEC:
+ /* In this case the charclass data is available inline so
+ we can fail fast without a lot of extra overhead.
+ */
+ if (scan->flags == EXACT || !do_utf8) {
+ if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s %sfailed to match trie start class...%s\n",
+ REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
+ );
+ sayNO_SILENT;
+ /* NOTREACHED */
+ }
+ }
+ /* FALL THROUGH */
case TRIE:
{
/* what type of TRIE am I? (utf8 makes this contextual) */
= (reg_trie_data*)rex->data->data[ ARG( scan ) ];
U32 state = trie->startstate;
- U8 *uc = ( U8* )locinput;
- U16 charid = 0;
- U32 base = 0;
- UV uvc = 0;
- STRLEN len = 0;
- STRLEN foldlen = 0;
- U8 *uscan = (U8*)NULL;
- STRLEN bufflen=0;
- SV *sv_accept_buff = NULL;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-
- ST.accepted = 0; /* how many accepting states we have seen */
- ST.B = next;
-#ifdef DEBUGGING
- ST.me = scan;
-#endif
-
if (trie->bitmap && trie_type != trie_utf8_fold &&
!TRIE_BITMAP_TEST(trie,*locinput)
) {
} else {
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
- "%*s %sfailed to match start class...%s\n",
+ "%*s %sfailed to match trie start class...%s\n",
REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
);
sayNO_SILENT;
}
}
+ {
+ U8 *uc = ( U8* )locinput;
+
+ STRLEN len = 0;
+ STRLEN foldlen = 0;
+ U8 *uscan = (U8*)NULL;
+ STRLEN bufflen=0;
+ SV *sv_accept_buff = NULL;
+ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+
+ ST.accepted = 0; /* how many accepting states we have seen */
+ ST.B = next;
+ ST.jump = trie->jump;
+
+#ifdef DEBUGGING
+ ST.me = scan;
+#endif
+
+
+
/*
traverse the TRIE keeping track of all accepting states
we transition through until we get to a failing node.
*/
while ( state && uc <= (U8*)PL_regeol ) {
-
- if (trie->states[ state ].wordnum) {
- if (!ST.accepted ) {
+ U32 base = trie->states[ state ].trans.base;
+ UV uvc;
+ U16 charid;
+ /* We use charid to hold the wordnum as we don't use it
+ for charid until after we have done the wordnum logic.
+ We define an alias just so that the wordnum logic reads
+ more naturally. */
+
+#define got_wordnum charid
+ got_wordnum = trie->states[ state ].wordnum;
+
+ if ( got_wordnum ) {
+ if ( ! ST.accepted ) {
ENTER;
SAVETMPS;
bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
sv_accept_buff=newSV(bufflen *
sizeof(reg_trie_accepted) - 1);
- SvCUR_set(sv_accept_buff,
- sizeof(reg_trie_accepted));
+ SvCUR_set(sv_accept_buff, 0);
SvPOK_on(sv_accept_buff);
sv_2mortal(sv_accept_buff);
SAVETMPS;
ST.accept_buff =
(reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
}
- else {
+ do {
if (ST.accepted >= bufflen) {
bufflen *= 2;
ST.accept_buff =(reg_trie_accepted*)
}
SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
+ sizeof(reg_trie_accepted));
- }
- ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
- ST.accept_buff[ST.accepted].endpos = uc;
- ++ST.accepted;
- }
- base = trie->states[ state ].trans.base;
+
+ ST.accept_buff[ST.accepted].wordnum = got_wordnum;
+ ST.accept_buff[ST.accepted].endpos = uc;
+ ++ST.accepted;
+ } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
+ }
+#undef got_wordnum
DEBUG_TRIE_EXECUTE_r({
- dump_exec_pos( (char *)uc, scan, do_utf8 );
+ DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
PerlIO_printf( Perl_debug_log,
- "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
+ "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
2+PL_regindent * 2, "", PL_colors[4],
- (UV)state, (UV)base, (UV)ST.accepted );
+ (UV)state, (UV)ST.accepted );
});
if ( base ) {
}
DEBUG_TRIE_EXECUTE_r(
PerlIO_printf( Perl_debug_log,
- "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
+ "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
charid, uvc, (UV)state, PL_colors[5] );
);
}
REPORT_CODE_OFF + PL_regindent * 2, "",
PL_colors[4], (IV)ST.accepted, PL_colors[5] );
);
- }
+ }}
/* FALL THROUGH */
LEAVE;
locinput = PL_reginput;
nextchr = UCHARAT(locinput);
- scan = ST.B;
+
+ if ( !ST.jump )
+ scan = ST.B;
+ else
+ scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
+
continue; /* execute rest of RE */
}
best = ST.accepted;
}
PL_reginput = (char *)ST.accept_buff[ best ].endpos;
+ if ( !ST.jump ) {
+ PUSH_STATE_GOTO(TRIE_next, ST.B);
+ /* NOTREACHED */
+ } else {
+ PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
+ /* NOTREACHED */
+ }
+ /* NOTREACHED */
}
- PUSH_STATE_GOTO(TRIE_next, ST.B);
/* NOTREACHED */
#undef ST
}
/* run the pattern returned from (??{...}) */
-
DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "Entering embedded \"%s%.60s%s%s\"\n",
- PL_colors[0],
- re->precomp,
- PL_colors[1],
- (strlen(re->precomp) > 60 ? "..." : ""))
+ debug_start_match(re, do_utf8, locinput, PL_regeol,
+ "Matching embedded");
);
ST.cp = regcppush(0); /* Save *all* the positions. */
case CURLYX: {
/* No need to save/restore up to this paren */
- I32 parenfloor = scan->flags;
-
+ parenfloor = scan->flags;
+
/* Dave says:
CURLYX and WHILEM are always paired: they're the moral
PL_reg_leftiter = PL_reg_maxiter;
}
if (PL_reg_leftiter-- == 0) {
- const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
+ const I32 size = (PL_reg_maxiter + 7)/8;
if (PL_reg_poscache) {
if ((I32)PL_reg_poscache_size < size) {
Renew(PL_reg_poscache, size, char);
if (PL_reg_leftiter < 0) {
st->u.whilem.cache_offset = locinput - PL_bostr;
- st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
+ st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
+ st->u.whilem.cache_offset * (scan->flags>>4);
st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
st->u.whilem.cache_offset /= 8;
"%*s already tried at this position...\n",
REPORT_CODE_OFF+PL_regindent*2, "")
);
- if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
- /* cache records success */
- sayYES;
- else
- /* cache records failure */
- sayNO_SILENT;
+ sayNO; /* cache records failure */
}
}
}
st->cc = st->u.whilem.savecc;
if (result) {
regcpblow(st->u.whilem.cp);
- CACHEsayYES; /* All done. */
+ sayYES; /* All done. */
}
REGCP_UNWIND(st->u.whilem.lastcp);
regcppop(rex);
/*** all unsaved local vars undefined at this point */
if (result) {
regcpblow(st->u.whilem.cp);
- CACHEsayYES;
+ sayYES;
}
REGCP_UNWIND(st->u.whilem.lastcp);
regcppop(rex);
/*** all unsaved local vars undefined at this point */
if (result) {
regcpblow(st->u.whilem.cp);
- CACHEsayYES;
+ sayYES;
}
REGCP_UNWIND(st->u.whilem.lastcp);
regcppop(rex); /* Restore some previous $<digit>s? */
/*** all unsaved local vars undefined at this point */
st->cc = st->u.whilem.savecc;
if (result)
- CACHEsayYES;
+ sayYES;
if (st->cc->u.curlyx.outercc)
st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
st->cc->u.curlyx.cur = n - 1;
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
"%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
- (int)(REPORT_CODE_OFF+PL_regindent*2), "",
+ (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
(IV) ST.count, (IV)ST.alen)
);
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
"%*s CURLYM trying tail with matches=%"IVdf"...\n",
- (int)(REPORT_CODE_OFF+PL_regindent*2),
+ (int)(REPORT_CODE_OFF+(PL_regindent*2)),
"", (IV)ST.count)
);
if (ST.c1 != CHRTEST_VOID
to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
-
+#ifdef EBCDIC
+ ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
+ ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
+#else
ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
- uniflags);
+ uniflags);
ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
- uniflags);
+ uniflags);
+#endif
}
else {
ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
regmatch_state *newst;
depth++;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ DEBUG_STATE_r(PerlIO_printf(Perl_debug_log,
"PUSH STATE(%d)\n", depth));
st->locinput = locinput;
newst = st+1;
newst->sw = 0;
newst->logical = 0;
+
locinput = PL_reginput;
nextchr = UCHARAT(locinput);
st = newst;
st->minmod = 0;
st->sw = 0;
st->logical = 0;
+
#ifdef DEBUGGING
PL_regindent++;
#endif
st = SLAB_LAST(PL_regmatch_slab);
}
depth -= (st - yes_state);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
- depth+1, depth+(st - yes_state)));
+ DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%"UVuf"..%"UVuf")\n",
+ (UV)(depth+1), (UV)(depth+(st - yes_state))));
st = yes_state;
yes_state = st->u.yes.prev_yes_state;
PL_regmatch_state = st;
* will disappear when REGFMATCH goes */
if (depth) {
/* restore previous state and re-enter */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
+ DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
depth--;
st--;
if (st < SLAB_FIRST(PL_regmatch_slab)) {
no:
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
- "%*s %sfailed...%s\n",
- REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
+ "%*s %sfailed...%s\n",
+ REPORT_CODE_OFF+PL_regindent*2, "",
+ PL_colors[4], PL_colors[5])
);
no_final:
do_no:
if (depth) {
/* there's a previous state to backtrack to */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
+ DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
depth--;
st--;
if (st < SLAB_FIRST(PL_regmatch_slab)) {
c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
/* see [perl #37836] for UTF8_ALLOW_ANYUV */
- if (len == (STRLEN)-1)
+ if (len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
}