X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regexec.c;h=65a3b90e74db07078119d6f4b168be9cf5b53bf3;hb=54e82ce5cfd72fcdc60806373e0c4d6890b68a3c;hp=f6ed3287ef63c57ba4623677b4f8b7d6487ebaff;hpb=d6a287148f674729887096e6af50b42b464c3451;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regexec.c b/regexec.c index f6ed328..65a3b90 100644 --- a/regexec.c +++ b/regexec.c @@ -270,25 +270,45 @@ S_cache_re(pTHX_ regexp *prog) /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ -/* If SCREAM, then sv should be compatible with strpos and strend. +/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend. Otherwise, only SvCUR(sv) is used to get strbeg. */ /* XXXX We assume that strpos is strbeg unless sv. */ +/* A failure to find a constant substring means that there is no need to make + an expensive call to REx engine, thus we celebrate a failure. Similarly, + finding a substring too deep into the string means that less calls to + regtry() should be needed. + + REx compiler's optimizer found 4 possible hints: + a) Anchored substring; + b) Fixed substring; + c) Whether we are anchored (beginning-of-line or \G); + d) First node (of those at offset 0) which may distingush positions; + We use 'a', 'b', multiline-part of 'c', and try to find a position in the + string which does not contradict any of them. + */ + char * Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, char *strend, U32 flags, re_scream_pos_data *data) { - I32 start_shift; + register I32 start_shift; /* Should be nonnegative! */ - I32 end_shift; - char *s; + register I32 end_shift; + register char *s; + register SV *check; char *t; I32 ml_anch; + char *tmp; + register char *other_last = Nullch; +#ifdef DEBUGGING + char *i_strpos = strpos; +#endif DEBUG_r( if (!PL_colorset) reginitcolors() ); DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sGuessing start of match:%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", + "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n", PL_colors[4],PL_colors[5],PL_colors[0], prog->precomp, PL_colors[1], @@ -299,129 +319,322 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, (strend - strpos > 60 ? "..." : "")) ); - if (prog->minlen > strend - strpos) + if (prog->minlen > strend - strpos) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n")); goto fail; - - /* XXXX Move further down? */ - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - /* Should be nonnegative! */ - end_shift = prog->minlen - start_shift - - CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); - - if (prog->reganch & ROPT_ANCH) { + } + if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */ ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE) || ( (prog->reganch & ROPT_ANCH_BOL) - && !PL_multiline ) ); + && !PL_multiline ) ); /* Check after \n? */ if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) { - /* Anchored... */ + /* Substring at constant offset from beg-of-str... */ I32 slen; if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */ - && (sv && (strpos + SvCUR(sv) != strend)) ) + && (sv && (strpos + SvCUR(sv) != strend)) ) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; - + } PL_regeol = strend; /* Used in HOP() */ - s = (char*)HOP((U8*)strpos, prog->check_offset_min); + s = HOPc(strpos, prog->check_offset_min); if (SvTAIL(prog->check_substr)) { slen = SvCUR(prog->check_substr); /* >= 1 */ - if ( strend - s > slen || strend - s < slen - 1 ) { - s = Nullch; - goto finish; - } - if ( strend - s == slen && strend[-1] != '\n') { - s = Nullch; - goto finish; + if ( strend - s > slen || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n')) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); + goto fail_finish; } /* Now should match s[0..slen-2] */ slen--; if (slen && (*SvPVX(prog->check_substr) != *s || (slen > 1 - && memNE(SvPVX(prog->check_substr), s, slen)))) - s = Nullch; + && memNE(SvPVX(prog->check_substr), s, slen)))) { + report_neq: + DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); + goto fail_finish; + } } else if (*SvPVX(prog->check_substr) != *s || ((slen = SvCUR(prog->check_substr)) > 1 && memNE(SvPVX(prog->check_substr), s, slen))) - s = Nullch; - else - s = strpos; - goto finish; + goto report_neq; + goto success_at_start; } + /* Match is anchored, but substr is not anchored wrt beg-of-str. */ s = strpos; - if (!ml_anch && (s + prog->check_offset_max < strend - prog->minlen)) - end_shift += strend - s - prog->minlen - prog->check_offset_max; + start_shift = prog->check_offset_min; /* okay to underestimate on CC */ + /* Should be nonnegative! */ + end_shift = prog->minlen - start_shift - + CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); + if (!ml_anch) { + I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr) + - (SvTAIL(prog->check_substr) != 0); + I32 eshift = strend - s - end; + + if (end_shift < eshift) + end_shift = eshift; + } } - else { + else { /* Can match at random position */ ml_anch = 0; s = strpos; + start_shift = prog->check_offset_min; /* okay to underestimate on CC */ + /* Should be nonnegative! */ + end_shift = prog->minlen - start_shift - + CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); } - restart: +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) - end_shift = 0; /* can happen when strend == strpos */ + Perl_croak(aTHX_ "panic: end_shift"); +#endif + + check = prog->check_substr; + restart: + /* Find a possible match in the region s..strend by looking for + the "check" substring in the region corrected by start/end_shift. */ if (flags & REXEC_SCREAM) { - SV *c = prog->check_substr; char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */ I32 p = -1; /* Internal iterator of scream. */ I32 *pp = data ? data->scream_pos : &p; - if (PL_screamfirst[BmRARE(c)] >= 0 - || ( BmRARE(c) == '\n' - && (BmPREVIOUS(c) == SvCUR(c) - 1) - && SvTAIL(c) )) - s = screaminstr(sv, prog->check_substr, - start_shift + (strpos - strbeg), end_shift, pp, 0); + if (PL_screamfirst[BmRARE(check)] >= 0 + || ( BmRARE(check) == '\n' + && (BmPREVIOUS(check) == SvCUR(check) - 1) + && SvTAIL(check) )) + s = screaminstr(sv, check, + start_shift + (s - strbeg), end_shift, pp, 0); else - s = Nullch; + goto fail_finish; if (data) *data->scream_olds = s; } else s = fbm_instr((unsigned char*)s + start_shift, (unsigned char*)strend - end_shift, - prog->check_substr, PL_multiline ? FBMrf_MULTILINE : 0); + check, PL_multiline ? FBMrf_MULTILINE : 0); /* Update the count-of-usability, remove useless subpatterns, unshift s. */ - finish: - if (!s) { - ++BmUSEFUL(prog->check_substr); /* hooray */ - goto fail; /* not present */ + + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s", + (s ? "Found" : "Did not find"), + ((check == prog->anchored_substr) ? "anchored" : "floating"), + PL_colors[0], + SvCUR(check) - (SvTAIL(check)!=0), SvPVX(check), + PL_colors[1], (SvTAIL(check) ? "$" : ""), + (s ? " at offset " : "...\n") ) ); + + if (!s) + goto fail_finish; + + /* Finish the diagnostic message */ + DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); + + /* Got a candidate. Check MBOL anchoring, and the *other* substr. + Start with the other substr. + XXXX no SCREAM optimization yet - and a very coarse implementation + XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will + *always* match. Probably should be marked during compile... + Probably it is right to do no SCREAM here... + */ + + if (prog->float_substr && prog->anchored_substr) { + /* Take into account the "other" substring. */ + /* XXXX May be hopelessly wrong for UTF... */ + if (!other_last) + other_last = strpos - 1; + if (check == prog->float_substr) { + do_other_anchored: + { + char *last = s - start_shift, *last1, *last2; + char *s1 = s; + + tmp = PL_bostr; + t = s - prog->check_offset_max; + if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ + && (!(prog->reganch & ROPT_UTF8) + || (PL_bostr = strpos, /* Used in regcopmaybe() */ + (t = reghopmaybe_c(s, -(prog->check_offset_max))) + && t > strpos))) + /* EMPTY */; + else + t = strpos; + t += prog->anchored_offset; + if (t <= other_last) + t = other_last + 1; + PL_bostr = tmp; + last2 = last1 = strend - prog->minlen; + if (last < last1) + last1 = last; + /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ + /* On end-of-str: see comment below. */ + s = fbm_instr((unsigned char*)t, + (unsigned char*)last1 + prog->anchored_offset + + SvCUR(prog->anchored_substr) + - (SvTAIL(prog->anchored_substr)!=0), + prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s", + (s ? "Found" : "Contradicts"), + PL_colors[0], + SvCUR(prog->anchored_substr) + - (SvTAIL(prog->anchored_substr)!=0), + SvPVX(prog->anchored_substr), + PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : ""))); + if (!s) { + if (last1 >= last2) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", trying floating at offset %ld...\n", + (long)(s1 + 1 - i_strpos))); + PL_regeol = strend; /* Used in HOP() */ + other_last = last1 + prog->anchored_offset; + s = HOPc(last, 1); + goto restart; + } + else { + DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - i_strpos))); + t = s - prog->anchored_offset; + other_last = s - 1; + s = s1; + if (t == strpos) + goto try_at_start; + goto try_at_offset; + } + } + } + else { /* Take into account the floating substring. */ + char *last, *last1; + char *s1 = s; + + t = s - start_shift; + last1 = last = strend - prog->minlen + prog->float_min_offset; + if (last - t > prog->float_max_offset) + last = t + prog->float_max_offset; + s = t + prog->float_min_offset; + if (s <= other_last) + s = other_last + 1; + /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ + /* fbm_instr() takes into account exact value of end-of-str + if the check is SvTAIL(ed). Since false positives are OK, + and end-of-str is not later than strend we are OK. */ + s = fbm_instr((unsigned char*)s, + (unsigned char*)last + SvCUR(prog->float_substr) + - (SvTAIL(prog->float_substr)!=0), + prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s", + (s ? "Found" : "Contradicts"), + PL_colors[0], + SvCUR(prog->float_substr) + - (SvTAIL(prog->float_substr)!=0), + SvPVX(prog->float_substr), + PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : ""))); + if (!s) { + if (last1 == last) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", trying anchored starting at offset %ld...\n", + (long)(s1 + 1 - i_strpos))); + other_last = last; + PL_regeol = strend; /* Used in HOP() */ + s = HOPc(t, 1); + goto restart; + } + else { + DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - i_strpos))); + other_last = s - 1; + s = s1; + if (t == strpos) + goto try_at_start; + goto try_at_offset; + } + } } - else if (s - strpos > prog->check_offset_max && - ((prog->reganch & ROPT_UTF8) - ? ((t = reghopmaybe_c(s, -(prog->check_offset_max))) - && t >= strpos) - : (t = s - prog->check_offset_max) != 0) ) { + + t = s - prog->check_offset_max; + tmp = PL_bostr; + if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ + && (!(prog->reganch & ROPT_UTF8) + || (PL_bostr = strpos, /* Used in regcopmaybe() */ + ((t = reghopmaybe_c(s, -(prog->check_offset_max))) + && t > strpos)))) { + PL_bostr = tmp; + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + try_at_offset: if (ml_anch && t[-1] != '\n') { + /* Eventually fbm_*() should handle this, but often + anchored_offset is not 0, so this check will not be wasted. */ + /* XXXX In the code below we prefer to look for "^" even in + presence of anchored substrings. And we search even + beyond the found float position. These pessimizations + are historical artefacts only. */ find_anchor: - while (t < strend - end_shift - prog->minlen) { + while (t < strend - prog->minlen) { if (*t == '\n') { if (t < s - prog->check_offset_min) { + if (prog->anchored_substr) { + /* We definitely contradict the found anchored + substr. Due to the above check we do not + contradict "check" substr. + Thus we can arrive here only if check substr + is float. Redo checking for "other"=="fixed". + */ + strpos = t + 1; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); + goto do_other_anchored; + } s = t + 1; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", + PL_colors[0],PL_colors[1], (long)(s - i_strpos))); goto set_useful; } - s = t + 1; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n", + PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos))); + strpos = s = t + 1; goto restart; } t++; } - s = Nullch; - goto finish; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", + PL_colors[0],PL_colors[1])); + goto fail_finish; } s = t; set_useful: - ++BmUSEFUL(prog->check_substr); /* hooray/2 */ + ++BmUSEFUL(prog->check_substr); /* hooray/5 */ } else { - if (ml_anch && sv + PL_bostr = tmp; + /* The found string does not prohibit matching at beg-of-str + - no optimization of calling REx engine can be performed, + unless it was an MBOL and we are not after MBOL. */ + try_at_start: + /* Even in this situation we may use MBOL flag if strpos is offset + wrt the start of the string. */ + if (ml_anch && sv && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') { t = strpos; goto find_anchor; } - if (!(prog->reganch & ROPT_NAUGHTY) + DEBUG_r( if (ml_anch) + PerlIO_printf(Perl_debug_log, "Does not contradict /%s^%s/m...\n", + PL_colors[0],PL_colors[1]); + ); + success_at_start: + if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */ && --BmUSEFUL(prog->check_substr) < 0 && prog->check_substr == prog->float_substr) { /* boo */ /* If flags & SOMETHING - do not do it many times on the same match */ @@ -435,11 +648,14 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, s = strpos; } - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sFound%s at offset %ld\n", - PL_colors[4],PL_colors[5], (long)(s - strpos)) ); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(s - i_strpos)) ); return s; + + fail_finish: /* Substring not found */ + BmUSEFUL(prog->check_substr) += 5; /* hooray */ fail: - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sNot found...%s\n", + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", PL_colors[4],PL_colors[5])); return Nullch; } @@ -463,7 +679,6 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * register I32 tmp; I32 minlen; /* must match at least this many chars */ I32 dontbother = 0; /* how many characters not to try at end */ - CURCUR cc; I32 start_shift = 0; /* Offset of the start to find constant substr. */ /* CC */ I32 end_shift = 0; /* Same for the end. */ /* CC */ @@ -471,9 +686,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * char *scream_olds; SV* oreplsv = GvSV(PL_replgv); - cc.cur = 0; - cc.oldcc = 0; - PL_regcc = &cc; + PL_regcc = 0; cache_re(prog); #ifdef DEBUGGING @@ -504,6 +717,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags = 0; PL_reg_eval_set = 0; + PL_reg_maxiter = 0; if (prog->reganch & ROPT_UTF8) PL_reg_flags |= RF_utf8; @@ -525,19 +739,23 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* If there is a "must appear" string, look for it. */ s = startpos; - if (prog->reganch & ROPT_GPOS_SEEN) { + if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */ MAGIC *mg; - if (!(flags & REXEC_IGNOREPOS) && sv && SvTYPE(sv) >= SVt_PVMG - && SvMAGIC(sv) && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) - PL_reg_ganch = strbeg + mg->mg_len; - else + if (flags & REXEC_IGNOREPOS) /* Means: check only at start */ PL_reg_ganch = startpos; - if (prog->reganch & ROPT_ANCH_GPOS) { - if (s > PL_reg_ganch) - goto phooey; - s = PL_reg_ganch; + else if (sv && SvTYPE(sv) >= SVt_PVMG + && SvMAGIC(sv) + && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) { + PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */ + if (prog->reganch & ROPT_ANCH_GPOS) { + if (s > PL_reg_ganch) + goto phooey; + s = PL_reg_ganch; + } } + else /* pos() not defined */ + PL_reg_ganch = strbeg; } if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) { @@ -552,7 +770,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * DEBUG_r( if (!PL_colorset) reginitcolors() ); DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sMatching%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", + "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], prog->precomp, PL_colors[1], @@ -578,9 +796,12 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * end = HOPc(strend, -dontbother) - 1; /* for multiline we only have to try after newlines */ if (prog->check_substr) { + if (s == startpos) + goto after_try; while (1) { if (regtry(prog, s)) goto got_it; + after_try: if (s >= end) goto phooey; s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL); @@ -704,7 +925,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* We know what class it must start with. */ switch (OP(c)) { case ANYOFUTF8: - cc = (char *) OPERAND(c); + cc = MASK(c); while (s < strend) { if (REGINCLASSUTF8(c, (U8*)s)) { if (tmp && regtry(prog, s)) @@ -718,7 +939,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } break; case ANYOF: - cc = (char *) OPERAND(c); + cc = MASK(c); while (s < strend) { if (REGINCLASS(cc, *s)) { if (tmp && regtry(prog, s)) @@ -1288,7 +1509,7 @@ S_regtry(pTHX_ regexp *prog, char *startpos) } PL_reg_magic = mg; PL_reg_oldpos = mg->mg_len; - SAVEDESTRUCTOR(restore_pos, 0); + SAVEDESTRUCTOR_X(restore_pos, 0); } if (!PL_reg_curpm) New(22,PL_reg_curpm, 1, PMOP); @@ -1385,11 +1606,19 @@ S_regmatch(pTHX_ regnode *prog) #ifdef DEBUGGING # define sayYES goto yes # define sayNO goto no +# define sayYES_FINAL goto yes_final +# define sayYES_LOUD goto yes_loud +# define sayNO_FINAL goto no_final +# define sayNO_SILENT goto do_no # define saySAME(x) if (x) goto yes; else goto no # define REPORT_CODE_OFF 24 #else # define sayYES return 1 # define sayNO return 0 +# define sayYES_FINAL return 1 +# define sayYES_LOUD return 1 +# define sayNO_FINAL return 0 +# define sayNO_SILENT return 0 # define saySAME(x) return x #endif DEBUG_r( { @@ -1517,8 +1746,8 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(++locinput); break; case EXACT: - s = (char *) OPERAND(scan); - ln = UCHARAT(s++); + s = STRING(scan); + ln = STR_LEN(scan); /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchr) sayNO; @@ -1533,8 +1762,8 @@ S_regmatch(pTHX_ regnode *prog) PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case EXACTF: - s = (char *) OPERAND(scan); - ln = UCHARAT(s++); + s = STRING(scan); + ln = STR_LEN(scan); if (UTF) { char *l = locinput; @@ -1572,7 +1801,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; case ANYOFUTF8: - s = (char *) OPERAND(scan); + s = MASK(scan); if (!REGINCLASSUTF8(scan, (U8*)locinput)) sayNO; if (locinput >= PL_regeol) @@ -1581,7 +1810,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; case ANYOF: - s = (char *) OPERAND(scan); + s = MASK(scan); if (nextchr < 0) nextchr = UCHARAT(locinput); if (!REGINCLASS(s, nextchr)) @@ -1838,6 +2067,7 @@ S_regmatch(pTHX_ regnode *prog) case REFF: n = ARG(scan); /* which paren pair */ ln = PL_regstartp[n]; + PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (*PL_reglastparen < n || ln == -1) sayNO; /* Do not match unless seen CLOSEn. */ if (ln == PL_regendp[n]) @@ -1928,7 +2158,6 @@ S_regmatch(pTHX_ regnode *prog) regexp *re; MAGIC *mg = Null(MAGIC*); re_cc_state state; - CURCUR cctmp; CHECKPOINT cp, lastcp; if(SvROK(ret) || SvRMAGICAL(ret)) { @@ -1971,9 +2200,7 @@ S_regmatch(pTHX_ regnode *prog) state.cc = PL_regcc; state.re = PL_reg_re; - cctmp.cur = 0; - cctmp.oldcc = 0; - PL_regcc = &cctmp; + PL_regcc = 0; cp = regcppush(0); /* Save *all* the positions. */ REGCP_SET; @@ -1982,16 +2209,29 @@ S_regmatch(pTHX_ regnode *prog) *PL_reglastparen = 0; PL_reg_call_cc = &state; PL_reginput = locinput; + + /* XXXX This is too dramatic a measure... */ + PL_reg_maxiter = 0; + if (regmatch(re->program + 1)) { + /* Even though we succeeded, we need to restore + global variables, since we may be wrapped inside + SUSPEND, thus the match may be not finished yet. */ + + /* XXXX Do this only if SUSPENDed? */ + PL_reg_call_cc = state.prev; + PL_regcc = state.cc; + PL_reg_re = state.re; + cache_re(PL_reg_re); + + /* XXXX This is too dramatic a measure... */ + PL_reg_maxiter = 0; + + /* These are needed even if not SUSPEND. */ ReREFCNT_dec(re); regcpblow(cp); sayYES; } - DEBUG_r( - PerlIO_printf(Perl_debug_log, - "%*s failed...\n", - REPORT_CODE_OFF+PL_regindent*2, "") - ); ReREFCNT_dec(re); REGCP_UNWIND; regcppop(); @@ -1999,6 +2239,10 @@ S_regmatch(pTHX_ regnode *prog) PL_regcc = state.cc; PL_reg_re = state.re; cache_re(PL_reg_re); + + /* XXXX This is too dramatic a measure... */ + PL_reg_maxiter = 0; + sayNO; } sw = SvTRUE(ret); @@ -2026,6 +2270,7 @@ S_regmatch(pTHX_ regnode *prog) sw = (*PL_reglastparen >= n && PL_regendp[n] != -1); break; case IFTHEN: + PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (sw) next = NEXTOPER(NEXTOPER(scan)); else { @@ -2037,6 +2282,81 @@ S_regmatch(pTHX_ regnode *prog) case LOGICAL: logical = scan->flags; break; +/******************************************************************* + PL_regcc contains infoblock about the innermost (...)* loop, and + a pointer to the next outer infoblock. + + Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM): + + 1) After matching X, regnode for CURLYX is processed; + + 2) This regnode creates infoblock on the stack, and calls + regmatch() recursively with the starting point at WHILEM node; + + 3) Each hit of WHILEM node tries to match A and Z (in the order + depending on the current iteration, min/max of {min,max} and + greediness). The information about where are nodes for "A" + and "Z" is read from the infoblock, as is info on how many times "A" + was already matched, and greediness. + + 4) After A matches, the same WHILEM node is hit again. + + 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX + of the same pair. Thus when WHILEM tries to match Z, it temporarily + resets PL_regcc, since this Y(A)*Z can be a part of some other loop: + as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node + of the external loop. + + Currently present infoblocks form a tree with a stem formed by PL_curcc + and whatever it mentions via ->next, and additional attached trees + corresponding to temporarily unset infoblocks as in "5" above. + + In the following picture infoblocks for outer loop of + (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block + is denoted by x. The matched string is YAAZYAZT. Temporarily postponed + infoblocks are drawn below the "reset" infoblock. + + In fact in the picture below we do not show failed matches for Z and T + by WHILEM blocks. [We illustrate minimal matches, since for them it is + more obvious *why* one needs to *temporary* unset infoblocks.] + + Matched REx position InfoBlocks Comment + (Y(A)*?Z)*?T x + Y(A)*?Z)*?T x <- O + Y (A)*?Z)*?T x <- O + Y A)*?Z)*?T x <- O <- I + YA )*?Z)*?T x <- O <- I + YA A)*?Z)*?T x <- O <- I + YAA )*?Z)*?T x <- O <- I + YAA Z)*?T x <- O # Temporary unset I + I + + YAAZ Y(A)*?Z)*?T x <- O + I + + YAAZY (A)*?Z)*?T x <- O + I + + YAAZY A)*?Z)*?T x <- O <- I + I + + YAAZYA )*?Z)*?T x <- O <- I + I + + YAAZYA Z)*?T x <- O # Temporary unset I + I,I + + YAAZYAZ )*?T x <- O + I,I + + YAAZYAZ T x # Temporary unset O + O + I,I + + YAAZYAZT x + O + I,I + *******************************************************************/ case CURLYX: { CURCUR cc; CHECKPOINT cp = PL_savestack_ix; @@ -2064,7 +2384,7 @@ S_regmatch(pTHX_ regnode *prog) /* * This is really hard to understand, because after we match * what we're trying to match, we must make sure the rest of - * the RE is going to match for sure, and to do that we have + * the REx is going to match for sure, and to do that we have * to go back UP the parse tree by recursing ever deeper. And * if it fails, we have to reset our parent's current state * that we can try again after backing off. @@ -2089,7 +2409,8 @@ S_regmatch(pTHX_ regnode *prog) if (locinput == cc->lastloc && n >= cc->min) { PL_regcc = cc->oldcc; - ln = PL_regcc->cur; + if (PL_regcc) + ln = PL_regcc->cur; DEBUG_r( PerlIO_printf(Perl_debug_log, "%*s empty match detected, try continuation...\n", @@ -2097,12 +2418,8 @@ S_regmatch(pTHX_ regnode *prog) ); if (regmatch(cc->next)) sayYES; - DEBUG_r( - PerlIO_printf(Perl_debug_log, - "%*s failed...\n", - REPORT_CODE_OFF+PL_regindent*2, "") - ); - PL_regcc->cur = ln; + if (PL_regcc) + PL_regcc->cur = ln; PL_regcc = cc; sayNO; } @@ -2116,19 +2433,60 @@ S_regmatch(pTHX_ regnode *prog) sayYES; cc->cur = n - 1; cc->lastloc = lastloc; + sayNO; + } + + if (scan->flags) { + /* Check whether we already were at this position. + Postpone detection until we know the match is not + *that* much linear. */ + if (!PL_reg_maxiter) { + PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4); + PL_reg_leftiter = PL_reg_maxiter; + } + if (PL_reg_leftiter-- == 0) { + I32 size = (PL_reg_maxiter + 7)/8; + if (PL_reg_poscache) { + if (PL_reg_poscache_size < size) { + Renew(PL_reg_poscache, size, char); + PL_reg_poscache_size = size; + } + Zero(PL_reg_poscache, size, char); + } + else { + PL_reg_poscache_size = size; + Newz(29, PL_reg_poscache, size, char); + } DEBUG_r( PerlIO_printf(Perl_debug_log, - "%*s failed...\n", + "%sDetected a super-linear match, switching on caching%s...\n", + PL_colors[4], PL_colors[5]) + ); + } + if (PL_reg_leftiter < 0) { + I32 o = locinput - PL_bostr, b; + + o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4); + b = o % 8; + o /= 8; + if (PL_reg_poscache[o] & (1<minmod) { PL_regcc = cc->oldcc; - ln = PL_regcc->cur; + if (PL_regcc) + ln = PL_regcc->cur; cp = regcppush(cc->parenfloor); REGCP_SET; if (regmatch(cc->next)) { @@ -2137,7 +2495,8 @@ S_regmatch(pTHX_ regnode *prog) } REGCP_UNWIND; regcppop(); - PL_regcc->cur = ln; + if (PL_regcc) + PL_regcc->cur = ln; PL_regcc = cc; if (n >= cc->max) { /* Maximum greed exceeded? */ @@ -2166,11 +2525,6 @@ S_regmatch(pTHX_ regnode *prog) regcpblow(cp); sayYES; } - DEBUG_r( - PerlIO_printf(Perl_debug_log, - "%*s failed...\n", - REPORT_CODE_OFF+PL_regindent*2, "") - ); REGCP_UNWIND; regcppop(); cc->cur = n - 1; @@ -2208,14 +2562,12 @@ S_regmatch(pTHX_ regnode *prog) /* Failed deeper matches of scan, so see if this one works. */ PL_regcc = cc->oldcc; - ln = PL_regcc->cur; + if (PL_regcc) + ln = PL_regcc->cur; if (regmatch(cc->next)) sayYES; - DEBUG_r( - PerlIO_printf(Perl_debug_log, "%*s failed...\n", - REPORT_CODE_OFF+PL_regindent*2, "") - ); - PL_regcc->cur = ln; + if (PL_regcc) + PL_regcc->cur = ln; PL_regcc = cc; cc->cur = n - 1; cc->lastloc = lastloc; @@ -2300,7 +2652,7 @@ S_regmatch(pTHX_ regnode *prog) ln = n; locinput = PL_reginput; if (PL_regkind[(U8)OP(next)] == EXACT) { - c1 = UCHARAT(OPERAND(next) + 1); + c1 = (U8)*STRING(next); if (OP(next) == EXACTF) c2 = PL_fold[c1]; else if (OP(next) == EXACTFL) @@ -2357,7 +2709,7 @@ S_regmatch(pTHX_ regnode *prog) ); if (n >= ln) { if (PL_regkind[(U8)OP(next)] == EXACT) { - c1 = UCHARAT(OPERAND(next) + 1); + c1 = (U8)*STRING(next); if (OP(next) == EXACTF) c2 = PL_fold[c1]; else if (OP(next) == EXACTFL) @@ -2434,7 +2786,7 @@ S_regmatch(pTHX_ regnode *prog) * when we know what character comes next. */ if (PL_regkind[(U8)OP(next)] == EXACT) { - c1 = UCHARAT(OPERAND(next) + 1); + c1 = (U8)*STRING(next); if (OP(next) == EXACTF) c2 = PL_fold[c1]; else if (OP(next) == EXACTFL) @@ -2608,14 +2960,22 @@ S_regmatch(pTHX_ regnode *prog) "%*s continuation failed...\n", REPORT_CODE_OFF+PL_regindent*2, "") ); - sayNO; + sayNO_SILENT; + } + if (locinput < PL_regtill) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", + PL_colors[4], + (long)(locinput - PL_reg_starttry), + (long)(PL_regtill - PL_reg_starttry), + PL_colors[5])); + sayNO_FINAL; /* Cannot match: too short. */ } - if (locinput < PL_regtill) - sayNO; /* Cannot match: too short. */ - /* Fall through */ + PL_reginput = locinput; /* put where regtry can find it */ + sayYES_FINAL; /* Success! */ case SUCCEED: PL_reginput = locinput; /* put where regtry can find it */ - sayYES; /* Success! */ + sayYES_LOUD; /* Success! */ case SUSPEND: n = 1; PL_reginput = locinput; @@ -2691,7 +3051,7 @@ S_regmatch(pTHX_ regnode *prog) next = NULL; break; default: - PerlIO_printf(PerlIO_stderr(), "%lx %d\n", + PerlIO_printf(Perl_error_log, "%lx %d\n", (unsigned long)scan, OP(scan)); Perl_croak(aTHX_ "regexp memory corruption"); } @@ -2706,6 +3066,16 @@ S_regmatch(pTHX_ regnode *prog) /*NOTREACHED*/ sayNO; +yes_loud: + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "%*s %scould match...%s\n", + REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5]) + ); + goto yes; +yes_final: + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", + PL_colors[4],PL_colors[5])); yes: #ifdef DEBUGGING PL_regindent--; @@ -2713,6 +3083,14 @@ yes: return 1; no: + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed...%s\n", + REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5]) + ); + goto do_no; +no_final: +do_no: #ifdef DEBUGGING PL_regindent--; #endif @@ -2740,7 +3118,6 @@ S_regrepeat(pTHX_ regnode *p, I32 max) scan = PL_reginput; if (max != REG_INFTY && max < loceol - scan) loceol = scan + max; - opnd = (char *) OPERAND(p); switch (OP(p)) { case REG_ANY: while (scan < loceol && *scan != '\n') @@ -2764,19 +3141,19 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case EXACT: /* length of string is 1 */ - c = UCHARAT(++opnd); + c = (U8)*STRING(p); while (scan < loceol && UCHARAT(scan) == c) scan++; break; case EXACTF: /* length of string is 1 */ - c = UCHARAT(++opnd); + c = (U8)*STRING(p); while (scan < loceol && (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) scan++; break; case EXACTFL: /* length of string is 1 */ PL_reg_flags |= RF_tainted; - c = UCHARAT(++opnd); + c = (U8)*STRING(p); while (scan < loceol && (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) scan++; @@ -2789,6 +3166,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case ANYOF: + opnd = MASK(p); while (scan < loceol && REGINCLASS(opnd, *scan)) scan++; break;