X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regexec.c;h=9a7e91b424ed7f7cfb9e2a75d187033891e70356;hb=c15e4c287f38b5a2d53095f441d6ef22247e840b;hp=75f3873ce7da88e4e639bed3f1ce121173f65b94;hpb=b8c5462f6edbb2dd616e1733df011beee816eee1;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regexec.c b/regexec.c index 75f3873..9a7e91b 100644 --- a/regexec.c +++ b/regexec.c @@ -39,6 +39,8 @@ /* *These* symbols are masked to allow static link. */ # define Perl_pregexec my_pregexec # define Perl_reginitcolors my_reginitcolors + +# define PERL_NO_GET_CONTEXT #endif /*SUPPRESS 112*/ @@ -77,6 +79,12 @@ #define PERL_IN_REGEXEC_C #include "perl.h" +#ifdef PERL_IN_XSUB_RE +# if defined(PERL_CAPI) || defined(PERL_OBJECT) +# include "XSUB.h" +# endif +#endif + #include "regcomp.h" #define RF_tainted 1 /* tainted information used? */ @@ -110,6 +118,9 @@ #define HOPc(pos,off) ((char*)HOP(pos,off)) #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off)) +static void restore_pos(pTHXo_ void *arg); + + STATIC CHECKPOINT S_regcppush(pTHX_ I32 parenfloor) { @@ -243,22 +254,6 @@ S_cache_re(pTHX_ regexp *prog) PL_reg_re = prog; } -STATIC void -S_restore_pos(pTHX_ void *arg) -{ - dTHR; - if (PL_reg_eval_set) { - if (PL_reg_oldsaved) { - PL_reg_re->subbeg = PL_reg_oldsaved; - PL_reg_re->sublen = PL_reg_oldsavedlen; - RX_MATCH_COPIED_on(PL_reg_re); - } - PL_reg_magic->mg_len = PL_reg_oldpos; - PL_reg_eval_set = 0; - PL_curpm = PL_reg_oldcurpm; - } -} - /* * Need to implement the following flags for reg_anch: * @@ -275,25 +270,33 @@ S_restore_pos(pTHX_ void *arg) /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ -/* If SCREAM, then sv should be compatible with strpos and strend. +/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend. Otherwise, only SvCUR(sv) is used to get strbeg. */ /* XXXX We assume that strpos is strbeg unless sv. */ +/* A failure to find a constant substring means that there is no need to make + an expensive call to REx engine, thus we celebrate a failure. Similarly, + finding a substring too deep into the string means that less calls to + regtry() should be needed. */ + char * Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, char *strend, U32 flags, re_scream_pos_data *data) { - I32 start_shift; + register I32 start_shift; /* Should be nonnegative! */ - I32 end_shift; - char *s; + register I32 end_shift; + register char *s; + register SV *check; char *t; I32 ml_anch; + char *tmp; + register char *other_last = Nullch; DEBUG_r( if (!PL_colorset) reginitcolors() ); DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sGuessing start of match:%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", + "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n", PL_colors[4],PL_colors[5],PL_colors[0], prog->precomp, PL_colors[1], @@ -304,125 +307,296 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, (strend - strpos > 60 ? "..." : "")) ); - if (prog->minlen > strend - strpos) + if (prog->minlen > strend - strpos) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n")); goto fail; - - /* XXXX Move further down? */ - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - /* Should be nonnegative! */ - end_shift = prog->minlen - start_shift - - CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); - - if (prog->reganch & ROPT_ANCH) { + } + if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */ ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE) || ( (prog->reganch & ROPT_ANCH_BOL) - && !PL_multiline ) ); + && !PL_multiline ) ); /* Check after \n? */ if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) { - /* Anchored... */ + /* Substring at constant offset from beg-of-str... */ I32 slen; if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */ - && (sv && (strpos + SvCUR(sv) != strend)) ) + && (sv && (strpos + SvCUR(sv) != strend)) ) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; - - s = (char*)HOP((U8*)strpos, prog->check_offset_min); + } + PL_regeol = strend; /* Used in HOP() */ + s = HOPc(strpos, prog->check_offset_min); if (SvTAIL(prog->check_substr)) { slen = SvCUR(prog->check_substr); /* >= 1 */ - if ( strend - s > slen || strend - s < slen - 1 ) { - s = Nullch; - goto finish; - } - if ( strend - s == slen && strend[-1] != '\n') { - s = Nullch; - goto finish; + if ( strend - s > slen || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n')) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); + goto fail_finish; } /* Now should match s[0..slen-2] */ slen--; if (slen && (*SvPVX(prog->check_substr) != *s || (slen > 1 - && memNE(SvPVX(prog->check_substr), s, slen)))) - s = Nullch; + && memNE(SvPVX(prog->check_substr), s, slen)))) { + report_neq: + DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); + goto fail_finish; + } } else if (*SvPVX(prog->check_substr) != *s || ((slen = SvCUR(prog->check_substr)) > 1 && memNE(SvPVX(prog->check_substr), s, slen))) - s = Nullch; - else - s = strpos; - goto finish; + goto report_neq; + goto success_at_start; } + /* Match is anchored, but substr is not anchored wrt beg-of-str. */ s = strpos; - if (!ml_anch && (s + prog->check_offset_max < strend - prog->minlen)) - end_shift += strend - s - prog->minlen - prog->check_offset_max; + start_shift = prog->check_offset_min; /* okay to underestimate on CC */ + /* Should be nonnegative! */ + end_shift = prog->minlen - start_shift - + CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); + if (!ml_anch) { + I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr) + - (SvTAIL(prog->check_substr) != 0); + I32 eshift = strend - s - end; + + if (end_shift < eshift) + end_shift = eshift; + } } - else { + else { /* Can match at random position */ ml_anch = 0; s = strpos; + start_shift = prog->check_offset_min; /* okay to underestimate on CC */ + /* Should be nonnegative! */ + end_shift = prog->minlen - start_shift - + CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); } +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ + if (end_shift < 0) + croak("panic: end_shift"); +#endif + + check = prog->check_substr; restart: + /* Find a possible match in the region s..strend by looking for + the "check" substring in the region corrected by start/end_shift. */ if (flags & REXEC_SCREAM) { - SV *c = prog->check_substr; char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */ I32 p = -1; /* Internal iterator of scream. */ I32 *pp = data ? data->scream_pos : &p; - if (PL_screamfirst[BmRARE(c)] >= 0 - || ( BmRARE(c) == '\n' - && (BmPREVIOUS(c) == SvCUR(c) - 1) - && SvTAIL(c) )) - s = screaminstr(sv, prog->check_substr, - start_shift + (strpos - strbeg), end_shift, pp, 0); + if (PL_screamfirst[BmRARE(check)] >= 0 + || ( BmRARE(check) == '\n' + && (BmPREVIOUS(check) == SvCUR(check) - 1) + && SvTAIL(check) )) + s = screaminstr(sv, check, + start_shift + (s - strbeg), end_shift, pp, 0); else - s = Nullch; + goto fail_finish; if (data) *data->scream_olds = s; } else s = fbm_instr((unsigned char*)s + start_shift, (unsigned char*)strend - end_shift, - prog->check_substr, PL_multiline ? FBMrf_MULTILINE : 0); + check, PL_multiline ? FBMrf_MULTILINE : 0); /* Update the count-of-usability, remove useless subpatterns, unshift s. */ - finish: - if (!s) { - ++BmUSEFUL(prog->check_substr); /* hooray */ - goto fail; /* not present */ + + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s", + (s ? "Found" : "Did not find"), + ((check == prog->anchored_substr) ? "anchored" : "floating"), + PL_colors[0], + SvCUR(check) - (SvTAIL(check)!=0), SvPVX(check), + PL_colors[1], (SvTAIL(check) ? "$" : ""), + (s ? " at offset " : "...\n") ) ); + + if (!s) + goto fail_finish; + + /* Finish the diagnostic message */ + DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - strpos)) ); + + /* Got a candidate. Check MBOL anchoring, and the *other* substr. + Start with the other substr. + XXXX no SCREAM optimization yet - and a very coarse implementation + XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will + *always* match. Probably should be marked during compile... + Probably it is right to do no SCREAM here... + */ + + if (prog->float_substr && prog->anchored_substr) { + /* Take into account the anchored substring. */ + /* XXXX May be hopelessly wrong for UTF... */ + if (!other_last) + other_last = strpos - 1; + if (check == prog->float_substr) { + char *last = s - start_shift, *last1, *last2; + char *s1 = s; + + tmp = PL_bostr; + t = s - prog->check_offset_max; + if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ + && (!(prog->reganch & ROPT_UTF8) + || (PL_bostr = strpos, /* Used in regcopmaybe() */ + (t = reghopmaybe_c(s, -(prog->check_offset_max))) + && t > strpos))) + ; + else + t = strpos; + t += prog->anchored_offset; + if (t <= other_last) + t = other_last + 1; + PL_bostr = tmp; + last2 = last1 = strend - prog->minlen; + if (last < last1) + last1 = last; + /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ + /* On end-of-str: see comment below. */ + s = fbm_instr((unsigned char*)t, + (unsigned char*)last1 + prog->anchored_offset + + SvCUR(prog->anchored_substr) + - (SvTAIL(prog->anchored_substr)!=0), + prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s", + (s ? "Found" : "Contradicts"), + PL_colors[0], + SvCUR(prog->anchored_substr) + - (SvTAIL(prog->anchored_substr)!=0), + SvPVX(prog->anchored_substr), + PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : ""))); + if (!s) { + if (last1 >= last2) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", trying floating at offset %ld...\n", + (long)(s1 + 1 - strpos))); + PL_regeol = strend; /* Used in HOP() */ + other_last = last1 + prog->anchored_offset; + s = HOPc(last, 1); + goto restart; + } + else { + DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + t = s - prog->anchored_offset; + other_last = s - 1; + if (t == strpos) + goto try_at_start; + s = s1; + goto try_at_offset; + } + } + else { /* Take into account the floating substring. */ + char *last, *last1; + char *s1 = s; + + t = s - start_shift; + last1 = last = strend - prog->minlen + prog->float_min_offset; + if (last - t > prog->float_max_offset) + last = t + prog->float_max_offset; + s = t + prog->float_min_offset; + if (s <= other_last) + s = other_last + 1; + /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ + /* fbm_instr() takes into account exact value of end-of-str + if the check is SvTAIL(ed). Since false positives are OK, + and end-of-str is not later than strend we are OK. */ + s = fbm_instr((unsigned char*)s, + (unsigned char*)last + SvCUR(prog->float_substr) + - (SvTAIL(prog->float_substr)!=0), + prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s", + (s ? "Found" : "Contradicts"), + PL_colors[0], + SvCUR(prog->float_substr) + - (SvTAIL(prog->float_substr)!=0), + SvPVX(prog->float_substr), + PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : ""))); + if (!s) { + if (last1 == last) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", trying anchored starting at offset %ld...\n", + (long)(s1 + 1 - strpos))); + other_last = last; + PL_regeol = strend; /* Used in HOP() */ + s = HOPc(t, 1); + goto restart; + } + else { + DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + other_last = s - 1; + if (t == strpos) + goto try_at_start; + s = s1; + goto try_at_offset; + } + } } - else if (s - strpos > prog->check_offset_max && - ((prog->reganch & ROPT_UTF8) - ? ((t = reghopmaybe_c(s, -(prog->check_offset_max))) - && t >= strpos) - : (t = s - prog->check_offset_max) != 0) ) { + + t = s - prog->check_offset_max; + tmp = PL_bostr; + if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ + && (!(prog->reganch & ROPT_UTF8) + || (PL_bostr = strpos, /* Used in regcopmaybe() */ + ((t = reghopmaybe_c(s, -(prog->check_offset_max))) + && t > strpos)))) { + PL_bostr = tmp; + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + try_at_offset: if (ml_anch && t[-1] != '\n') { - find_anchor: - while (t < strend - end_shift - prog->minlen) { + find_anchor: /* Eventually fbm_*() should handle this */ + while (t < strend - prog->minlen) { if (*t == '\n') { if (t < s - prog->check_offset_min) { s = t + 1; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", + PL_colors[0],PL_colors[1], (long)(s - strpos))); goto set_useful; } + DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n", + PL_colors[0],PL_colors[1], (long)(t + 1 - strpos))); s = t + 1; goto restart; } t++; } - s = Nullch; - goto finish; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", + PL_colors[0],PL_colors[1])); + goto fail_finish; } s = t; set_useful: - ++BmUSEFUL(prog->check_substr); /* hooray/2 */ + ++BmUSEFUL(prog->check_substr); /* hooray/5 */ } else { - if (ml_anch && sv + PL_bostr = tmp; + /* The found string does not prohibit matching at beg-of-str + - no optimization of calling REx engine can be performed, + unless it was an MBOL and we are not after MBOL. */ + try_at_start: + /* Even in this situation we may use MBOL flag if strpos is offset + wrt the start of the string. */ + if (ml_anch && sv && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') { t = strpos; goto find_anchor; } + success_at_start: if (!(prog->reganch & ROPT_NAUGHTY) && --BmUSEFUL(prog->check_substr) < 0 && prog->check_substr == prog->float_substr) { /* boo */ @@ -437,11 +611,14 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, s = strpos; } - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sFound%s at offset %ld\n", - PL_colors[4],PL_colors[5], (long)(s - strpos)) ); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(s - strpos)) ); return s; + + fail_finish: /* Substring not found */ + BmUSEFUL(prog->check_substr) += 5; /* hooray */ fail: - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sNot found...%s\n", + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", PL_colors[4],PL_colors[5])); return Nullch; } @@ -506,6 +683,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags = 0; PL_reg_eval_set = 0; + PL_reg_maxiter = 0; if (prog->reganch & ROPT_UTF8) PL_reg_flags |= RF_utf8; @@ -554,7 +732,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * DEBUG_r( if (!PL_colorset) reginitcolors() ); DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sMatching%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", + "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], prog->precomp, PL_colors[1], @@ -1144,961 +1322,153 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * s += UTF8SKIP(s); } break; - case ALNUMC: - while (s < strend) { - if (isALNUMC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; + } + } + else { + dontbother = 0; + if (prog->float_substr != Nullsv) { /* Trim the end. */ + char *last; + I32 oldpos = scream_pos; + + if (flags & REXEC_SCREAM) { + last = screaminstr(sv, prog->float_substr, s - strbeg, + end_shift, &scream_pos, 1); /* last one */ + if (!last) + last = scream_olds; /* Only one occurence. */ } - break; - case ALNUMCUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_alnumc, (U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; + else { + STRLEN len; + char *little = SvPV(prog->float_substr, len); + + if (SvTAIL(prog->float_substr)) { + if (memEQ(strend - len + 1, little, len - 1)) + last = strend - len + 1; + else if (!PL_multiline) + last = memEQ(strend - len, little, len) + ? strend - len : Nullch; else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case ALNUMCL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isALNUMC_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; + goto find_last; + } else { + find_last: + if (len) + last = rninstr(s, strend, little, little + len); else - tmp = doevery; + last = strend; /* matching `$' */ } - else - tmp = 1; - s++; } - break; - case ALNUMCLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isALNUMC_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; + if (last == NULL) goto phooey; /* Should not happen! */ + dontbother = strend - last + prog->float_min_offset; + } + if (minlen && (dontbother < minlen)) + dontbother = minlen - 1; + strend -= dontbother; /* this one's always in bytes! */ + /* We don't know much -- general case. */ + if (UTF) { + for (;;) { + if (regtry(prog, s)) + goto got_it; + if (s >= strend) + break; s += UTF8SKIP(s); + }; + } + else { + do { + if (regtry(prog, s)) + goto got_it; + } while (s++ < strend); + } + } + + /* Failure. */ + goto phooey; + +got_it: + RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted); + + if (PL_reg_eval_set) { + /* Preserve the current value of $^R */ + if (oreplsv != GvSV(PL_replgv)) + sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is + restored, the value remains + the same. */ + restore_pos(aTHXo_ 0); + } + + /* make sure $`, $&, $', and $digit will work later */ + if ( !(flags & REXEC_NOT_FIRST) ) { + if (RX_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RX_MATCH_COPIED_off(prog); + } + if (flags & REXEC_COPY_STR) { + I32 i = PL_regeol - startpos + (stringarg - strbeg); + + s = savepvn(strbeg, i); + prog->subbeg = s; + prog->sublen = i; + RX_MATCH_COPIED_on(prog); + } + else { + prog->subbeg = strbeg; + prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ + } + } + + return 1; + +phooey: + if (PL_reg_eval_set) + restore_pos(aTHXo_ 0); + return 0; +} + +/* + - regtry - try match at specific point + */ +STATIC I32 /* 0 failure, 1 success */ +S_regtry(pTHX_ regexp *prog, char *startpos) +{ + dTHR; + register I32 i; + register I32 *sp; + register I32 *ep; + CHECKPOINT lastcp; + + if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) { + MAGIC *mg; + + PL_reg_eval_set = RS_init; + DEBUG_r(DEBUG_s( + PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %i\n", + PL_stack_sp - PL_stack_base); + )); + SAVEINT(cxstack[cxstack_ix].blk_oldsp); + cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; + /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ + SAVETMPS; + /* Apparently this is not needed, judging by wantarray. */ + /* SAVEINT(cxstack[cxstack_ix].blk_gimme); + cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ + + if (PL_reg_sv) { + /* Make $_ available to executed code. */ + if (PL_reg_sv != DEFSV) { + /* SAVE_DEFSV does *not* suffice here for USE_THREADS */ + SAVESPTR(DEFSV); + DEFSV = PL_reg_sv; } - break; - case NALNUMC: - while (s < strend) { - if (!isALNUMC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NALNUMCUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_alnumc, (U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NALNUMCL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isALNUMC_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NALNUMCLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isALNUMC_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case ASCII: - while (s < strend) { - if (isASCII(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NASCII: - while (s < strend) { - if (!isASCII(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case CNTRL: - while (s < strend) { - if (isCNTRL(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case CNTRLUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_cntrl,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case CNTRLL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isCNTRL_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case CNTRLLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (*s == ' ' || isCNTRL_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NCNTRL: - while (s < strend) { - if (!isCNTRL(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NCNTRLUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_cntrl,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NCNTRLL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isCNTRL_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NCNTRLLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isCNTRL_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case GRAPH: - while (s < strend) { - if (isGRAPH(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case GRAPHUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_graph,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case GRAPHL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isGRAPH_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case GRAPHLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (*s == ' ' || isGRAPH_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NGRAPH: - while (s < strend) { - if (!isGRAPH(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NGRAPHUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_graph,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NGRAPHL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isGRAPH_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NGRAPHLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isGRAPH_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case LOWER: - while (s < strend) { - if (isLOWER(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case LOWERUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_lower,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case LOWERL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isLOWER_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case LOWERLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (*s == ' ' || isLOWER_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NLOWER: - while (s < strend) { - if (!isLOWER(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NLOWERUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_lower,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NLOWERL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isLOWER_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NLOWERLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isLOWER_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case PRINT: - while (s < strend) { - if (isPRINT(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case PRINTUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_print,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case PRINTL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isPRINT_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case PRINTLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (*s == ' ' || isPRINT_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NPRINT: - while (s < strend) { - if (!isPRINT(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NPRINTUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_print,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NPRINTL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isPRINT_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NPRINTLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isPRINT_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case PUNCT: - while (s < strend) { - if (isPUNCT(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case PUNCTUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_punct,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case PUNCTL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isPUNCT_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case PUNCTLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (*s == ' ' || isPUNCT_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NPUNCT: - while (s < strend) { - if (!isPUNCT(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NPUNCTUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_punct,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NPUNCTL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isPUNCT_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NPUNCTLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isPUNCT_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case UPPER: - while (s < strend) { - if (isUPPER(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case UPPERUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_upper,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case UPPERL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isUPPER_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case UPPERLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (*s == ' ' || isUPPER_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NUPPER: - while (s < strend) { - if (!isUPPER(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NUPPERUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_upper,(U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case NUPPERL: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isUPPER_LC(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NUPPERLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isUPPER_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case XDIGIT: - while (s < strend) { - if (isXDIGIT(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case NXDIGIT: - while (s < strend) { - if (!isXDIGIT(*s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - } - } - else { - dontbother = 0; - if (prog->float_substr != Nullsv) { /* Trim the end. */ - char *last; - I32 oldpos = scream_pos; - - if (flags & REXEC_SCREAM) { - last = screaminstr(sv, prog->float_substr, s - strbeg, - end_shift, &scream_pos, 1); /* last one */ - if (!last) - last = scream_olds; /* Only one occurence. */ - } - else { - STRLEN len; - char *little = SvPV(prog->float_substr, len); - - if (SvTAIL(prog->float_substr)) { - if (memEQ(strend - len + 1, little, len - 1)) - last = strend - len + 1; - else if (!PL_multiline) - last = memEQ(strend - len, little, len) - ? strend - len : Nullch; - else - goto find_last; - } else { - find_last: - if (len) - last = rninstr(s, strend, little, little + len); - else - last = strend; /* matching `$' */ - } - } - if (last == NULL) goto phooey; /* Should not happen! */ - dontbother = strend - last + prog->float_min_offset; - } - if (minlen && (dontbother < minlen)) - dontbother = minlen - 1; - strend -= dontbother; /* this one's always in bytes! */ - /* We don't know much -- general case. */ - if (UTF) { - for (;;) { - if (regtry(prog, s)) - goto got_it; - if (s >= strend) - break; - s += UTF8SKIP(s); - }; - } - else { - do { - if (regtry(prog, s)) - goto got_it; - } while (s++ < strend); - } - } - - /* Failure. */ - goto phooey; - -got_it: - RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted); - - if (PL_reg_eval_set) { - /* Preserve the current value of $^R */ - if (oreplsv != GvSV(PL_replgv)) - sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is - restored, the value remains - the same. */ - restore_pos(0); - } - - /* make sure $`, $&, $', and $digit will work later */ - if ( !(flags & REXEC_NOT_FIRST) ) { - if (RX_MATCH_COPIED(prog)) { - Safefree(prog->subbeg); - RX_MATCH_COPIED_off(prog); - } - if (flags & REXEC_COPY_STR) { - I32 i = PL_regeol - startpos + (stringarg - strbeg); - - s = savepvn(strbeg, i); - prog->subbeg = s; - prog->sublen = i; - RX_MATCH_COPIED_on(prog); - } - else { - prog->subbeg = strbeg; - prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ - } - } - - return 1; - -phooey: - if (PL_reg_eval_set) - restore_pos(0); - return 0; -} - -/* - - regtry - try match at specific point - */ -STATIC I32 /* 0 failure, 1 success */ -S_regtry(pTHX_ regexp *prog, char *startpos) -{ - dTHR; - register I32 i; - register I32 *sp; - register I32 *ep; - CHECKPOINT lastcp; - - if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) { - MAGIC *mg; - - PL_reg_eval_set = RS_init; - DEBUG_r(DEBUG_s( - PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %i\n", - PL_stack_sp - PL_stack_base); - )); - SAVEINT(cxstack[cxstack_ix].blk_oldsp); - cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; - /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ - SAVETMPS; - /* Apparently this is not needed, judging by wantarray. */ - /* SAVEINT(cxstack[cxstack_ix].blk_gimme); - cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ - - if (PL_reg_sv) { - /* Make $_ available to executed code. */ - if (PL_reg_sv != DEFSV) { - /* SAVE_DEFSV does *not* suffice here for USE_THREADS */ - SAVESPTR(DEFSV); - DEFSV = PL_reg_sv; - } - - if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv) - && (mg = mg_find(PL_reg_sv, 'g')))) { - /* prepare for quick setting of pos */ - sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0); - mg = mg_find(PL_reg_sv, 'g'); - mg->mg_len = -1; + + if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv) + && (mg = mg_find(PL_reg_sv, 'g')))) { + /* prepare for quick setting of pos */ + sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0); + mg = mg_find(PL_reg_sv, 'g'); + mg->mg_len = -1; } PL_reg_magic = mg; PL_reg_oldpos = mg->mg_len; - SAVEDESTRUCTOR(S_restore_pos, 0); + SAVEDESTRUCTOR(restore_pos, 0); } if (!PL_reg_curpm) New(22,PL_reg_curpm, 1, PMOP); @@ -2171,806 +1541,289 @@ S_regtry(pTHX_ regexp *prog, char *startpos) STATIC I32 /* 0 failure, 1 success */ S_regmatch(pTHX_ regnode *prog) { - dTHR; - register regnode *scan; /* Current node. */ - regnode *next; /* Next node. */ - regnode *inner; /* Next node in internal branch. */ - register I32 nextchr; /* renamed nextchr - nextchar colides with - function of same name */ - register I32 n; /* no or next */ - register I32 ln; /* len or last */ - register char *s; /* operand or save */ - register char *locinput = PL_reginput; - register I32 c1, c2, paren; /* case fold search, parenth */ - int minmod = 0, sw = 0, logical = 0; -#ifdef DEBUGGING - PL_regindent++; -#endif - - /* Note that nextchr is a byte even in UTF */ - nextchr = UCHARAT(locinput); - scan = prog; - while (scan != NULL) { -#define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO) -#ifdef DEBUGGING -# define sayYES goto yes -# define sayNO goto no -# define saySAME(x) if (x) goto yes; else goto no -# define REPORT_CODE_OFF 24 -#else -# define sayYES return 1 -# define sayNO return 0 -# define saySAME(x) return x -#endif - DEBUG_r( { - SV *prop = sv_newmortal(); - int docolor = *PL_colors[0]; - int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ - int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput); - /* The part of the string before starttry has one color - (pref0_len chars), between starttry and current - position another one (pref_len - pref0_len chars), - after the current position the third one. - We assume that pref0_len <= pref_len, otherwise we - decrease pref0_len. */ - int pref_len = (locinput - PL_bostr > (5 + taill) - l - ? (5 + taill) - l : locinput - PL_bostr); - int pref0_len = pref_len - (locinput - PL_reg_starttry); - - if (l + pref_len < (5 + taill) && l < PL_regeol - locinput) - l = ( PL_regeol - locinput > (5 + taill) - pref_len - ? (5 + taill) - pref_len : PL_regeol - locinput); - if (pref0_len < 0) - pref0_len = 0; - if (pref0_len > pref_len) - pref0_len = pref_len; - regprop(prop, scan); - PerlIO_printf(Perl_debug_log, - "%4i <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3d:%*s%s\n", - locinput - PL_bostr, - PL_colors[4], pref0_len, - locinput - pref_len, PL_colors[5], - PL_colors[2], pref_len - pref0_len, - locinput - pref_len + pref0_len, PL_colors[3], - (docolor ? "" : "> <"), - PL_colors[0], l, locinput, PL_colors[1], - 15 - l - pref_len + 1, - "", - scan - PL_regprogram, PL_regindent*2, "", - SvPVX(prop)); - } ); - - next = scan + NEXT_OFF(scan); - if (next == scan) - next = NULL; - - switch (OP(scan)) { - case BOL: - if (locinput == PL_bostr - ? PL_regprev == '\n' - : (PL_multiline && - (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) - { - /* regtill = regbol; */ - break; - } - sayNO; - case MBOL: - if (locinput == PL_bostr - ? PL_regprev == '\n' - : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) - { - break; - } - sayNO; - case SBOL: - if (locinput == PL_regbol && PL_regprev == '\n') - break; - sayNO; - case GPOS: - if (locinput == PL_reg_ganch) - break; - sayNO; - case EOL: - if (PL_multiline) - goto meol; - else - goto seol; - case MEOL: - meol: - if ((nextchr || locinput < PL_regeol) && nextchr != '\n') - sayNO; - break; - case SEOL: - seol: - if ((nextchr || locinput < PL_regeol) && nextchr != '\n') - sayNO; - if (PL_regeol - locinput > 1) - sayNO; - break; - case EOS: - if (PL_regeol != locinput) - sayNO; - break; - case SANYUTF8: - if (nextchr & 0x80) { - locinput += PL_utf8skip[nextchr]; - if (locinput > PL_regeol) - sayNO; - nextchr = UCHARAT(locinput); - break; - } - if (!nextchr && locinput >= PL_regeol) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case SANY: - if (!nextchr && locinput >= PL_regeol) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ANYUTF8: - if (nextchr & 0x80) { - locinput += PL_utf8skip[nextchr]; - if (locinput > PL_regeol) - sayNO; - nextchr = UCHARAT(locinput); - break; - } - if (!nextchr && locinput >= PL_regeol || nextchr == '\n') - sayNO; - nextchr = UCHARAT(++locinput); - break; - case REG_ANY: - if (!nextchr && locinput >= PL_regeol || nextchr == '\n') - sayNO; - nextchr = UCHARAT(++locinput); - break; - case EXACT: - s = (char *) OPERAND(scan); - ln = UCHARAT(s++); - /* Inline the first character, for speed. */ - if (UCHARAT(s) != nextchr) - sayNO; - if (PL_regeol - locinput < ln) - sayNO; - if (ln > 1 && memNE(s, locinput, ln)) - sayNO; - locinput += ln; - nextchr = UCHARAT(locinput); - break; - case EXACTFL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case EXACTF: - s = (char *) OPERAND(scan); - ln = UCHARAT(s++); - - if (UTF) { - char *l = locinput; - char *e = s + ln; - c1 = OP(scan) == EXACTF; - while (s < e) { - if (l >= PL_regeol) - sayNO; - if (utf8_to_uv((U8*)s, 0) != (c1 ? - toLOWER_utf8((U8*)l) : - toLOWER_LC_utf8((U8*)l))) - { - sayNO; - } - s += UTF8SKIP(s); - l += UTF8SKIP(l); - } - locinput = l; - nextchr = UCHARAT(locinput); - break; - } + dTHR; + register regnode *scan; /* Current node. */ + regnode *next; /* Next node. */ + regnode *inner; /* Next node in internal branch. */ + register I32 nextchr; /* renamed nextchr - nextchar colides with + function of same name */ + register I32 n; /* no or next */ + register I32 ln; /* len or last */ + register char *s; /* operand or save */ + register char *locinput = PL_reginput; + register I32 c1, c2, paren; /* case fold search, parenth */ + int minmod = 0, sw = 0, logical = 0; +#ifdef DEBUGGING + PL_regindent++; +#endif - /* Inline the first character, for speed. */ - if (UCHARAT(s) != nextchr && - UCHARAT(s) != ((OP(scan) == EXACTF) - ? PL_fold : PL_fold_locale)[nextchr]) - sayNO; - if (PL_regeol - locinput < ln) - sayNO; - if (ln > 1 && (OP(scan) == EXACTF - ? ibcmp(s, locinput, ln) - : ibcmp_locale(s, locinput, ln))) - sayNO; - locinput += ln; - nextchr = UCHARAT(locinput); - break; - case ANYOFUTF8: - s = (char *) OPERAND(scan); - if (!REGINCLASSUTF8(scan, (U8*)locinput)) - sayNO; - if (locinput >= PL_regeol) - sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - case ANYOF: - s = (char *) OPERAND(scan); - if (nextchr < 0) - nextchr = UCHARAT(locinput); - if (!REGINCLASS(s, nextchr)) - sayNO; - if (!nextchr && locinput >= PL_regeol) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ALNUML: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case ALNUM: - if (!nextchr) - sayNO; - if (!(OP(scan) == ALNUM - ? isALNUM(nextchr) : isALNUM_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ALNUMLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case ALNUMUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == ALNUMUTF8 - ? swash_fetch(PL_utf8_alnum, (U8*)locinput) - : isALNUM_LC_utf8((U8*)locinput))) - { - sayNO; - } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (!(OP(scan) == ALNUMUTF8 - ? isALNUM(nextchr) : isALNUM_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NALNUML: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NALNUM: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (OP(scan) == NALNUM - ? isALNUM(nextchr) : isALNUM_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NALNUMLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NALNUMUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NALNUMUTF8 - ? swash_fetch(PL_utf8_alnum, (U8*)locinput) - : isALNUM_LC_utf8((U8*)locinput)) - { - sayNO; - } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (OP(scan) == NALNUMUTF8 - ? isALNUM(nextchr) : isALNUM_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case BOUNDL: - case NBOUNDL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUND: - case NBOUND: - /* was last char in word? */ - ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev; - if (OP(scan) == BOUND || OP(scan) == NBOUND) { - ln = isALNUM(ln); - n = isALNUM(nextchr); - } - else { - ln = isALNUM_LC(ln); - n = isALNUM_LC(nextchr); - } - if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL)) - sayNO; - break; - case BOUNDLUTF8: - case NBOUNDLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUNDUTF8: - case NBOUNDUTF8: - /* was last char in word? */ - ln = (locinput != PL_regbol) - ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev; - if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { - ln = isALNUM_uni(ln); - n = swash_fetch(PL_utf8_alnum, (U8*)locinput); - } - else { - ln = isALNUM_LC_uni(ln); - n = isALNUM_LC_utf8((U8*)locinput); - } - if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8)) - sayNO; - break; - case SPACEL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case SPACE: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (!(OP(scan) == SPACE - ? isSPACE(nextchr) : isSPACE_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case SPACELUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case SPACEUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == SPACEUTF8 - ? swash_fetch(PL_utf8_space,(U8*)locinput) - : isSPACE_LC_utf8((U8*)locinput))) - { - sayNO; - } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (!(OP(scan) == SPACEUTF8 - ? isSPACE(nextchr) : isSPACE_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NSPACEL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NSPACE: - if (!nextchr) - sayNO; - if (OP(scan) == SPACE - ? isSPACE(nextchr) : isSPACE_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NSPACELUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NSPACEUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NSPACEUTF8 - ? swash_fetch(PL_utf8_space,(U8*)locinput) - : isSPACE_LC_utf8((U8*)locinput)) - { - sayNO; - } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); + /* Note that nextchr is a byte even in UTF */ + nextchr = UCHARAT(locinput); + scan = prog; + while (scan != NULL) { +#define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO) +#ifdef DEBUGGING +# define sayYES goto yes +# define sayNO goto no +# define saySAME(x) if (x) goto yes; else goto no +# define REPORT_CODE_OFF 24 +#else +# define sayYES return 1 +# define sayNO return 0 +# define saySAME(x) return x +#endif + DEBUG_r( { + SV *prop = sv_newmortal(); + int docolor = *PL_colors[0]; + int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ + int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput); + /* The part of the string before starttry has one color + (pref0_len chars), between starttry and current + position another one (pref_len - pref0_len chars), + after the current position the third one. + We assume that pref0_len <= pref_len, otherwise we + decrease pref0_len. */ + int pref_len = (locinput - PL_bostr > (5 + taill) - l + ? (5 + taill) - l : locinput - PL_bostr); + int pref0_len = pref_len - (locinput - PL_reg_starttry); + + if (l + pref_len < (5 + taill) && l < PL_regeol - locinput) + l = ( PL_regeol - locinput > (5 + taill) - pref_len + ? (5 + taill) - pref_len : PL_regeol - locinput); + if (pref0_len < 0) + pref0_len = 0; + if (pref0_len > pref_len) + pref0_len = pref_len; + regprop(prop, scan); + PerlIO_printf(Perl_debug_log, + "%4i <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3d:%*s%s\n", + locinput - PL_bostr, + PL_colors[4], pref0_len, + locinput - pref_len, PL_colors[5], + PL_colors[2], pref_len - pref0_len, + locinput - pref_len + pref0_len, PL_colors[3], + (docolor ? "" : "> <"), + PL_colors[0], l, locinput, PL_colors[1], + 15 - l - pref_len + 1, + "", + scan - PL_regprogram, PL_regindent*2, "", + SvPVX(prop)); + } ); + + next = scan + NEXT_OFF(scan); + if (next == scan) + next = NULL; + + switch (OP(scan)) { + case BOL: + if (locinput == PL_bostr + ? PL_regprev == '\n' + : (PL_multiline && + (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) + { + /* regtill = regbol; */ break; } - if (OP(scan) == NSPACEUTF8 - ? isSPACE(nextchr) : isSPACE_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case DIGITL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case DIGIT: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (!(OP(scan) == DIGIT - ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case DIGITLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case DIGITUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NDIGITUTF8 - ? swash_fetch(PL_utf8_digit,(U8*)locinput) - : isDIGIT_LC_utf8((U8*)locinput)) - { - sayNO; - } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); + sayNO; + case MBOL: + if (locinput == PL_bostr + ? PL_regprev == '\n' + : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) + { break; } - if (!isDIGIT(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NDIGITL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NDIGIT: - if (!nextchr) - sayNO; - if (OP(scan) == DIGIT - ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NDIGITLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NDIGITUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_digit,(U8*)locinput)) - sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); + sayNO; + case SBOL: + if (locinput == PL_regbol && PL_regprev == '\n') break; - } - if (isDIGIT(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ALNUMCL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case ALNUMC: - if (!nextchr) - sayNO; - if (!(OP(scan) == ALNUMC - ? isALNUMC(nextchr) : isALNUMC_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ALNUMCLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case ALNUMCUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == ALNUMCUTF8 - ? swash_fetch(PL_utf8_alnumc, (U8*)locinput) - : isALNUMC_LC_utf8((U8*)locinput))) - { - sayNO; - } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); + sayNO; + case GPOS: + if (locinput == PL_reg_ganch) break; - } - if (!(OP(scan) == ALNUMCUTF8 - ? isALNUMC(nextchr) : isALNUMC_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NALNUMCL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NALNUMC: - if (!nextchr) - sayNO; - if (OP(scan) == ALNUMC - ? isALNUMC(nextchr) : isALNUMC_LC(nextchr)) + sayNO; + case EOL: + if (PL_multiline) + goto meol; + else + goto seol; + case MEOL: + meol: + if ((nextchr || locinput < PL_regeol) && nextchr != '\n') sayNO; - nextchr = UCHARAT(++locinput); break; - case NALNUMCLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NALNUMCUTF8: - if (!nextchr && locinput >= PL_regeol) + case SEOL: + seol: + if ((nextchr || locinput < PL_regeol) && nextchr != '\n') sayNO; - if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_alnumc,(U8*)locinput)) - sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (isALNUMC(nextchr)) + if (PL_regeol - locinput > 1) sayNO; - nextchr = UCHARAT(++locinput); break; - case ALPHAL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case ALPHA: - if (!nextchr) - sayNO; - if (!(OP(scan) == ALPHA - ? isALPHA(nextchr) : isALPHA_LC(nextchr))) + case EOS: + if (PL_regeol != locinput) sayNO; - nextchr = UCHARAT(++locinput); break; - case ALPHALUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case ALPHAUTF8: - if (!nextchr) - sayNO; + case SANYUTF8: if (nextchr & 0x80) { - if (!(OP(scan) == ALPHAUTF8 - ? swash_fetch(PL_utf8_alpha, (U8*)locinput) - : isALPHA_LC_utf8((U8*)locinput))) - { - sayNO; - } locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (!(OP(scan) == ALPHAUTF8 - ? isALPHA(nextchr) : isALPHA_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NALPHAL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NALPHA: - if (!nextchr) - sayNO; - if (OP(scan) == ALPHA - ? isALPHA(nextchr) : isALPHA_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NALPHALUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NALPHAUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_alpha,(U8*)locinput)) + if (locinput > PL_regeol) sayNO; - locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (isALPHA(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ASCII: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (!isASCII(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NASCII: if (!nextchr && locinput >= PL_regeol) sayNO; - if (isASCII(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case CNTRLL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case CNTRL: - if (!nextchr) - sayNO; - if (!(OP(scan) == CNTRL - ? isCNTRL(nextchr) : isCNTRL_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case CNTRLLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case CNTRLUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == CNTRLUTF8 - ? swash_fetch(PL_utf8_cntrl, (U8*)locinput) - : isCNTRL_LC_utf8((U8*)locinput))) - { - sayNO; - } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (!(OP(scan) == CNTRLUTF8 - ? isCNTRL(nextchr) : isCNTRL_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NCNTRLL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NCNTRL: - if (!nextchr) - sayNO; - if (OP(scan) == CNTRL - ? isCNTRL(nextchr) : isCNTRL_LC(nextchr)) - sayNO; nextchr = UCHARAT(++locinput); break; - case NCNTRLLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NCNTRLUTF8: + case SANY: if (!nextchr && locinput >= PL_regeol) sayNO; - if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_cntrl,(U8*)locinput)) - sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (isCNTRL(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case GRAPHL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case GRAPH: - if (!nextchr) - sayNO; - if (!(OP(scan) == GRAPH - ? isGRAPH(nextchr) : isGRAPH_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case GRAPHLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case GRAPHUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == GRAPHUTF8 - ? swash_fetch(PL_utf8_graph, (U8*)locinput) - : isGRAPH_LC_utf8((U8*)locinput))) - { - sayNO; - } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (!(OP(scan) == GRAPHUTF8 - ? isGRAPH(nextchr) : isGRAPH_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NGRAPHL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NGRAPH: - if (!nextchr) - sayNO; - if (OP(scan) == GRAPH - ? isGRAPH(nextchr) : isGRAPH_LC(nextchr)) - sayNO; nextchr = UCHARAT(++locinput); break; - case NGRAPHLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NGRAPHUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; + case ANYUTF8: if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_graph,(U8*)locinput)) - sayNO; locinput += PL_utf8skip[nextchr]; + if (locinput > PL_regeol) + sayNO; nextchr = UCHARAT(locinput); break; } - if (isGRAPH(nextchr)) + if (!nextchr && locinput >= PL_regeol || nextchr == '\n') sayNO; nextchr = UCHARAT(++locinput); break; - case LOWERL: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case LOWER: - if (!nextchr) - sayNO; - if (!(OP(scan) == LOWER - ? isLOWER(nextchr) : isLOWER_LC(nextchr))) + case REG_ANY: + if (!nextchr && locinput >= PL_regeol || nextchr == '\n') sayNO; nextchr = UCHARAT(++locinput); break; - case LOWERLUTF8: + case EXACT: + s = (char *) OPERAND(scan); + ln = UCHARAT(s++); + /* Inline the first character, for speed. */ + if (UCHARAT(s) != nextchr) + sayNO; + if (PL_regeol - locinput < ln) + sayNO; + if (ln > 1 && memNE(s, locinput, ln)) + sayNO; + locinput += ln; + nextchr = UCHARAT(locinput); + break; + case EXACTFL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case LOWERUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == LOWERUTF8 - ? swash_fetch(PL_utf8_lower, (U8*)locinput) - : isLOWER_LC_utf8((U8*)locinput))) - { - sayNO; + case EXACTF: + s = (char *) OPERAND(scan); + ln = UCHARAT(s++); + + if (UTF) { + char *l = locinput; + char *e = s + ln; + c1 = OP(scan) == EXACTF; + while (s < e) { + if (l >= PL_regeol) + sayNO; + if (utf8_to_uv((U8*)s, 0) != (c1 ? + toLOWER_utf8((U8*)l) : + toLOWER_LC_utf8((U8*)l))) + { + sayNO; + } + s += UTF8SKIP(s); + l += UTF8SKIP(l); } - locinput += PL_utf8skip[nextchr]; + locinput = l; nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == LOWERUTF8 - ? isLOWER(nextchr) : isLOWER_LC(nextchr))) + + /* Inline the first character, for speed. */ + if (UCHARAT(s) != nextchr && + UCHARAT(s) != ((OP(scan) == EXACTF) + ? PL_fold : PL_fold_locale)[nextchr]) + sayNO; + if (PL_regeol - locinput < ln) + sayNO; + if (ln > 1 && (OP(scan) == EXACTF + ? ibcmp(s, locinput, ln) + : ibcmp_locale(s, locinput, ln))) + sayNO; + locinput += ln; + nextchr = UCHARAT(locinput); + break; + case ANYOFUTF8: + s = (char *) OPERAND(scan); + if (!REGINCLASSUTF8(scan, (U8*)locinput)) + sayNO; + if (locinput >= PL_regeol) + sayNO; + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + case ANYOF: + s = (char *) OPERAND(scan); + if (nextchr < 0) + nextchr = UCHARAT(locinput); + if (!REGINCLASS(s, nextchr)) + sayNO; + if (!nextchr && locinput >= PL_regeol) sayNO; nextchr = UCHARAT(++locinput); break; - case NLOWERL: + case ALNUML: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NLOWER: + case ALNUM: if (!nextchr) sayNO; - if (OP(scan) == LOWER - ? isLOWER(nextchr) : isLOWER_LC(nextchr)) + if (!(OP(scan) == ALNUM + ? isALNUM(nextchr) : isALNUM_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case NLOWERLUTF8: + case ALNUMLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NLOWERUTF8: - if (!nextchr && locinput >= PL_regeol) + case ALNUMUTF8: + if (!nextchr) sayNO; if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_lower,(U8*)locinput)) + if (!(OP(scan) == ALNUMUTF8 + ? swash_fetch(PL_utf8_alnum, (U8*)locinput) + : isALNUM_LC_utf8((U8*)locinput))) + { sayNO; + } locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (isLOWER(nextchr)) + if (!(OP(scan) == ALNUMUTF8 + ? isALNUM(nextchr) : isALNUM_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case PRINTL: + case NALNUML: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case PRINT: - if (!nextchr) + case NALNUM: + if (!nextchr && locinput >= PL_regeol) sayNO; - if (!(OP(scan) == PRINT - ? isPRINT(nextchr) : isPRINT_LC(nextchr))) + if (OP(scan) == NALNUM + ? isALNUM(nextchr) : isALNUM_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case PRINTLUTF8: + case NALNUMLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case PRINTUTF8: - if (!nextchr) + case NALNUMUTF8: + if (!nextchr && locinput >= PL_regeol) sayNO; if (nextchr & 0x80) { - if (!(OP(scan) == PRINTUTF8 - ? swash_fetch(PL_utf8_print, (U8*)locinput) - : isPRINT_LC_utf8((U8*)locinput))) + if (OP(scan) == NALNUMUTF8 + ? swash_fetch(PL_utf8_alnum, (U8*)locinput) + : isALNUM_LC_utf8((U8*)locinput)) { sayNO; } @@ -2978,60 +1831,71 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == PRINTUTF8 - ? isPRINT(nextchr) : isPRINT_LC(nextchr))) + if (OP(scan) == NALNUMUTF8 + ? isALNUM(nextchr) : isALNUM_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case NPRINTL: + case BOUNDL: + case NBOUNDL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NPRINT: - if (!nextchr) - sayNO; - if (OP(scan) == PRINT - ? isPRINT(nextchr) : isPRINT_LC(nextchr)) + case BOUND: + case NBOUND: + /* was last char in word? */ + ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev; + if (OP(scan) == BOUND || OP(scan) == NBOUND) { + ln = isALNUM(ln); + n = isALNUM(nextchr); + } + else { + ln = isALNUM_LC(ln); + n = isALNUM_LC(nextchr); + } + if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL)) sayNO; - nextchr = UCHARAT(++locinput); break; - case NPRINTLUTF8: + case BOUNDLUTF8: + case NBOUNDLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NPRINTUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_print,(U8*)locinput)) - sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; + case BOUNDUTF8: + case NBOUNDUTF8: + /* was last char in word? */ + ln = (locinput != PL_regbol) + ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev; + if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { + ln = isALNUM_uni(ln); + n = swash_fetch(PL_utf8_alnum, (U8*)locinput); + } + else { + ln = isALNUM_LC_uni(ln); + n = isALNUM_LC_utf8((U8*)locinput); } - if (isPRINT(nextchr)) + if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8)) sayNO; - nextchr = UCHARAT(++locinput); break; - case PUNCTL: + case SPACEL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case PUNCT: - if (!nextchr) + case SPACE: + if (!nextchr && locinput >= PL_regeol) sayNO; - if (!(OP(scan) == PUNCT - ? isPUNCT(nextchr) : isPUNCT_LC(nextchr))) + if (!(OP(scan) == SPACE + ? isSPACE(nextchr) : isSPACE_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case PUNCTLUTF8: + case SPACELUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case PUNCTUTF8: - if (!nextchr) + case SPACEUTF8: + if (!nextchr && locinput >= PL_regeol) sayNO; if (nextchr & 0x80) { - if (!(OP(scan) == PUNCTUTF8 - ? swash_fetch(PL_utf8_punct, (U8*)locinput) - : isPUNCT_LC_utf8((U8*)locinput))) + if (!(OP(scan) == SPACEUTF8 + ? swash_fetch(PL_utf8_space,(U8*)locinput) + : isSPACE_LC_utf8((U8*)locinput))) { sayNO; } @@ -3039,60 +1903,65 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == PUNCTUTF8 - ? isPUNCT(nextchr) : isPUNCT_LC(nextchr))) + if (!(OP(scan) == SPACEUTF8 + ? isSPACE(nextchr) : isSPACE_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case NPUNCTL: + case NSPACEL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NPUNCT: + case NSPACE: if (!nextchr) sayNO; - if (OP(scan) == PUNCT - ? isPUNCT(nextchr) : isPUNCT_LC(nextchr)) + if (OP(scan) == SPACE + ? isSPACE(nextchr) : isSPACE_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case NPUNCTLUTF8: + case NSPACELUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NPUNCTUTF8: - if (!nextchr && locinput >= PL_regeol) + case NSPACEUTF8: + if (!nextchr) sayNO; if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_punct,(U8*)locinput)) + if (OP(scan) == NSPACEUTF8 + ? swash_fetch(PL_utf8_space,(U8*)locinput) + : isSPACE_LC_utf8((U8*)locinput)) + { sayNO; + } locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (isPUNCT(nextchr)) + if (OP(scan) == NSPACEUTF8 + ? isSPACE(nextchr) : isSPACE_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case UPPERL: + case DIGITL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case UPPER: - if (!nextchr) + case DIGIT: + if (!nextchr && locinput >= PL_regeol) sayNO; - if (!(OP(scan) == UPPER - ? isUPPER(nextchr) : isUPPER_LC(nextchr))) + if (!(OP(scan) == DIGIT + ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case UPPERLUTF8: + case DIGITLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case UPPERUTF8: + case DIGITUTF8: if (!nextchr) sayNO; if (nextchr & 0x80) { - if (!(OP(scan) == UPPERUTF8 - ? swash_fetch(PL_utf8_upper, (U8*)locinput) - : isUPPER_LC_utf8((U8*)locinput))) + if (OP(scan) == NDIGITUTF8 + ? swash_fetch(PL_utf8_digit,(U8*)locinput) + : isDIGIT_LC_utf8((U8*)locinput)) { sayNO; } @@ -3100,50 +1969,35 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == UPPERUTF8 - ? isUPPER(nextchr) : isUPPER_LC(nextchr))) + if (!isDIGIT(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case NUPPERL: + case NDIGITL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NUPPER: + case NDIGIT: if (!nextchr) sayNO; - if (OP(scan) == UPPER - ? isUPPER(nextchr) : isUPPER_LC(nextchr)) + if (OP(scan) == DIGIT + ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case NUPPERLUTF8: + case NDIGITLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NUPPERUTF8: + case NDIGITUTF8: if (!nextchr && locinput >= PL_regeol) sayNO; if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_upper,(U8*)locinput)) + if (swash_fetch(PL_utf8_digit,(U8*)locinput)) sayNO; locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (isUPPER(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case XDIGIT: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (!isXDIGIT(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NXDIGIT: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (isXDIGIT(nextchr)) + if (isDIGIT(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; @@ -3164,6 +2018,7 @@ S_regmatch(pTHX_ regnode *prog) case REFF: n = ARG(scan); /* which paren pair */ ln = PL_regstartp[n]; + PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (*PL_reglastparen < n || ln == -1) sayNO; /* Do not match unless seen CLOSEn. */ if (ln == PL_regendp[n]) @@ -3308,6 +2163,10 @@ S_regmatch(pTHX_ regnode *prog) *PL_reglastparen = 0; PL_reg_call_cc = &state; PL_reginput = locinput; + + /* XXXX This is too dramatic a measure... */ + PL_reg_maxiter = 0; + if (regmatch(re->program + 1)) { ReREFCNT_dec(re); regcpblow(cp); @@ -3325,6 +2184,10 @@ S_regmatch(pTHX_ regnode *prog) PL_regcc = state.cc; PL_reg_re = state.re; cache_re(PL_reg_re); + + /* XXXX This is too dramatic a measure... */ + PL_reg_maxiter = 0; + sayNO; } sw = SvTRUE(ret); @@ -3352,6 +2215,7 @@ S_regmatch(pTHX_ regnode *prog) sw = (*PL_reglastparen >= n && PL_regendp[n] != -1); break; case IFTHEN: + PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (sw) next = NEXTOPER(NEXTOPER(scan)); else { @@ -3390,7 +2254,7 @@ S_regmatch(pTHX_ regnode *prog) /* * This is really hard to understand, because after we match * what we're trying to match, we must make sure the rest of - * the RE is going to match for sure, and to do that we have + * the REx is going to match for sure, and to do that we have * to go back UP the parse tree by recursing ever deeper. And * if it fails, we have to reset our parent's current state * that we can try again after backing off. @@ -3450,6 +2314,51 @@ S_regmatch(pTHX_ regnode *prog) sayNO; } + if (scan->flags) { + /* Check whether we already were at this position. + Postpone detection until we know the match is not + *that* much linear. */ + if (!PL_reg_maxiter) { + PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4); + PL_reg_leftiter = PL_reg_maxiter; + } + if (PL_reg_leftiter-- == 0) { + I32 size = (PL_reg_maxiter + 7)/8; + if (PL_reg_poscache) { + if (PL_reg_poscache_size < size) { + Renew(PL_reg_poscache, size, char); + PL_reg_poscache_size = size; + } + Zero(PL_reg_poscache, size, char); + } + else { + PL_reg_poscache_size = size; + Newz(29, PL_reg_poscache, size, char); + } + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "%sDetected a super-linear match, switching on caching%s...\n", + PL_colors[4], PL_colors[5]) + ); + } + if (PL_reg_leftiter < 0) { + I32 o = locinput - PL_bostr, b; + + o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4); + b = o % 8; + o /= 8; + if (PL_reg_poscache[o] & (1<minmod) { @@ -4455,3 +3364,25 @@ S_reghopmaybe(pTHX_ U8* s, I32 off) } return s; } + +#ifdef PERL_OBJECT +#define NO_XSLOCKS +#include "XSUB.h" +#endif + +static void +restore_pos(pTHXo_ void *arg) +{ + dTHR; + if (PL_reg_eval_set) { + if (PL_reg_oldsaved) { + PL_reg_re->subbeg = PL_reg_oldsaved; + PL_reg_re->sublen = PL_reg_oldsavedlen; + RX_MATCH_COPIED_on(PL_reg_re); + } + PL_reg_magic->mg_len = PL_reg_oldpos; + PL_reg_eval_set = 0; + PL_curpm = PL_reg_oldcurpm; + } +} +