From: Jarkko Hietaniemi Date: Sun, 14 Nov 1999 21:17:26 +0000 (+0000) Subject: Ilya's "hopscotch" patch, reworked by Ilya to fit. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=6eb5f6b9f48454b7ad64225a5bab0de7fdff695c;p=p5sagit%2Fp5-mst-13.2.git Ilya's "hopscotch" patch, reworked by Ilya to fit. p4raw-id: //depot/cfgperl@4586 --- diff --git a/embed.h b/embed.h index 3307585..5242cec 100644 --- a/embed.h +++ b/embed.h @@ -940,6 +940,7 @@ #define cache_re S_cache_re #define reghop S_reghop #define reghopmaybe S_reghopmaybe +#define find_byclass S_find_byclass #endif #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT) #define debprof S_debprof @@ -2307,6 +2308,7 @@ #define cache_re(a) S_cache_re(aTHX_ a) #define reghop(a,b) S_reghop(aTHX_ a,b) #define reghopmaybe(a,b) S_reghopmaybe(aTHX_ a,b) +#define find_byclass(a,b,c,d,e,f) S_find_byclass(aTHX_ a,b,c,d,e,f) #endif #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT) #define debprof(a) S_debprof(aTHX_ a) @@ -4516,6 +4518,8 @@ #define reghop S_reghop #define S_reghopmaybe CPerlObj::S_reghopmaybe #define reghopmaybe S_reghopmaybe +#define S_find_byclass CPerlObj::S_find_byclass +#define find_byclass S_find_byclass #endif #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT) #define S_debprof CPerlObj::S_debprof diff --git a/embed.pl b/embed.pl index 07bed66..78a071a 100755 --- a/embed.pl +++ b/embed.pl @@ -1969,6 +1969,7 @@ s |char*|regcp_set_to |I32 ss s |void |cache_re |regexp *prog s |U8* |reghop |U8 *pos|I32 off s |U8* |reghopmaybe |U8 *pos|I32 off +s |char* |find_byclass |regexp * prog|regnode *c|char *s|char *strend|char *startpos|I32 norun #endif #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT) diff --git a/proto.h b/proto.h index 5daeb90..c5bfb92 100644 --- a/proto.h +++ b/proto.h @@ -916,6 +916,7 @@ STATIC char* S_regcp_set_to(pTHX_ I32 ss); STATIC void S_cache_re(pTHX_ regexp *prog); STATIC U8* S_reghop(pTHX_ U8 *pos, I32 off); STATIC U8* S_reghopmaybe(pTHX_ U8 *pos, I32 off); +STATIC char* S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun); #endif #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT) STATIC void S_debprof(pTHX_ OP *o); diff --git a/regexec.c b/regexec.c index fa891c8..37ef5b1 100644 --- a/regexec.c +++ b/regexec.c @@ -254,6 +254,9 @@ S_cache_re(pTHX_ regexp *prog) PL_reg_re = prog; } +static char *find_byclass(regexp * prog, regnode *c, char *s, char *strend, + char *startpos, I32 norun); + /* * Need to implement the following flags for reg_anch: * @@ -275,6 +278,13 @@ S_cache_re(pTHX_ regexp *prog) /* XXXX We assume that strpos is strbeg unless sv. */ +/* XXXX Some places assume that there is a fixed substring. + An update may be needed if optimizer marks as "INTUITable" + RExen without fixed substrings. Similarly, it is assumed that + lengths of all the strings are no more than minlen, thus they + cannot come from lookahead. + (Or minlen should take into account lookahead.) */ + /* A failure to find a constant substring means that there is no need to make an expensive call to REx engine, thus we celebrate a failure. Similarly, finding a substring too deep into the string means that less calls to @@ -285,10 +295,14 @@ S_cache_re(pTHX_ regexp *prog) b) Fixed substring; c) Whether we are anchored (beginning-of-line or \G); d) First node (of those at offset 0) which may distingush positions; - We use 'a', 'b', multiline-part of 'c', and try to find a position in the + We use a)b)d) and multiline-part of c), and try to find a position in the string which does not contradict any of them. */ +/* Most of decisions we do here should have been done at compile time. + The nodes of the REx which we used for the search should have been + deleted from the finite automaton. */ + char * Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, char *strend, U32 flags, re_scream_pos_data *data) @@ -301,7 +315,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, char *t; I32 ml_anch; char *tmp; - register char *other_last = Nullch; + register char *other_last = Nullch; /* other substr checked before this */ + char *check_at; /* check substr found at this pos */ #ifdef DEBUGGING char *i_strpos = strpos; #endif @@ -432,6 +447,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (!s) goto fail_finish; + check_at = s; + /* Finish the diagnostic message */ DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); @@ -447,7 +464,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, /* Take into account the "other" substring. */ /* XXXX May be hopelessly wrong for UTF... */ if (!other_last) - other_last = strpos - 1; + other_last = strpos; if (check == prog->float_substr) { do_other_anchored: { @@ -465,8 +482,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, else t = strpos; t += prog->anchored_offset; - if (t <= other_last) - t = other_last + 1; + if (t < other_last) /* These positions already checked */ + t = other_last; PL_bostr = tmp; last2 = last1 = strend - prog->minlen; if (last < last1) @@ -495,7 +512,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, ", trying floating at offset %ld...\n", (long)(s1 + 1 - i_strpos))); PL_regeol = strend; /* Used in HOP() */ - other_last = last1 + prog->anchored_offset; + other_last = last1 + prog->anchored_offset + 1; s = HOPc(last, 1); goto restart; } @@ -503,7 +520,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", (long)(s - i_strpos))); t = s - prog->anchored_offset; - other_last = s - 1; + other_last = s + 1; s = s1; if (t == strpos) goto try_at_start; @@ -520,8 +537,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (last - t > prog->float_max_offset) last = t + prog->float_max_offset; s = t + prog->float_min_offset; - if (s <= other_last) - s = other_last + 1; + if (s < other_last) + s = other_last; /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ /* fbm_instr() takes into account exact value of end-of-str if the check is SvTAIL(ed). Since false positives are OK, @@ -546,7 +563,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, DEBUG_r(PerlIO_printf(Perl_debug_log, ", trying anchored starting at offset %ld...\n", (long)(s1 + 1 - i_strpos))); - other_last = last; + other_last = last + 1; PL_regeol = strend; /* Used in HOP() */ s = HOPc(t, 1); goto restart; @@ -554,7 +571,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, else { DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", (long)(s - i_strpos))); - other_last = s - 1; + other_last = s + 1; s = s1; if (t == strpos) goto try_at_start; @@ -645,13 +662,79 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, s = strpos; /* XXXX This is a remnant of the old implementation. It looks wasteful, since now INTUIT can use many - other heuristics too. */ + other heuristics. */ prog->reganch &= ~RE_USE_INTUIT; } else s = strpos; } + /* Last resort... */ + /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ + if (prog->regstclass) { + /* minlen == 0 is possible if regstclass is \b or \B, + and the fixed substr is ''$. + Since minlen is already taken into account, s+1 is before strend; + accidentally, minlen >= 1 guaranties no false positives at s + 1 + even for \b or \B. But (minlen? 1 : 0) below assumes that + regstclass does not come from lookahead... */ + /* If regstclass takes bytelength more than 1: If charlength==1, OK. + This leaves EXACTF only, which is dealt with in find_byclass(). */ + char *endpos = (prog->anchored_substr || ml_anch) + ? s + (prog->minlen? 1 : 0) + : (prog->float_substr ? check_at - start_shift + 1 + : strend) ; + char *startpos = sv ? strend - SvCUR(sv) : s; + + t = s; + s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1); + if (!s) { +#ifdef DEBUGGING + char *what; +#endif + if (endpos == strend) { + DEBUG_r( PerlIO_printf(Perl_debug_log, + "Could not match STCLASS...\n") ); + goto fail; + } + /* Contradict one of substrings */ + if (prog->anchored_substr) { + DEBUG_r( PerlIO_printf(Perl_debug_log, + "This position contradicts STCLASS...\n") ); + if (prog->anchored_substr == check) { + DEBUG_r( what = "anchored" ); + hop_and_restart: + PL_regeol = strend; /* Used in HOP() */ + s = HOPc(t, 1); + DEBUG_r( PerlIO_printf(Perl_debug_log, + "trying %s substr starting at offset %ld...\n", + what, (long)(s + start_shift - i_strpos)) ); + goto restart; + } + /* Have both, check is floating */ + if (t + start_shift >= check_at) /* Contradicts floating=check */ + goto retry_floating_check; + /* Recheck anchored substring, but not floating... */ + s = check_at; + DEBUG_r( PerlIO_printf(Perl_debug_log, + "trying anchored substr starting at offset %ld...\n", + (long)(other_last - i_strpos)) ); + goto do_other_anchored; + } + /* Check is floating subtring. */ + retry_floating_check: + t = check_at - start_shift; + DEBUG_r( what = "floating" ); + goto hop_and_restart; + } + DEBUG_r( if (t != s) + PerlIO_printf(Perl_debug_log, + "By STCLASS: moving %ld --> %ld\n", + (long)(t - i_strpos), (long)(s - i_strpos)); + else + PerlIO_printf(Perl_debug_log, + "Does not contradict STCLASS...\n") ); + } DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n", PL_colors[4], PL_colors[5], (long)(s - i_strpos)) ); return s; @@ -664,318 +747,62 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, return Nullch; } -/* - - regexec_flags - match a regexp against a string - */ -I32 -Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) -/* strend: pointer to null at end of string */ -/* strbeg: real beginning of string */ -/* minend: end of match must be >=minend after stringarg. */ -/* data: May be used for some additional optimizations. */ -/* nosave: For optimizations. */ +/* We know what class REx starts with. Try to find this position... */ +static char * +find_byclass(regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun) { - dTHR; - register char *s; - register regnode *c; - register char *startpos = stringarg; - register I32 tmp; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 start_shift = 0; /* Offset of the start to find - constant substr. */ /* CC */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ - I32 scream_pos = -1; /* Internal iterator of scream. */ - char *scream_olds; - SV* oreplsv = GvSV(PL_replgv); - - PL_regcc = 0; - - cache_re(prog); -#ifdef DEBUGGING - PL_regnarrate = PL_debug & 512; -#endif - - /* Be paranoid... */ - if (prog == NULL || startpos == NULL) { - Perl_croak(aTHX_ "NULL regexp parameter"); - return 0; - } - - minlen = prog->minlen; - if (strend - startpos < minlen) goto phooey; - - if (startpos == strbeg) /* is ^ valid at stringarg? */ - PL_regprev = '\n'; - else { - PL_regprev = (U32)stringarg[-1]; - if (!PL_multiline && PL_regprev == '\n') - PL_regprev = '\0'; /* force ^ to NOT match */ - } - - /* Check validity of program. */ - if (UCHARAT(prog->program) != REG_MAGIC) { - Perl_croak(aTHX_ "corrupted regexp program"); - } - - PL_reg_flags = 0; - PL_reg_eval_set = 0; - PL_reg_maxiter = 0; - - if (prog->reganch & ROPT_UTF8) - PL_reg_flags |= RF_utf8; - - /* Mark beginning of line for ^ and lookbehind. */ - PL_regbol = startpos; - PL_bostr = strbeg; - PL_reg_sv = sv; - - /* Mark end of line for $ (and such) */ - PL_regeol = strend; - - /* see how far we have to get to not match where we matched before */ - PL_regtill = startpos+minend; - - /* We start without call_cc context. */ - PL_reg_call_cc = 0; - - /* If there is a "must appear" string, look for it. */ - s = startpos; - - if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */ - MAGIC *mg; - - if (flags & REXEC_IGNOREPOS) /* Means: check only at start */ - PL_reg_ganch = startpos; - else if (sv && SvTYPE(sv) >= SVt_PVMG - && SvMAGIC(sv) - && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) { - PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */ - if (prog->reganch & ROPT_ANCH_GPOS) { - if (s > PL_reg_ganch) - goto phooey; - s = PL_reg_ganch; - } - } - else /* pos() not defined */ - PL_reg_ganch = strbeg; - } - - if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) { - re_scream_pos_data d; - - d.scream_olds = &scream_olds; - d.scream_pos = &scream_pos; - s = re_intuit_start(prog, sv, s, strend, flags, &d); - if (!s) - goto phooey; /* not present */ - } - - DEBUG_r( if (!PL_colorset) reginitcolors() ); - DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], - prog->precomp, - PL_colors[1], - (strlen(prog->precomp) > 60 ? "..." : ""), - PL_colors[0], - (int)(strend - startpos > 60 ? 60 : strend - startpos), - startpos, PL_colors[1], - (strend - startpos > 60 ? "..." : "")) - ); - - /* Simplest case: anchored match need be tried only once. */ - /* [unless only anchor is BOL and multiline is set] */ - if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) { - if (s == startpos && regtry(prog, startpos)) - goto got_it; - else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT) - || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */ - { - char *end; + I32 doevery = (prog->reganch & ROPT_SKIP) == 0; + char *m; + int ln; + int c1; + int c2; + char *e; + register I32 tmp = 1; /* Scratch variable? */ - if (minlen) - dontbother = minlen - 1; - end = HOPc(strend, -dontbother) - 1; - /* for multiline we only have to try after newlines */ - if (prog->check_substr) { - if (s == startpos) - goto after_try; - while (1) { - if (regtry(prog, s)) + /* We know what class it must start with. */ + switch (OP(c)) { + case ANYOFUTF8: + while (s < strend) { + if (REGINCLASSUTF8(c, (U8*)s)) { + if (tmp && (norun || regtry(prog, s))) goto got_it; - after_try: - if (s >= end) - goto phooey; - if (prog->reganch & RE_USE_INTUIT) { - s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL); - if (!s) - goto phooey; - } else - s++; - } - } else { - if (s > startpos) - s--; - while (s < end) { - if (*s++ == '\n') { /* don't need PL_utf8skip here */ - if (regtry(prog, s)) - goto got_it; - } - } - } - } - goto phooey; - } else if (prog->reganch & ROPT_ANCH_GPOS) { - if (regtry(prog, PL_reg_ganch)) - goto got_it; - goto phooey; - } - - /* Messy cases: unanchored match. */ - if (prog->anchored_substr && prog->reganch & ROPT_SKIP) { - /* we have /x+whatever/ */ - /* it must be a one character string (XXXX Except UTF?) */ - char ch = SvPVX(prog->anchored_substr)[0]; - if (UTF) { - while (s < strend) { - if (*s == ch) { - if (regtry(prog, s)) goto got_it; - s += UTF8SKIP(s); - while (s < strend && *s == ch) - s += UTF8SKIP(s); + tmp = doevery; } + else + tmp = 1; s += UTF8SKIP(s); } - } - else { + break; + case ANYOF: while (s < strend) { - if (*s == ch) { - if (regtry(prog, s)) goto got_it; - s++; - while (s < strend && *s == ch) - s++; + if (REGINCLASS(c, *s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; } + else + tmp = 1; s++; } - } - } - /*SUPPRESS 560*/ - else if (prog->anchored_substr != Nullsv - || (prog->float_substr != Nullsv - && prog->float_max_offset < strend - s)) { - SV *must = prog->anchored_substr - ? prog->anchored_substr : prog->float_substr; - I32 back_max = - prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset; - I32 back_min = - prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset; - I32 delta = back_max - back_min; - char *last = HOPc(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) - - (SvTAIL(must) != 0) + back_min)); - char *last1; /* Last position checked before */ - - if (s > PL_bostr) - last1 = HOPc(s, -1); - else - last1 = s - 1; /* bogus */ - - /* XXXX check_substr already used to find `s', can optimize if - check_substr==must. */ - scream_pos = -1; - dontbother = end_shift; - strend = HOPc(strend, -dontbother); - while ( (s <= last) && - ((flags & REXEC_SCREAM) - ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg, - end_shift, &scream_pos, 0)) - : (s = fbm_instr((unsigned char*)HOP(s, back_min), - (unsigned char*)strend, must, - PL_multiline ? FBMrf_MULTILINE : 0))) ) { - if (HOPc(s, -back_max) > last1) { - last1 = HOPc(s, -back_min); - s = HOPc(s, -back_max); - } - else { - char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1; - - last1 = HOPc(s, -back_min); - s = t; - } - if (UTF) { - while (s <= last1) { - if (regtry(prog, s)) - goto got_it; - s += UTF8SKIP(s); - } - } - else { - while (s <= last1) { - if (regtry(prog, s)) - goto got_it; - s++; - } - } - } - goto phooey; - } - else if (c = prog->regstclass) { - I32 doevery = (prog->reganch & ROPT_SKIP) == 0; - char *m; - int ln; - int c1; - int c2; - char *e; - - if (minlen) - dontbother = minlen - 1; - strend = HOPc(strend, -dontbother); /* don't bother with what can't match */ - tmp = 1; - /* We know what class it must start with. */ - switch (OP(c)) { - case ANYOFUTF8: - while (s < strend) { - if (REGINCLASSUTF8(c, (U8*)s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; - case ANYOF: - while (s < strend) { - if (REGINCLASS(c, *s)) { - if (tmp && regtry(prog, s)) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s++; - } - break; - case EXACTF: - m = STRING(c); - ln = STR_LEN(c); - c1 = *m; - c2 = PL_fold[c1]; - goto do_exactf; - case EXACTFL: - m = STRING(c); - ln = STR_LEN(c); - c1 = *m; - c2 = PL_fold_locale[c1]; - do_exactf: - e = strend - ln; + break; + case EXACTF: + m = STRING(c); + ln = STR_LEN(c); + c1 = *m; + c2 = PL_fold[c1]; + goto do_exactf; + case EXACTFL: + m = STRING(c); + ln = STR_LEN(c); + c1 = *m; + c2 = PL_fold_locale[c1]; + do_exactf: + e = strend - ln; + if (norun && e < s) + e = s; /* Due to minlen logic of intuit() */ /* Here it is NOT UTF! */ if (c1 == c2) { while (s <= e) { @@ -983,7 +810,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * && (ln == 1 || (OP(c) == EXACTF ? ibcmp(s, m, ln) : ibcmp_locale(s, m, ln))) - && regtry(prog, s) ) + && (norun || regtry(prog, s)) ) goto got_it; s++; } @@ -993,7 +820,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * && (ln == 1 || (OP(c) == EXACTF ? ibcmp(s, m, ln) : ibcmp_locale(s, m, ln))) - && regtry(prog, s) ) + && (norun || regtry(prog, s)) ) goto got_it; s++; } @@ -1003,32 +830,24 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: - if (minlen) { - dontbother++; - strend -= 1; - } - tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev; + tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { tmp = !tmp; - if (regtry(prog, s)) + if ((norun || regtry(prog, s))) goto got_it; } s++; } - if ((minlen || tmp) && regtry(prog,s)) + if ((!prog->minlen && tmp) && (norun || regtry(prog, s))) goto got_it; break; case BOUNDLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUNDUTF8: - if (minlen) { - dontbother++; - strend = reghop_c(strend, -1); - } - tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev; + tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n'; tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == BOUND ? @@ -1036,60 +855,54 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * isALNUM_LC_utf8((U8*)s))) { tmp = !tmp; - if (regtry(prog, s)) + if ((norun || regtry(prog, s))) goto got_it; } s += UTF8SKIP(s); } - if ((minlen || tmp) && regtry(prog,s)) + if ((!prog->minlen && tmp) && (norun || regtry(prog, s))) goto got_it; break; case NBOUNDL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUND: - if (minlen) { - dontbother++; - strend -= 1; - } - tmp = (s != startpos) ? UCHARAT(s - 1) : PL_regprev; + tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) tmp = !tmp; - else if (regtry(prog, s)) + else if ((norun || regtry(prog, s))) goto got_it; s++; } - if ((minlen || !tmp) && regtry(prog,s)) + if ((!prog->minlen && !tmp) && (norun || regtry(prog, s))) goto got_it; break; case NBOUNDLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUNDUTF8: - if (minlen) { - dontbother++; + if (prog->minlen) strend = reghop_c(strend, -1); - } - tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : PL_regprev; + tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n'; tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == NBOUND ? swash_fetch(PL_utf8_alnum, (U8*)s) : isALNUM_LC_utf8((U8*)s))) tmp = !tmp; - else if (regtry(prog, s)) + else if ((norun || regtry(prog, s))) goto got_it; s += UTF8SKIP(s); } - if ((minlen || !tmp) && regtry(prog,s)) + if ((!prog->minlen && !tmp) && (norun || regtry(prog, s))) goto got_it; break; case ALNUM: while (s < strend) { if (isALNUM(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1102,7 +915,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case ALNUMUTF8: while (s < strend) { if (swash_fetch(PL_utf8_alnum, (U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1116,7 +929,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (isALNUM_LC(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1130,7 +943,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (isALNUM_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1143,7 +956,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case NALNUM: while (s < strend) { if (!isALNUM(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1156,7 +969,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case NALNUMUTF8: while (s < strend) { if (!swash_fetch(PL_utf8_alnum, (U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1170,7 +983,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (!isALNUM_LC(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1184,7 +997,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (!isALNUM_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1197,7 +1010,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case SPACE: while (s < strend) { if (isSPACE(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1210,7 +1023,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case SPACEUTF8: while (s < strend) { if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1224,7 +1037,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (isSPACE_LC(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1238,7 +1051,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1251,7 +1064,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case NSPACE: while (s < strend) { if (!isSPACE(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1264,7 +1077,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case NSPACEUTF8: while (s < strend) { if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1278,7 +1091,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (!isSPACE_LC(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1292,7 +1105,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1305,7 +1118,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case DIGIT: while (s < strend) { if (isDIGIT(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1318,7 +1131,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case DIGITUTF8: while (s < strend) { if (swash_fetch(PL_utf8_digit,(U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1332,7 +1145,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (isDIGIT_LC(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1346,7 +1159,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (isDIGIT_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1359,7 +1172,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case NDIGIT: while (s < strend) { if (!isDIGIT(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1372,7 +1185,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * case NDIGITUTF8: while (s < strend) { if (!swash_fetch(PL_utf8_digit,(U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1386,7 +1199,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (!isDIGIT_LC(*s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1400,7 +1213,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags |= RF_tainted; while (s < strend) { if (!isDIGIT_LC_utf8((U8*)s)) { - if (tmp && regtry(prog, s)) + if (tmp && (norun || regtry(prog, s))) goto got_it; else tmp = doevery; @@ -1411,9 +1224,280 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } break; default: - Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); - break; + { + dTHX; + Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); + break; + } + } + return 0; + got_it: + return s; +} + +/* + - regexec_flags - match a regexp against a string + */ +I32 +Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend, + char *strbeg, I32 minend, SV *sv, void *data, U32 flags) +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >=minend after stringarg. */ +/* data: May be used for some additional optimizations. */ +/* nosave: For optimizations. */ +{ + dTHR; + register char *s; + register regnode *c; + register char *startpos = stringarg; + register I32 tmp; + I32 minlen; /* must match at least this many chars */ + I32 dontbother = 0; /* how many characters not to try at end */ + I32 start_shift = 0; /* Offset of the start to find + constant substr. */ /* CC */ + I32 end_shift = 0; /* Same for the end. */ /* CC */ + I32 scream_pos = -1; /* Internal iterator of scream. */ + char *scream_olds; + SV* oreplsv = GvSV(PL_replgv); + + PL_regcc = 0; + + cache_re(prog); +#ifdef DEBUGGING + PL_regnarrate = PL_debug & 512; +#endif + + /* Be paranoid... */ + if (prog == NULL || startpos == NULL) { + Perl_croak(aTHX_ "NULL regexp parameter"); + return 0; + } + + minlen = prog->minlen; + if (strend - startpos < minlen) goto phooey; + + if (startpos == strbeg) /* is ^ valid at stringarg? */ + PL_regprev = '\n'; + else { + PL_regprev = (U32)stringarg[-1]; + if (!PL_multiline && PL_regprev == '\n') + PL_regprev = '\0'; /* force ^ to NOT match */ + } + + /* Check validity of program. */ + if (UCHARAT(prog->program) != REG_MAGIC) { + Perl_croak(aTHX_ "corrupted regexp program"); + } + + PL_reg_flags = 0; + PL_reg_eval_set = 0; + PL_reg_maxiter = 0; + + if (prog->reganch & ROPT_UTF8) + PL_reg_flags |= RF_utf8; + + /* Mark beginning of line for ^ and lookbehind. */ + PL_regbol = startpos; + PL_bostr = strbeg; + PL_reg_sv = sv; + + /* Mark end of line for $ (and such) */ + PL_regeol = strend; + + /* see how far we have to get to not match where we matched before */ + PL_regtill = startpos+minend; + + /* We start without call_cc context. */ + PL_reg_call_cc = 0; + + /* If there is a "must appear" string, look for it. */ + s = startpos; + + if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */ + MAGIC *mg; + + if (flags & REXEC_IGNOREPOS) /* Means: check only at start */ + PL_reg_ganch = startpos; + else if (sv && SvTYPE(sv) >= SVt_PVMG + && SvMAGIC(sv) + && (mg = mg_find(sv, 'g')) && mg->mg_len >= 0) { + PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */ + if (prog->reganch & ROPT_ANCH_GPOS) { + if (s > PL_reg_ganch) + goto phooey; + s = PL_reg_ganch; + } } + else /* pos() not defined */ + PL_reg_ganch = strbeg; + } + + if (!(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) { + re_scream_pos_data d; + + d.scream_olds = &scream_olds; + d.scream_pos = &scream_pos; + s = re_intuit_start(prog, sv, s, strend, flags, &d); + if (!s) + goto phooey; /* not present */ + } + + DEBUG_r( if (!PL_colorset) reginitcolors() ); + DEBUG_r(PerlIO_printf(Perl_debug_log, + "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], + prog->precomp, + PL_colors[1], + (strlen(prog->precomp) > 60 ? "..." : ""), + PL_colors[0], + (int)(strend - startpos > 60 ? 60 : strend - startpos), + startpos, PL_colors[1], + (strend - startpos > 60 ? "..." : "")) + ); + + /* Simplest case: anchored match need be tried only once. */ + /* [unless only anchor is BOL and multiline is set] */ + if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) { + if (s == startpos && regtry(prog, startpos)) + goto got_it; + else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT) + || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */ + { + char *end; + + if (minlen) + dontbother = minlen - 1; + end = HOPc(strend, -dontbother) - 1; + /* for multiline we only have to try after newlines */ + if (prog->check_substr) { + if (s == startpos) + goto after_try; + while (1) { + if (regtry(prog, s)) + goto got_it; + after_try: + if (s >= end) + goto phooey; + if (prog->reganch & RE_USE_INTUIT) { + s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL); + if (!s) + goto phooey; + } + else + s++; + } + } else { + if (s > startpos) + s--; + while (s < end) { + if (*s++ == '\n') { /* don't need PL_utf8skip here */ + if (regtry(prog, s)) + goto got_it; + } + } + } + } + goto phooey; + } else if (prog->reganch & ROPT_ANCH_GPOS) { + if (regtry(prog, PL_reg_ganch)) + goto got_it; + goto phooey; + } + + /* Messy cases: unanchored match. */ + if (prog->anchored_substr && prog->reganch & ROPT_SKIP) { + /* we have /x+whatever/ */ + /* it must be a one character string (XXXX Except UTF?) */ + char ch = SvPVX(prog->anchored_substr)[0]; + if (UTF) { + while (s < strend) { + if (*s == ch) { + if (regtry(prog, s)) goto got_it; + s += UTF8SKIP(s); + while (s < strend && *s == ch) + s += UTF8SKIP(s); + } + s += UTF8SKIP(s); + } + } + else { + while (s < strend) { + if (*s == ch) { + if (regtry(prog, s)) goto got_it; + s++; + while (s < strend && *s == ch) + s++; + } + s++; + } + } + } + /*SUPPRESS 560*/ + else if (prog->anchored_substr != Nullsv + || (prog->float_substr != Nullsv + && prog->float_max_offset < strend - s)) { + SV *must = prog->anchored_substr + ? prog->anchored_substr : prog->float_substr; + I32 back_max = + prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset; + I32 back_min = + prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset; + I32 delta = back_max - back_min; + char *last = HOPc(strend, /* Cannot start after this */ + -(I32)(CHR_SVLEN(must) + - (SvTAIL(must) != 0) + back_min)); + char *last1; /* Last position checked before */ + + if (s > PL_bostr) + last1 = HOPc(s, -1); + else + last1 = s - 1; /* bogus */ + + /* XXXX check_substr already used to find `s', can optimize if + check_substr==must. */ + scream_pos = -1; + dontbother = end_shift; + strend = HOPc(strend, -dontbother); + while ( (s <= last) && + ((flags & REXEC_SCREAM) + ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg, + end_shift, &scream_pos, 0)) + : (s = fbm_instr((unsigned char*)HOP(s, back_min), + (unsigned char*)strend, must, + PL_multiline ? FBMrf_MULTILINE : 0))) ) { + if (HOPc(s, -back_max) > last1) { + last1 = HOPc(s, -back_min); + s = HOPc(s, -back_max); + } + else { + char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1; + + last1 = HOPc(s, -back_min); + s = t; + } + if (UTF) { + while (s <= last1) { + if (regtry(prog, s)) + goto got_it; + s += UTF8SKIP(s); + } + } + else { + while (s <= last1) { + if (regtry(prog, s)) + goto got_it; + s++; + } + } + } + goto phooey; + } + else if (c = prog->regstclass) { + if (minlen) /* don't bother with what can't match */ + strend = HOPc(strend, -(minlen - 1)); + if (find_byclass(prog, c, s, strend, startpos, 0)) + goto got_it; } else { dontbother = 0; diff --git a/t/op/re_tests b/t/op/re_tests index d72a0f7..f866385 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -742,3 +742,5 @@ tt+$ xxxtt y - - ([[:digit:]-z]+) =0-z= y $1 0-z ([[:digit:]-[:alpha:]]+) =0-z= y $1 0-z \GX.*X aaaXbX n - - +(\d+\.\d+) 3.1415926 y $1 3.1415926 +(\ba.{0,10}br) have a web browser y $1 a web br