From: Ilya Zakharevich Date: Sat, 31 Jul 1999 05:13:38 +0000 (-0400) Subject: More optimizations to REx engine X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=82ba1be6639bfd31cc63b76f90d26dc1dafd9221;p=p5sagit%2Fp5-mst-13.2.git More optimizations to REx engine Message-Id: <199907311407.IAA25038@localhost.frii.com> p4raw-id: //depot/perl@3857 --- diff --git a/embed.pl b/embed.pl index 6260550..661a1ac 100755 --- a/embed.pl +++ b/embed.pl @@ -1932,9 +1932,10 @@ s |char*|regwhite |char *|char * s |char*|nextchar s |regnode*|dumpuntil |regnode *start|regnode *node \ |regnode *last|SV* sv|I32 l -s |void |scan_commit |scan_data_t *data +s |void |scan_commit |struct scan_data_t *data s |I32 |study_chunk |regnode **scanp|I32 *deltap \ - |regnode *last|scan_data_t *data|U32 flags + |regnode *last|struct scan_data_t *data \ + |U32 flags s |I32 |add_data |I32 n|char *s rs |void|re_croak2 |const char* pat1|const char* pat2|... s |I32 |regpposixcc |I32 value diff --git a/embedvar.h b/embedvar.h index 42d96de..39bf22b 100644 --- a/embedvar.h +++ b/embedvar.h @@ -63,16 +63,21 @@ #define PL_reg_eval_set (my_perl->Treg_eval_set) #define PL_reg_flags (my_perl->Treg_flags) #define PL_reg_ganch (my_perl->Treg_ganch) +#define PL_reg_leftiter (my_perl->Treg_leftiter) #define PL_reg_magic (my_perl->Treg_magic) +#define PL_reg_maxiter (my_perl->Treg_maxiter) #define PL_reg_oldcurpm (my_perl->Treg_oldcurpm) #define PL_reg_oldpos (my_perl->Treg_oldpos) #define PL_reg_oldsaved (my_perl->Treg_oldsaved) #define PL_reg_oldsavedlen (my_perl->Treg_oldsavedlen) +#define PL_reg_poscache (my_perl->Treg_poscache) +#define PL_reg_poscache_size (my_perl->Treg_poscache_size) #define PL_reg_re (my_perl->Treg_re) #define PL_reg_start_tmp (my_perl->Treg_start_tmp) #define PL_reg_start_tmpl (my_perl->Treg_start_tmpl) #define PL_reg_starttry (my_perl->Treg_starttry) #define PL_reg_sv (my_perl->Treg_sv) +#define PL_reg_whilem_seen (my_perl->Treg_whilem_seen) #define PL_regbol (my_perl->Tregbol) #define PL_regcc (my_perl->Tregcc) #define PL_regcode (my_perl->Tregcode) @@ -193,16 +198,21 @@ #define PL_reg_eval_set (PERL_GET_INTERP->Treg_eval_set) #define PL_reg_flags (PERL_GET_INTERP->Treg_flags) #define PL_reg_ganch (PERL_GET_INTERP->Treg_ganch) +#define PL_reg_leftiter (PERL_GET_INTERP->Treg_leftiter) #define PL_reg_magic (PERL_GET_INTERP->Treg_magic) +#define PL_reg_maxiter (PERL_GET_INTERP->Treg_maxiter) #define PL_reg_oldcurpm (PERL_GET_INTERP->Treg_oldcurpm) #define PL_reg_oldpos (PERL_GET_INTERP->Treg_oldpos) #define PL_reg_oldsaved (PERL_GET_INTERP->Treg_oldsaved) #define PL_reg_oldsavedlen (PERL_GET_INTERP->Treg_oldsavedlen) +#define PL_reg_poscache (PERL_GET_INTERP->Treg_poscache) +#define PL_reg_poscache_size (PERL_GET_INTERP->Treg_poscache_size) #define PL_reg_re (PERL_GET_INTERP->Treg_re) #define PL_reg_start_tmp (PERL_GET_INTERP->Treg_start_tmp) #define PL_reg_start_tmpl (PERL_GET_INTERP->Treg_start_tmpl) #define PL_reg_starttry (PERL_GET_INTERP->Treg_starttry) #define PL_reg_sv (PERL_GET_INTERP->Treg_sv) +#define PL_reg_whilem_seen (PERL_GET_INTERP->Treg_whilem_seen) #define PL_regbol (PERL_GET_INTERP->Tregbol) #define PL_regcc (PERL_GET_INTERP->Tregcc) #define PL_regcode (PERL_GET_INTERP->Tregcode) @@ -864,16 +874,21 @@ #define PL_Treg_eval_set PL_reg_eval_set #define PL_Treg_flags PL_reg_flags #define PL_Treg_ganch PL_reg_ganch +#define PL_Treg_leftiter PL_reg_leftiter #define PL_Treg_magic PL_reg_magic +#define PL_Treg_maxiter PL_reg_maxiter #define PL_Treg_oldcurpm PL_reg_oldcurpm #define PL_Treg_oldpos PL_reg_oldpos #define PL_Treg_oldsaved PL_reg_oldsaved #define PL_Treg_oldsavedlen PL_reg_oldsavedlen +#define PL_Treg_poscache PL_reg_poscache +#define PL_Treg_poscache_size PL_reg_poscache_size #define PL_Treg_re PL_reg_re #define PL_Treg_start_tmp PL_reg_start_tmp #define PL_Treg_start_tmpl PL_reg_start_tmpl #define PL_Treg_starttry PL_reg_starttry #define PL_Treg_sv PL_reg_sv +#define PL_Treg_whilem_seen PL_reg_whilem_seen #define PL_Tregbol PL_regbol #define PL_Tregcc PL_regcc #define PL_Tregcode PL_regcode @@ -1005,16 +1020,21 @@ #define PL_reg_eval_set (thr->Treg_eval_set) #define PL_reg_flags (thr->Treg_flags) #define PL_reg_ganch (thr->Treg_ganch) +#define PL_reg_leftiter (thr->Treg_leftiter) #define PL_reg_magic (thr->Treg_magic) +#define PL_reg_maxiter (thr->Treg_maxiter) #define PL_reg_oldcurpm (thr->Treg_oldcurpm) #define PL_reg_oldpos (thr->Treg_oldpos) #define PL_reg_oldsaved (thr->Treg_oldsaved) #define PL_reg_oldsavedlen (thr->Treg_oldsavedlen) +#define PL_reg_poscache (thr->Treg_poscache) +#define PL_reg_poscache_size (thr->Treg_poscache_size) #define PL_reg_re (thr->Treg_re) #define PL_reg_start_tmp (thr->Treg_start_tmp) #define PL_reg_start_tmpl (thr->Treg_start_tmpl) #define PL_reg_starttry (thr->Treg_starttry) #define PL_reg_sv (thr->Treg_sv) +#define PL_reg_whilem_seen (thr->Treg_whilem_seen) #define PL_regbol (thr->Tregbol) #define PL_regcc (thr->Tregcc) #define PL_regcode (thr->Tregcode) diff --git a/ext/Thread/Thread.xs b/ext/Thread/Thread.xs index 4043a02..ad99e2c 100644 --- a/ext/Thread/Thread.xs +++ b/ext/Thread/Thread.xs @@ -180,6 +180,7 @@ threadstart(void *arg) Safefree(PL_reg_start_tmp); SvREFCNT_dec(PL_lastscream); SvREFCNT_dec(PL_defoutgv); + Safefree(PL_reg_poscache); MUTEX_LOCK(&thr->mutex); DEBUG_S(PerlIO_printf(PerlIO_stderr(), diff --git a/objXSUB.h b/objXSUB.h index 7ae62f3..c3faf68 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -636,8 +636,12 @@ #define PL_reg_flags (*Perl_Treg_flags_ptr(aTHXo)) #undef PL_reg_ganch #define PL_reg_ganch (*Perl_Treg_ganch_ptr(aTHXo)) +#undef PL_reg_leftiter +#define PL_reg_leftiter (*Perl_Treg_leftiter_ptr(aTHXo)) #undef PL_reg_magic #define PL_reg_magic (*Perl_Treg_magic_ptr(aTHXo)) +#undef PL_reg_maxiter +#define PL_reg_maxiter (*Perl_Treg_maxiter_ptr(aTHXo)) #undef PL_reg_oldcurpm #define PL_reg_oldcurpm (*Perl_Treg_oldcurpm_ptr(aTHXo)) #undef PL_reg_oldpos @@ -646,6 +650,10 @@ #define PL_reg_oldsaved (*Perl_Treg_oldsaved_ptr(aTHXo)) #undef PL_reg_oldsavedlen #define PL_reg_oldsavedlen (*Perl_Treg_oldsavedlen_ptr(aTHXo)) +#undef PL_reg_poscache +#define PL_reg_poscache (*Perl_Treg_poscache_ptr(aTHXo)) +#undef PL_reg_poscache_size +#define PL_reg_poscache_size (*Perl_Treg_poscache_size_ptr(aTHXo)) #undef PL_reg_re #define PL_reg_re (*Perl_Treg_re_ptr(aTHXo)) #undef PL_reg_start_tmp @@ -656,6 +664,8 @@ #define PL_reg_starttry (*Perl_Treg_starttry_ptr(aTHXo)) #undef PL_reg_sv #define PL_reg_sv (*Perl_Treg_sv_ptr(aTHXo)) +#undef PL_reg_whilem_seen +#define PL_reg_whilem_seen (*Perl_Treg_whilem_seen_ptr(aTHXo)) #undef PL_regbol #define PL_regbol (*Perl_Tregbol_ptr(aTHXo)) #undef PL_regcc diff --git a/perl.c b/perl.c index 3a3505d..d811879 100644 --- a/perl.c +++ b/perl.c @@ -507,6 +507,7 @@ perl_destruct(pTHXx) Safefree(PL_reg_start_tmp); if (PL_reg_curpm) Safefree(PL_reg_curpm); + Safefree(PL_reg_poscache); Safefree(HeKEY_hek(&PL_hv_fetch_ent_mh)); Safefree(PL_op_mask); nuke_stacks(); diff --git a/perl.h b/perl.h index 0e43ee4..6891b37 100644 --- a/perl.h +++ b/perl.h @@ -1727,25 +1727,7 @@ struct _sublex_info { typedef struct magic_state MGS; /* struct magic_state defined in mg.c */ -/* Length of a variant. */ - -typedef struct { - I32 len_min; - I32 len_delta; - I32 pos_min; - I32 pos_delta; - SV *last_found; - I32 last_end; /* min value, <0 unless valid. */ - I32 last_start_min; - I32 last_start_max; - SV **longest; /* Either &l_fixed, or &l_float. */ - SV *longest_fixed; - I32 offset_fixed; - SV *longest_float; - I32 offset_float_min; - I32 offset_float_max; - I32 flags; -} scan_data_t; +struct scan_data_t; /* Used in S_* functions in regcomp.c */ typedef I32 CHECKPOINT; diff --git a/proto.h b/proto.h index 7bed4c7..90b2500 100644 --- a/proto.h +++ b/proto.h @@ -868,8 +868,8 @@ STATIC void S_regtail(pTHX_ regnode *, regnode *); STATIC char* S_regwhite(pTHX_ char *, char *); STATIC char* S_nextchar(pTHX); STATIC regnode* S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l); -STATIC void S_scan_commit(pTHX_ scan_data_t *data); -STATIC I32 S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags); +STATIC void S_scan_commit(pTHX_ struct scan_data_t *data); +STATIC I32 S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, struct scan_data_t *data, U32 flags); STATIC I32 S_add_data(pTHX_ I32 n, char *s); STATIC void S_re_croak2(pTHX_ const char* pat1, const char* pat2, ...) __attribute__((noreturn)); STATIC I32 S_regpposixcc(pTHX_ I32 value); diff --git a/regcomp.c b/regcomp.c index 2d81da1..fac31e6 100644 --- a/regcomp.c +++ b/regcomp.c @@ -132,12 +132,33 @@ #define SPSTART 0x4 /* Starts with * or +. */ #define TRYAGAIN 0x8 /* Weeded out a declaration. */ +/* Length of a variant. */ + +typedef struct scan_data_t { + I32 len_min; + I32 len_delta; + I32 pos_min; + I32 pos_delta; + SV *last_found; + I32 last_end; /* min value, <0 unless valid. */ + I32 last_start_min; + I32 last_start_max; + SV **longest; /* Either &l_fixed, or &l_float. */ + SV *longest_fixed; + I32 offset_fixed; + SV *longest_float; + I32 offset_float_min; + I32 offset_float_max; + I32 flags; + I32 whilem_c; +} scan_data_t; + /* * Forward declarations for pregcomp()'s friends. */ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0 }; + 0, 0, 0, 0 }; #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) #define SF_BEFORE_SEOL 0x1 @@ -328,6 +349,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da num++; data_fake.flags = 0; + if (data) + data_fake.whilem_c = data->whilem_c; next = regnext(scan); scan = NEXTOPER(scan); if (code != BRANCH) @@ -346,6 +369,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da pars++; if (data && (data_fake.flags & SF_HAS_EVAL)) data->flags |= SF_HAS_EVAL; + if (data) + data->whilem_c = data_fake.whilem_c; if (code == SUSPEND) break; } @@ -562,6 +587,16 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da else oscan->flags = 0; } + else if (OP(oscan) == CURLYX && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, and can put the count/of pair. */ + /* Find WHILEM (as in regexec.c) */ + regnode *nxt = oscan + NEXT_OFF(oscan); + + if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ + nxt += ARG(nxt); + PREVOPER(nxt)->flags = data->whilem_c + | (PL_reg_whilem_seen << 4); /* On WHILEM */ + } if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (flags & SCF_DO_SUBSTR) { @@ -653,6 +688,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da regnode *nscan; data_fake.flags = 0; + if (data) + data_fake.whilem_c = data->whilem_c; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); minnext = study_chunk(&nscan, &deltanext, last, &data_fake, 0); @@ -669,6 +706,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da pars++; if (data && (data_fake.flags & SF_HAS_EVAL)) data->flags |= SF_HAS_EVAL; + if (data) + data->whilem_c = data_fake.whilem_c; } else if (OP(scan) == OPEN) { pars++; @@ -787,6 +826,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) I32 minlen = 0; I32 sawplus = 0; I32 sawopen = 0; + scan_data_t data; if (exp == NULL) FAIL("NULL regexp argument"); @@ -798,7 +838,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PL_regprecomp = savepvn(exp, xend - exp); DEBUG_r(if (!PL_colorset) reginitcolors()); - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling%s RE `%s%*s%s'\n", + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], xend - exp, PL_regprecomp, PL_colors[1])); PL_regflags = pm->op_pmflags; @@ -816,6 +856,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PL_regnpar = 1; PL_regsize = 0L; PL_regcode = &PL_regdummy; + PL_reg_whilem_seen = 0; regc((U8)REG_MAGIC, (char*)PL_regcode); if (reg(0, &flags) == NULL) { Safefree(PL_regprecomp); @@ -830,6 +871,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PL_regsize += PL_extralen; else PL_extralen = 0; + if (PL_reg_whilem_seen > 15) + PL_reg_whilem_seen = 15; /* Allocate space and initialize. */ Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), @@ -876,12 +919,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) 3-units-long substrs field. */ Newz(1004, r->substrs, 1, struct reg_substr_data); + StructCopy(&zero_scan_data, &data, scan_data_t); if (OP(scan) != BRANCH) { /* Only one top-level choice. */ - scan_data_t data; I32 fake; STRLEN longest_float_length, longest_fixed_length; - StructCopy(&zero_scan_data, &data, scan_data_t); first = scan; /* Skip introductions and multiplicators >= 1. */ while ((OP(first) == OPEN && (sawopen = 1)) || @@ -1042,7 +1084,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); scan = r->program + 1; - minlen = study_chunk(&scan, &fake, scan + PL_regsize, NULL, 0); + minlen = study_chunk(&scan, &fake, scan + PL_regsize, &data, 0); r->check_substr = r->anchored_substr = r->float_substr = Nullsv; } @@ -1520,8 +1562,10 @@ S_regpiece(pTHX_ I32 *flagp) reginsert(CURLY, ret); } else { - PL_regnaughty += 4 + PL_regnaughty; /* compound interest */ - regtail(ret, reg_node(WHILEM)); + regnode *w = reg_node(WHILEM); + + w->flags = 0; + regtail(ret, w); if (!SIZE_ONLY && PL_extralen) { reginsert(LONGJMP,ret); reginsert(NOTHING,ret); @@ -1532,7 +1576,8 @@ S_regpiece(pTHX_ I32 *flagp) NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ regtail(ret, reg_node(NOTHING)); if (SIZE_ONLY) - PL_extralen += 3; + PL_reg_whilem_seen++, PL_extralen += 3; + PL_regnaughty += 4 + PL_regnaughty; /* compound interest */ } ret->flags = 0; @@ -3115,16 +3160,18 @@ Perl_regdump(pTHX_ regexp *r) /* Header fields of interest. */ if (r->anchored_substr) - PerlIO_printf(Perl_debug_log, "anchored `%s%s%s'%s at %d ", + PerlIO_printf(Perl_debug_log, "anchored `%s%.*s%s'%s at %d ", PL_colors[0], + SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0), SvPVX(r->anchored_substr), PL_colors[1], SvTAIL(r->anchored_substr) ? "$" : "", r->anchored_offset); if (r->float_substr) - PerlIO_printf(Perl_debug_log, "floating `%s%s%s'%s at %d..%u ", + PerlIO_printf(Perl_debug_log, "floating `%s%.*s%s'%s at %d..%u ", PL_colors[0], - SvPVX(r->float_substr), + SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0), + SvPVX(r->float_substr), PL_colors[1], SvTAIL(r->float_substr) ? "$" : "", r->float_min_offset, r->float_max_offset); @@ -3192,6 +3239,8 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); } + else if (k == WHILEM && o->flags) /* Ordinal/of */ + Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP ) Perl_sv_catpvf(aTHX_ sv, "%d", ARG(o)); /* Parenth number */ else if (k == LOGICAL) diff --git a/regexec.c b/regexec.c index e69c4ff..b464a40 100644 --- a/regexec.c +++ b/regexec.c @@ -270,25 +270,33 @@ S_cache_re(pTHX_ regexp *prog) /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ -/* If SCREAM, then sv should be compatible with strpos and strend. +/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend. Otherwise, only SvCUR(sv) is used to get strbeg. */ /* XXXX We assume that strpos is strbeg unless sv. */ +/* A failure to find a constant substring means that there is no need to make + an expensive call to REx engine, thus we celebrate a failure. Similarly, + finding a substring too deep into the string means that less calls to + regtry() should be needed. */ + char * Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, char *strend, U32 flags, re_scream_pos_data *data) { - I32 start_shift; + register I32 start_shift; /* Should be nonnegative! */ - I32 end_shift; - char *s; + register I32 end_shift; + register char *s; + register SV *check; char *t; I32 ml_anch; + char *tmp; + register char *other_last = Nullch; DEBUG_r( if (!PL_colorset) reginitcolors() ); DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sGuessing start of match:%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", + "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n", PL_colors[4],PL_colors[5],PL_colors[0], prog->precomp, PL_colors[1], @@ -299,128 +307,296 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, (strend - strpos > 60 ? "..." : "")) ); - if (prog->minlen > strend - strpos) + if (prog->minlen > strend - strpos) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n")); goto fail; - - /* XXXX Move further down? */ - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - /* Should be nonnegative! */ - end_shift = prog->minlen - start_shift - - CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); - - if (prog->reganch & ROPT_ANCH) { + } + if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */ ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE) || ( (prog->reganch & ROPT_ANCH_BOL) - && !PL_multiline ) ); + && !PL_multiline ) ); /* Check after \n? */ if ((prog->check_offset_min == prog->check_offset_max) && !ml_anch) { - /* Anchored... */ + /* Substring at constant offset from beg-of-str... */ I32 slen; if ( !(prog->reganch & ROPT_ANCH_GPOS) /* Checked by the caller */ - && (sv && (strpos + SvCUR(sv) != strend)) ) + && (sv && (strpos + SvCUR(sv) != strend)) ) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; - + } PL_regeol = strend; /* Used in HOP() */ - s = (char*)HOP((U8*)strpos, prog->check_offset_min); + s = HOPc(strpos, prog->check_offset_min); if (SvTAIL(prog->check_substr)) { slen = SvCUR(prog->check_substr); /* >= 1 */ - if ( strend - s > slen || strend - s < slen - 1 ) { - s = Nullch; - goto finish; - } - if ( strend - s == slen && strend[-1] != '\n') { - s = Nullch; - goto finish; + if ( strend - s > slen || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n')) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); + goto fail_finish; } /* Now should match s[0..slen-2] */ slen--; if (slen && (*SvPVX(prog->check_substr) != *s || (slen > 1 - && memNE(SvPVX(prog->check_substr), s, slen)))) - s = Nullch; + && memNE(SvPVX(prog->check_substr), s, slen)))) { + report_neq: + DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); + goto fail_finish; + } } else if (*SvPVX(prog->check_substr) != *s || ((slen = SvCUR(prog->check_substr)) > 1 && memNE(SvPVX(prog->check_substr), s, slen))) - s = Nullch; - else - s = strpos; - goto finish; + goto report_neq; + goto success_at_start; } + /* Match is anchored, but substr is not anchored wrt beg-of-str. */ s = strpos; - if (!ml_anch && (s + prog->check_offset_max < strend - prog->minlen)) - end_shift += strend - s - prog->minlen - prog->check_offset_max; + start_shift = prog->check_offset_min; /* okay to underestimate on CC */ + /* Should be nonnegative! */ + end_shift = prog->minlen - start_shift - + CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); + if (!ml_anch) { + I32 end = prog->check_offset_max + CHR_SVLEN(prog->check_substr) + - (SvTAIL(prog->check_substr) != 0); + I32 eshift = strend - s - end; + + if (end_shift < eshift) + end_shift = eshift; + } } - else { + else { /* Can match at random position */ ml_anch = 0; s = strpos; + start_shift = prog->check_offset_min; /* okay to underestimate on CC */ + /* Should be nonnegative! */ + end_shift = prog->minlen - start_shift - + CHR_SVLEN(prog->check_substr) + (SvTAIL(prog->check_substr) != 0); } - restart: +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) - end_shift = 0; /* can happen when strend == strpos */ + croak("panic: end_shift"); +#endif + + check = prog->check_substr; + restart: + /* Find a possible match in the region s..strend by looking for + the "check" substring in the region corrected by start/end_shift. */ if (flags & REXEC_SCREAM) { - SV *c = prog->check_substr; char *strbeg = SvPVX(sv); /* XXXX Assume PV_force() on SCREAM! */ I32 p = -1; /* Internal iterator of scream. */ I32 *pp = data ? data->scream_pos : &p; - if (PL_screamfirst[BmRARE(c)] >= 0 - || ( BmRARE(c) == '\n' - && (BmPREVIOUS(c) == SvCUR(c) - 1) - && SvTAIL(c) )) - s = screaminstr(sv, prog->check_substr, - start_shift + (strpos - strbeg), end_shift, pp, 0); + if (PL_screamfirst[BmRARE(check)] >= 0 + || ( BmRARE(check) == '\n' + && (BmPREVIOUS(check) == SvCUR(check) - 1) + && SvTAIL(check) )) + s = screaminstr(sv, check, + start_shift + (s - strbeg), end_shift, pp, 0); else - s = Nullch; + goto fail_finish; if (data) *data->scream_olds = s; } else s = fbm_instr((unsigned char*)s + start_shift, (unsigned char*)strend - end_shift, - prog->check_substr, PL_multiline ? FBMrf_MULTILINE : 0); + check, PL_multiline ? FBMrf_MULTILINE : 0); /* Update the count-of-usability, remove useless subpatterns, unshift s. */ - finish: - if (!s) { - ++BmUSEFUL(prog->check_substr); /* hooray */ - goto fail; /* not present */ + + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s", + (s ? "Found" : "Did not find"), + ((check == prog->anchored_substr) ? "anchored" : "floating"), + PL_colors[0], + SvCUR(check) - (SvTAIL(check)!=0), SvPVX(check), + PL_colors[1], (SvTAIL(check) ? "$" : ""), + (s ? " at offset " : "...\n") ) ); + + if (!s) + goto fail_finish; + + /* Finish the diagnostic message */ + DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - strpos)) ); + + /* Got a candidate. Check MBOL anchoring, and the *other* substr. + Start with the other substr. + XXXX no SCREAM optimization yet - and a very coarse implementation + XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will + *always* match. Probably should be marked during compile... + Probably it is right to do no SCREAM here... + */ + + if (prog->float_substr && prog->anchored_substr) { + /* Take into account the anchored substring. */ + /* XXXX May be hopelessly wrong for UTF... */ + if (!other_last) + other_last = strpos - 1; + if (check == prog->float_substr) { + char *last = s - start_shift, *last1, *last2; + char *s1 = s; + + tmp = PL_bostr; + t = s - prog->check_offset_max; + if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ + && (!(prog->reganch & ROPT_UTF8) + || (PL_bostr = strpos, /* Used in regcopmaybe() */ + (t = reghopmaybe_c(s, -(prog->check_offset_max))) + && t > strpos))) + ; + else + t = strpos; + t += prog->anchored_offset; + if (t <= other_last) + t = other_last + 1; + PL_bostr = tmp; + last2 = last1 = strend - prog->minlen; + if (last < last1) + last1 = last; + /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ + /* On end-of-str: see comment below. */ + s = fbm_instr((unsigned char*)t, + (unsigned char*)last1 + prog->anchored_offset + + SvCUR(prog->anchored_substr) + - (SvTAIL(prog->anchored_substr)!=0), + prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s", + (s ? "Found" : "Contradicts"), + PL_colors[0], + SvCUR(prog->anchored_substr) + - (SvTAIL(prog->anchored_substr)!=0), + SvPVX(prog->anchored_substr), + PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : ""))); + if (!s) { + if (last1 >= last2) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", trying floating at offset %ld...\n", + (long)(s1 + 1 - strpos))); + PL_regeol = strend; /* Used in HOP() */ + other_last = last1 + prog->anchored_offset; + s = HOPc(last, 1); + goto restart; + } + else { + DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + t = s - prog->anchored_offset; + other_last = s - 1; + if (t == strpos) + goto try_at_start; + s = s1; + goto try_at_offset; + } + } + else { /* Take into account the floating substring. */ + char *last, *last1; + char *s1 = s; + + t = s - start_shift; + last1 = last = strend - prog->minlen + prog->float_min_offset; + if (last - t > prog->float_max_offset) + last = t + prog->float_max_offset; + s = t + prog->float_min_offset; + if (s <= other_last) + s = other_last + 1; + /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ + /* fbm_instr() takes into account exact value of end-of-str + if the check is SvTAIL(ed). Since false positives are OK, + and end-of-str is not later than strend we are OK. */ + s = fbm_instr((unsigned char*)s, + (unsigned char*)last + SvCUR(prog->float_substr) + - (SvTAIL(prog->float_substr)!=0), + prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s", + (s ? "Found" : "Contradicts"), + PL_colors[0], + SvCUR(prog->float_substr) + - (SvTAIL(prog->float_substr)!=0), + SvPVX(prog->float_substr), + PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : ""))); + if (!s) { + if (last1 == last) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + DEBUG_r(PerlIO_printf(Perl_debug_log, + ", trying anchored starting at offset %ld...\n", + (long)(s1 + 1 - strpos))); + other_last = last; + PL_regeol = strend; /* Used in HOP() */ + s = HOPc(t, 1); + goto restart; + } + else { + DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + other_last = s - 1; + if (t == strpos) + goto try_at_start; + s = s1; + goto try_at_offset; + } + } } - else if (s - strpos > prog->check_offset_max && - ((prog->reganch & ROPT_UTF8) - ? ((t = reghopmaybe_c(s, -(prog->check_offset_max))) - && t >= strpos) - : (t = s - prog->check_offset_max) != 0) ) { + + t = s - prog->check_offset_max; + tmp = PL_bostr; + if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ + && (!(prog->reganch & ROPT_UTF8) + || (PL_bostr = strpos, /* Used in regcopmaybe() */ + ((t = reghopmaybe_c(s, -(prog->check_offset_max))) + && t > strpos)))) { + PL_bostr = tmp; + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + try_at_offset: if (ml_anch && t[-1] != '\n') { - find_anchor: - while (t < strend - end_shift - prog->minlen) { + find_anchor: /* Eventually fbm_*() should handle this */ + while (t < strend - prog->minlen) { if (*t == '\n') { if (t < s - prog->check_offset_min) { s = t + 1; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", + PL_colors[0],PL_colors[1], (long)(s - strpos))); goto set_useful; } + DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting at offset %ld...\n", + PL_colors[0],PL_colors[1], (long)(t + 1 - strpos))); s = t + 1; goto restart; } t++; } - s = Nullch; - goto finish; + DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", + PL_colors[0],PL_colors[1])); + goto fail_finish; } s = t; set_useful: - ++BmUSEFUL(prog->check_substr); /* hooray/2 */ + ++BmUSEFUL(prog->check_substr); /* hooray/5 */ } else { - if (ml_anch && sv + PL_bostr = tmp; + /* The found string does not prohibit matching at beg-of-str + - no optimization of calling REx engine can be performed, + unless it was an MBOL and we are not after MBOL. */ + try_at_start: + /* Even in this situation we may use MBOL flag if strpos is offset + wrt the start of the string. */ + if (ml_anch && sv && (strpos + SvCUR(sv) != strend) && strpos[-1] != '\n') { t = strpos; goto find_anchor; } + success_at_start: if (!(prog->reganch & ROPT_NAUGHTY) && --BmUSEFUL(prog->check_substr) < 0 && prog->check_substr == prog->float_substr) { /* boo */ @@ -435,11 +611,14 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, s = strpos; } - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sFound%s at offset %ld\n", - PL_colors[4],PL_colors[5], (long)(s - strpos)) ); + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sGuessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(s - strpos)) ); return s; + + fail_finish: /* Substring not found */ + BmUSEFUL(prog->check_substr) += 5; /* hooray */ fail: - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sNot found...%s\n", + DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", PL_colors[4],PL_colors[5])); return Nullch; } @@ -504,6 +683,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_reg_flags = 0; PL_reg_eval_set = 0; + PL_reg_maxiter = 0; if (prog->reganch & ROPT_UTF8) PL_reg_flags |= RF_utf8; @@ -552,7 +732,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * DEBUG_r( if (!PL_colorset) reginitcolors() ); DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sMatching%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", + "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], prog->precomp, PL_colors[1], @@ -3162,6 +3342,7 @@ S_regmatch(pTHX_ regnode *prog) case REFF: n = ARG(scan); /* which paren pair */ ln = PL_regstartp[n]; + PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (*PL_reglastparen < n || ln == -1) sayNO; /* Do not match unless seen CLOSEn. */ if (ln == PL_regendp[n]) @@ -3306,6 +3487,10 @@ S_regmatch(pTHX_ regnode *prog) *PL_reglastparen = 0; PL_reg_call_cc = &state; PL_reginput = locinput; + + /* XXXX This is too dramatic a measure... */ + PL_reg_maxiter = 0; + if (regmatch(re->program + 1)) { ReREFCNT_dec(re); regcpblow(cp); @@ -3323,6 +3508,10 @@ S_regmatch(pTHX_ regnode *prog) PL_regcc = state.cc; PL_reg_re = state.re; cache_re(PL_reg_re); + + /* XXXX This is too dramatic a measure... */ + PL_reg_maxiter = 0; + sayNO; } sw = SvTRUE(ret); @@ -3350,6 +3539,7 @@ S_regmatch(pTHX_ regnode *prog) sw = (*PL_reglastparen >= n && PL_regendp[n] != -1); break; case IFTHEN: + PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (sw) next = NEXTOPER(NEXTOPER(scan)); else { @@ -3388,7 +3578,7 @@ S_regmatch(pTHX_ regnode *prog) /* * This is really hard to understand, because after we match * what we're trying to match, we must make sure the rest of - * the RE is going to match for sure, and to do that we have + * the REx is going to match for sure, and to do that we have * to go back UP the parse tree by recursing ever deeper. And * if it fails, we have to reset our parent's current state * that we can try again after backing off. @@ -3448,6 +3638,51 @@ S_regmatch(pTHX_ regnode *prog) sayNO; } + if (scan->flags) { + /* Check whether we already were at this position. + Postpone detection until we know the match is not + *that* much linear. */ + if (!PL_reg_maxiter) { + PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4); + PL_reg_leftiter = PL_reg_maxiter; + } + if (PL_reg_leftiter-- == 0) { + I32 size = (PL_reg_maxiter + 7)/8; + if (PL_reg_poscache) { + if (PL_reg_poscache_size < size) { + Renew(PL_reg_poscache, size, char); + PL_reg_poscache_size = size; + } + Zero(PL_reg_poscache, size, char); + } + else { + PL_reg_poscache_size = size; + Newz(29, PL_reg_poscache, size, char); + } + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "%sDetected a super-linear match, switching on caching%s...\n", + PL_colors[4], PL_colors[5]) + ); + } + if (PL_reg_leftiter < 0) { + I32 o = locinput - PL_bostr, b; + + o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4); + b = o % 8; + o /= 8; + if (PL_reg_poscache[o] & (1<minmod) { diff --git a/t/op/re_tests b/t/op/re_tests index 34b6e29..899b35e 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -715,3 +715,23 @@ round\(((?>[^()]+))\) _I(round(xs * sz),1) y $1 xs * sz '((?x:.) )' x y $1- x - '((?-x:.) )'x x y $1- x- foo.bart foo.bart y - - +'^d[x][x][x]'m abcd\ndxxx y - - +.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.X(.+)+XX bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa y - - +.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - +tt+$ xxxtt y - - diff --git a/thrdvar.h b/thrdvar.h index 32a0c7f..4434b5d 100644 --- a/thrdvar.h +++ b/thrdvar.h @@ -143,6 +143,7 @@ PERLVAR(Tregcomp_rx, regexp *) /* from regcomp.c */ PERLVAR(Textralen, I32) /* from regcomp.c */ PERLVAR(Tcolorset, int) /* from regcomp.c */ PERLVARA(Tcolors,6, char *) /* from regcomp.c */ +PERLVAR(Treg_whilem_seen, I32) /* number of WHILEM in this expr */ PERLVAR(Treginput, char *) /* String-input pointer. */ PERLVAR(Tregbol, char *) /* Beginning of input, for ^ check. */ PERLVAR(Tregeol, char *) /* End of input, for $ check. */ @@ -172,6 +173,10 @@ PERLVARI(Treg_oldcurpm, PMOP*, NULL) /* curpm before match */ PERLVARI(Treg_curpm, PMOP*, NULL) /* curpm during match */ PERLVAR(Treg_oldsaved, char*) /* old saved substr during match */ PERLVAR(Treg_oldsavedlen, STRLEN) /* old length of saved substr during match */ +PERLVAR(Treg_maxiter, I32) /* max wait until caching pos */ +PERLVAR(Treg_leftiter, I32) /* wait until caching pos */ +PERLVARI(Treg_poscache, char *, Nullch) /* cache of pos of WHILEM */ +PERLVAR(Treg_poscache_size, STRLEN) /* size of pos cache of WHILEM */ PERLVARI(Tregcompp, regcomp_t, MEMBER_TO_FPTR(Perl_pregcomp)) /* Pointer to REx compiler */