From: Ilya Zakharevich Date: Sun, 16 Nov 1997 00:29:39 +0000 (-0500) Subject: Jumbo regexp patch applied (with minor fix-up tweaks): X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c277df42229d99fecbc76f5da53793a409ac66e1;p=p5sagit%2Fp5-mst-13.2.git Jumbo regexp patch applied (with minor fix-up tweaks): Subject: Version 7 of Jumbo RE patch available p4raw-id: //depot/perl@267 --- diff --git a/dump.c b/dump.c index 51fd157..24602e9 100644 --- a/dump.c +++ b/dump.c @@ -359,18 +359,17 @@ dump_pm(PMOP *pm) dump("PMf_REPL = "); dump_op(pm->op_pmreplroot); } - if (pm->op_pmshort) { - dump("PMf_SHORT = %s\n",SvPEEK(pm->op_pmshort)); - } - if (pm->op_pmflags) { + if (pm->op_pmflags || (pm->op_pmregexp && pm->op_pmregexp->check_substr)) { SV *tmpsv = newSVpv("", 0); if (pm->op_pmflags & PMf_USED) sv_catpv(tmpsv, ",USED"); if (pm->op_pmflags & PMf_ONCE) sv_catpv(tmpsv, ",ONCE"); - if (pm->op_pmflags & PMf_SCANFIRST) + if (pm->op_pmregexp && pm->op_pmregexp->check_substr + && !(pm->op_pmregexp->reganch & ROPT_NOSCAN)) sv_catpv(tmpsv, ",SCANFIRST"); - if (pm->op_pmflags & PMf_ALL) + if (pm->op_pmregexp && pm->op_pmregexp->check_substr + && pm->op_pmregexp->reganch & ROPT_CHECK_ALL) sv_catpv(tmpsv, ",ALL"); if (pm->op_pmflags & PMf_SKIPWHITE) sv_catpv(tmpsv, ",SKIPWHITE"); diff --git a/embed.h b/embed.h index 3594e87..6069959 100644 --- a/embed.h +++ b/embed.h @@ -185,7 +185,6 @@ #define do_trans Perl_do_trans #define do_vecset Perl_do_vecset #define do_vop Perl_do_vop -#define doeval Perl_doeval #define dofindlabel Perl_dofindlabel #define dopoptoeval Perl_dopoptoeval #define dounwind Perl_dounwind @@ -331,6 +330,7 @@ #define magic_clearsig Perl_magic_clearsig #define magic_existspack Perl_magic_existspack #define magic_freedefelem Perl_magic_freedefelem +#define magic_freeregexp Perl_magic_freeregexp #define magic_get Perl_magic_get #define magic_getarylen Perl_magic_getarylen #define magic_getdefelem Perl_magic_getdefelem @@ -890,32 +890,11 @@ #define reall_srchlen Perl_reall_srchlen #define ref Perl_ref #define refkids Perl_refkids -#define regarglen Perl_regarglen -#define regbol Perl_regbol -#define regcode Perl_regcode -#define regdummy Perl_regdummy #define regdump Perl_regdump -#define regendp Perl_regendp -#define regeol Perl_regeol -#define reginput Perl_reginput +#define regexec_flags Perl_regexec_flags #define regkind Perl_regkind -#define reglastparen Perl_reglastparen -#define regmyendp Perl_regmyendp -#define regmyp_size Perl_regmyp_size -#define regmystartp Perl_regmystartp -#define regnarrate Perl_regnarrate -#define regnaughty Perl_regnaughty #define regnext Perl_regnext -#define regnpar Perl_regnpar -#define regparse Perl_regparse -#define regprecomp Perl_regprecomp -#define regprev Perl_regprev #define regprop Perl_regprop -#define regsawback Perl_regsawback -#define regsize Perl_regsize -#define regstartp Perl_regstartp -#define regtill Perl_regtill -#define regxend Perl_regxend #define repeat_amg Perl_repeat_amg #define repeat_ass_amg Perl_repeat_ass_amg #define repeatcpy Perl_repeatcpy @@ -1046,6 +1025,7 @@ #define sv_cmp Perl_sv_cmp #define sv_cmp_locale Perl_sv_cmp_locale #define sv_collxfrm Perl_sv_collxfrm +#define sv_compile_2op Perl_sv_compile_2op #define sv_dec Perl_sv_dec #define sv_derived_from Perl_sv_derived_from #define sv_dump Perl_sv_dump @@ -1197,7 +1177,6 @@ #define no_wrongref Perl_no_wrongref #define pad_reset_pending Perl_pad_reset_pending #define padix_floor Perl_padix_floor -#define regflags Perl_regflags #define safecalloc Perl_safecalloc #define safefree Perl_safefree #define safemalloc Perl_safemalloc diff --git a/global.sym b/global.sym index f2352f3..b720369 100644 --- a/global.sym +++ b/global.sym @@ -173,30 +173,11 @@ psig_name psig_ptr rcsid reall_srchlen -regarglen -regbol -regcode -regdummy -regendp -regeol -regflags -reginput +regdump +regexec_flags regkind -reglastparen -regmyendp -regmyp_size -regmystartp -regnarrate -regnaughty -regnpar -regparse -regprecomp -regprev -regsawback -regsize -regstartp -regtill -regxend +regnext +regprop repeat_amg repeat_ass_amg retstack @@ -429,7 +410,6 @@ do_tell do_trans do_vecset do_vop -doeval dofindlabel dopoptoeval dounwind @@ -526,6 +506,7 @@ magic_clearpack magic_clearsig magic_existspack magic_freedefelem +magic_freeregexp magic_get magic_getarylen magic_getdefelem @@ -1021,6 +1002,7 @@ q ref refkids regdump +regexec_flags regnext regprop repeatcpy @@ -1120,6 +1102,7 @@ sv_clean_objs sv_clear sv_cmp sv_cmp_locale +sv_compile_2op sv_collxfrm sv_dec sv_derived_from diff --git a/mg.c b/mg.c index 97e9d99..b7b09d3 100644 --- a/mg.c +++ b/mg.c @@ -418,7 +418,7 @@ magic_get(SV *sv, MAGIC *mg) } sv_setpvn(sv,s,i); if (tainting) - tainted = was_tainted || rx->exec_tainted; + tainted = was_tainted || RX_MATCH_TAINTED(rx); break; } } @@ -1305,6 +1305,14 @@ magic_setuvar(SV *sv, MAGIC *mg) return 0; } +int +magic_freeregexp(SV *sv, MAGIC *mg) +{ + regexp *re = (regexp *)mg->mg_obj; + ReREFCNT_dec(re); + return 0; +} + #ifdef USE_LOCALE_COLLATE int magic_setcollxfrm(SV *sv, MAGIC *mg) diff --git a/op.c b/op.c index 21dc249..30cbe3a 100644 --- a/op.c +++ b/op.c @@ -522,6 +522,12 @@ find_thread_magical(char *name) case ';': sv_setpv(sv, "\034"); break; + case '&': + case '`': + case '\'': + sawampersand = TRUE; + SvREADONLY_on(sv); + break; } sv_magic(sv, 0, 0, name, 1); DEBUG_L(PerlIO_printf(PerlIO_stderr(), @@ -594,8 +600,7 @@ op_free(OP *o) /* FALL THROUGH */ case OP_PUSHRE: case OP_MATCH: - pregfree(cPMOPo->op_pmregexp); - SvREFCNT_dec(cPMOPo->op_pmshort); + ReREFCNT_dec(cPMOPo->op_pmregexp); break; } @@ -1914,7 +1919,12 @@ newUNOP(I32 type, I32 flags, OP *first) unop->op_first = first; unop->op_flags = flags | OPf_KIDS; unop->op_private = 1 | (flags >> 8); - +#if 1 + if(type == OP_STUDY && first->op_type == OP_MATCH) { + first->op_type = OP_PUSHRE; + first->op_ppaddr = ppaddr[OP_PUSHRE]; + } +#endif unop = (UNOP*) CHECKOP(type, unop); if (unop->op_next) return (OP*)unop; @@ -2065,7 +2075,6 @@ pmruntime(OP *o, OP *expr, OP *repl) pm->op_pmregexp = pregcomp(p, p + plen, pm); if (strEQ("\\s+", pm->op_pmregexp->precomp)) pm->op_pmflags |= PMf_WHITE; - hoistmust(pm); op_free(expr); } else { @@ -4446,7 +4455,6 @@ OP * ck_split(OP *o) { register OP *kid; - PMOP* pm; if (o->op_flags & OPf_STACKED) return no_fh_allowed(o); @@ -4471,11 +4479,6 @@ ck_split(OP *o) cLISTOPo->op_first = kid; kid->op_sibling = sibl; } - pm = (PMOP*)kid; - if (pm->op_pmshort && !(pm->op_pmflags & PMf_ALL)) { - SvREFCNT_dec(pm->op_pmshort); /* can't use substring to optimize */ - pm->op_pmshort = 0; - } kid->op_type = OP_PUSHRE; kid->op_ppaddr = ppaddr[OP_PUSHRE]; diff --git a/op.h b/op.h index 7e853c5..6aa441f 100644 --- a/op.h +++ b/op.h @@ -177,16 +177,14 @@ struct pmop { OP * op_pmreplstart; PMOP * op_pmnext; /* list of all scanpats */ REGEXP * op_pmregexp; /* compiled expression */ - SV * op_pmshort; /* for a fast bypass of execute() */ U16 op_pmflags; U16 op_pmpermflags; - char op_pmslen; }; #define PMf_USED 0x0001 /* pm has been used once already */ #define PMf_ONCE 0x0002 /* use pattern only once per reset */ -#define PMf_SCANFIRST 0x0004 /* initial constant not anchored */ -#define PMf_ALL 0x0008 /* initial constant is whole pat */ +#define PMf_REVERSED 0x0004 /* Should be matched right->left */ +/*#define PMf_ALL 0x0008*/ /* initial constant is whole pat */ #define PMf_SKIPWHITE 0x0010 /* skip leading whitespace for split */ #define PMf_FOLD 0x0020 /* case insensitivity */ #define PMf_CONST 0x0040 /* subst replacement is constant */ diff --git a/perl.c b/perl.c index c2f7ffc..6f3e15c 100644 --- a/perl.c +++ b/perl.c @@ -418,36 +418,6 @@ perl_destruct(register PerlInterpreter *sv_interp) /* defgv, aka *_ should be taken care of elsewhere */ -#if 0 /* just about all regexp stuff, seems to be ok */ - - /* shortcuts to regexp stuff */ - leftgv = Nullgv; - ampergv = Nullgv; - - SAVEFREEOP(curpm); - SAVEFREEOP(oldlastpm); /* for saving regexp context during debugger */ - - regprecomp = NULL; /* uncompiled string. */ - regparse = NULL; /* Input-scan pointer. */ - regxend = NULL; /* End of input for compile */ - regnpar = 0; /* () count. */ - regcode = NULL; /* Code-emit pointer; ®dummy = don't. */ - regsize = 0; /* Code size. */ - regnaughty = 0; /* How bad is this pattern? */ - regsawback = 0; /* Did we see \1, ...? */ - - reginput = NULL; /* String-input pointer. */ - regbol = NULL; /* Beginning of input, for ^ check. */ - regeol = NULL; /* End of input, for $ check. */ - regstartp = (char **)NULL; /* Pointer to startp array. */ - regendp = (char **)NULL; /* Ditto for endp. */ - reglastparen = 0; /* Similarly for lastparen. */ - regtill = NULL; /* How far we are required to go. */ - regflags = 0; /* are we folding, multilining? */ - regprev = (char)NULL; /* char before regbol, \n if none */ - -#endif /* if 0 */ - /* clean up after study() */ SvREFCNT_dec(lastscream); lastscream = Nullsv; diff --git a/perl.h b/perl.h index 4894aa0..431ac96 100644 --- a/perl.h +++ b/perl.h @@ -1748,29 +1748,6 @@ EXT U32 hints; /* various compilation flags */ #define HINT_STRICT_VARS 0x00000400 #define HINT_LOCALE 0x00000800 -/**************************************************************************/ -/* This regexp stuff is global since it always happens within 1 expr eval */ -/**************************************************************************/ - -EXT char * regprecomp; /* uncompiled string. */ -EXT char * regparse; /* Input-scan pointer. */ -EXT char * regxend; /* End of input for compile */ -EXT I32 regnpar; /* () count. */ -EXT char * regcode; /* Code-emit pointer; ®dummy = don't. */ -EXT I32 regsize; /* Code size. */ -EXT I32 regnaughty; /* How bad is this pattern? */ -EXT I32 regsawback; /* Did we see \1, ...? */ - -EXT char * reginput; /* String-input pointer. */ -EXT char * regbol; /* Beginning of input, for ^ check. */ -EXT char * regeol; /* End of input, for $ check. */ -EXT char ** regstartp; /* Pointer to startp array. */ -EXT char ** regendp; /* Ditto for endp. */ -EXT U32 * reglastparen; /* Similarly for lastparen. */ -EXT char * regtill; /* How far we are required to go. */ -EXT U16 regflags; /* are we folding, multilining? */ -EXT char regprev; /* char before regbol, \n if none */ - EXT bool do_undump; /* -u or dump seen? */ EXT VOL U32 debug; @@ -2072,6 +2049,8 @@ EXT MGVTBL vtbl_mutex = {0, 0, 0, 0, magic_mutexfree}; EXT MGVTBL vtbl_defelem = {magic_getdefelem,magic_setdefelem, 0, 0, magic_freedefelem}; +EXT MGVTBL vtbl_regexp = {0,0,0,0, magic_freeregexp}; + #ifdef USE_LOCALE_COLLATE EXT MGVTBL vtbl_collxfrm = {0, magic_setcollxfrm, @@ -2114,6 +2093,7 @@ EXT MGVTBL vtbl_mutex; #endif /* USE_THREADS */ EXT MGVTBL vtbl_defelem; +EXT MGVTBL vtbl_regexp; #ifdef USE_LOCALE_COLLATE EXT MGVTBL vtbl_collxfrm; diff --git a/pod/perlre.pod b/pod/perlre.pod index 14892a8..7d0ba54 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -289,6 +289,104 @@ easier just to say: if (/foo/ && $` =~ /bar$/) +For lookbehind see below. + +=item (?<=regexp) + +A zero-width positive lookbehind assertion. For example, C +matches a word following a tab, without including the tab in C<$&>. +Works only for fixed-width lookbehind. + +=item (? +matches any occurrence of "foo" that isn't following "bar". +Works only for fixed-width lookbehind. + +=item (?{ code }) + +Experimental "evaluate any Perl code" zero-width assertion. Always +succeeds. Currently the quoting rules are somewhat convoluted, as is the +determination where the C ends. + + +=item C<(?Eregexp)> + +An "independend" subexpression. Matches the substring which a +I C would match if anchored at the given position, +B. + +Say, C<^(?Ea*)ab> will never match, since C<(?Ea*)> (anchored +at the beginning of string, as above) will match I the characters +C at the beginning of string, leaving no C for C to match. +In contrast, C will match the same as C, since the match of +the subgroup C is influenced by the following group C (see +L<"Backtracking">). In particular, C inside C will match +less characters that a standalone C, since this makes the tail match. + +Note that a similar effect to C<(?Eregexp)> may be achieved by + + (?=(regexp))\1 + +since the lookahead is in I<"logical"> context, thus matches the same +substring as a standalone C. The following C<\1> eats the matched +string, thus making a zero-length assertion into an analogue of +C<(?>...)>. (The difference of these two constructions is that the +second one uses a catching group, thus shifts ordinals of +backreferences in the rest of a regular expression.) + +This construction is very useful for optimizations of "eternal" +matches, since it will not backtrack (see L<"Backtracking">). Say, + + / \( ( + [^()]+ + | + \( [^()]* \) + )+ + \) /x + +will match a nonempty group with matching two-or-less-level-deep +parentheses. It is very efficient in finding such groups. However, +if there is no such group, it is going to take forever (on reasonably +long string), since there are so many different ways to split a long +string into several substrings (this is essentially what C<(.+)+> is +doing, and this is a subpattern of the above pattern). Say, on +C<((()aaaaaaaaaaaaaaaaaa> the above pattern detects no-match in 5sec +(on kitchentop'96 processor), and each extra letter doubles this time. + +However, a tiny modification of this + + / \( ( + (?> [^()]+ ) + | + \( [^()]* \) + )+ + \) /x + +which uses (?>...) matches exactly when the above one does (it is a +good excercise to check this), but finishes in a fourth of the above +time on a similar string with 1000000 Cs. + +Note that on simple groups like the above C<(?> [^()]+ )> a similar +effect may be achieved by negative lookahead, as in C<[^()]+ (?! [^()] )>. +This was only 4 times slower on a string with 1000000 Cs. + +=item (?(condition)yes-regexp|no-regexp) + +=item (?(condition)yes-regexp) + +Conditional expression. C<(condition)> should be either an integer in +parentheses (which is valid if the corresponding pair of parentheses +matched), or lookahead/lookbehind/evaluate zero-width assertion. + +Say, + + / ( \( )? + [^()]+ + (?(1) \) )/x + +matches a chunk of non-parentheses, possibly included in parentheses +themselves. =item (?imsx) @@ -306,6 +404,15 @@ pattern. For example: $pattern = "(?i)foobar"; if ( /$pattern/ ) +Note that these modifiers are localized inside an enclosing group (if +any). Say, + + ( (?i) blah ) \s+ \1 + +(assuming C modifier, and no C modifier outside of this group) +will match a repeated (I!) word C in any +case. + =back The specific choice of question mark for this and the new minimal @@ -315,10 +422,10 @@ and "question" exactly what is going on. That's psychology... =head2 Backtracking -A fundamental feature of regular expression matching involves the notion -called I. which is used (when needed) by all regular -expression quantifiers, namely C<*>, C<*?>, C<+>, C<+?>, C<{n,m}>, and -C<{n,m}?>. +A fundamental feature of regular expression matching involves the +notion called I. which is currently used (when needed) +by all regular expression quantifiers, namely C<*>, C<*?>, C<+>, +C<+?>, C<{n,m}>, and C<{n,m}?>. For a regular expression to match, the I regular expression must match, not just part of it. So if the beginning of a pattern containing a @@ -498,6 +605,14 @@ time to run And if you used C<*>'s instead of limiting it to 0 through 5 matches, then it would take literally forever--or until you ran out of stack space. +A powerful tool for optimizing such beasts is "independent" groups, +which do not backtrace (see Lregexp)>>). Note also that +zero-length lookahead/lookbehind assertions will not backtrace to make +the tail match, since they are in "logical" context: only the fact +whether they match or not is considered relevant. For an example +where side-effects of a lookahead I have influenced the +following match, see Lregexp)>>. + =head2 Version 8 Regular Expressions In case you're not familiar with the "regular" Version 8 regexp diff --git a/pp.c b/pp.c index 819aea7..5ed3e98 100644 --- a/pp.c +++ b/pp.c @@ -514,6 +514,7 @@ PP(pp_gelem) PP(pp_study) { djSP; dPOPss; + register UNOP *unop = cUNOP; register unsigned char *s; register I32 pos; register I32 ch; @@ -521,6 +522,14 @@ PP(pp_study) register I32 *snext; STRLEN len; + if(unop->op_first && unop->op_first->op_type == OP_PUSHRE) { + PMOP *pm = (PMOP *)unop->op_first; + SV *rv = sv_newmortal(); + sv = newSVrv(rv, "Regexp"); + sv_magic(sv,(SV*)ReREFCNT_inc(pm->op_pmregexp),'r',0,0); + RETURNX(PUSHs(rv)); + } + if (sv == lastscream) { if (SvSCREAM(sv)) RETPUSHYES; @@ -4126,10 +4135,12 @@ PP(pp_split) s = m; } } - else if (pm->op_pmshort && !rx->nparens) { - i = SvCUR(pm->op_pmshort); - if (i == 1) { - i = *SvPVX(pm->op_pmshort); + else if (rx->check_substr && !rx->nparens + && (rx->reganch & ROPT_CHECK_ALL) + && !(rx->reganch & ROPT_ANCH)) { + i = SvCUR(rx->check_substr); + if (i == 1 && !SvTAIL(rx->check_substr)) { + i = *SvPVX(rx->check_substr); while (--limit) { /*SUPPRESS 530*/ for (m = s; m < strend && *m != i; m++) ; @@ -4147,7 +4158,7 @@ PP(pp_split) #ifndef lint while (s < strend && --limit && (m=fbm_instr((unsigned char*)s, (unsigned char*)strend, - pm->op_pmshort)) ) + rx->check_substr)) ) #endif { dstr = NEWSV(31, m-s); @@ -4162,9 +4173,9 @@ PP(pp_split) else { maxiters += (strend - s) * rx->nparens; while (s < strend && --limit && - pregexec(rx, s, strend, orig, 1, Nullsv, TRUE)) + regexec_flags(rx, s, strend, orig, 1, Nullsv, NULL, 0)) { - TAINT_IF(rx->exec_tainted); + TAINT_IF(RX_MATCH_TAINTED(rx)); if (rx->subbase && rx->subbase != orig) { m = s; diff --git a/pp_ctl.c b/pp_ctl.c index 1ba4c8f..8691cfa 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -26,7 +26,6 @@ #define DOCATCH(o) ((CATCH_GET == TRUE) ? docatch(o) : (o)) static OP *docatch _((OP *o)); -static OP *doeval _((int gimme)); static OP *dofindlabel _((OP *o, char *label, OP **opstack, OP **oplimit)); static void doparseform _((SV *sv)); static I32 dopoptoeval _((I32 startingblock)); @@ -37,6 +36,7 @@ static void save_lines _((AV *array, SV *sv)); static int sortcv _((const void *, const void *)); static int sortcmp _((const void *, const void *)); static int sortcmp_locale _((const void *, const void *)); +static OP *doeval _((int gimme, OP** startop)); static I32 sortcxix; @@ -71,21 +71,34 @@ PP(pp_regcomp) { register char *t; SV *tmpstr; STRLEN len; + MAGIC *mg = Null(MAGIC*); tmpstr = POPs; - t = SvPV(tmpstr, len); - - /* JMR: Check against the last compiled regexp */ - if ( ! pm->op_pmregexp || ! pm->op_pmregexp->precomp - || strnNE(pm->op_pmregexp->precomp, t, len) - || pm->op_pmregexp->precomp[len]) { - if (pm->op_pmregexp) { - pregfree(pm->op_pmregexp); - pm->op_pmregexp = Null(REGEXP*); /* crucial if regcomp aborts */ - } + if(SvROK(tmpstr)) { + SV *sv = SvRV(tmpstr); + if(SvMAGICAL(sv)) + mg = mg_find(sv, 'r'); + } + if(mg) { + regexp *re = (regexp *)mg->mg_obj; + ReREFCNT_dec(pm->op_pmregexp); + pm->op_pmregexp = ReREFCNT_inc(re); + } + else { + t = SvPV(tmpstr, len); + + /* JMR: Check against the last compiled regexp */ + if ( ! pm->op_pmregexp || ! pm->op_pmregexp->precomp + || strnNE(pm->op_pmregexp->precomp, t, len) + || pm->op_pmregexp->precomp[len]) { + if (pm->op_pmregexp) { + ReREFCNT_dec(pm->op_pmregexp); + pm->op_pmregexp = Null(REGEXP*); /* crucial if regcomp aborts */ + } - pm->op_pmflags = pm->op_pmpermflags; /* reset case sensitivity */ - pm->op_pmregexp = pregcomp(t, t + len, pm); + pm->op_pmflags = pm->op_pmpermflags; /* reset case sensitivity */ + pm->op_pmregexp = pregcomp(t, t + len, pm); + } } if (!pm->op_pmregexp->prelen && curpm) @@ -95,7 +108,6 @@ PP(pp_regcomp) { if (pm->op_pmflags & PMf_KEEP) { pm->op_private &= ~OPpRUNTIME; /* no point compiling again */ - hoistmust(pm); cLOGOP->op_first->op_next = op->op_next; } RETURN; @@ -123,13 +135,14 @@ PP(pp_substcont) sv_catsv(dstr, POPs); /* Are we done */ - if (cx->sb_once || !pregexec(rx, s, cx->sb_strend, orig, - s == m, Nullsv, cx->sb_safebase)) + if (cx->sb_once || !regexec_flags(rx, s, cx->sb_strend, orig, + s == m, Nullsv, NULL, + cx->sb_safebase ? 0 : REXEC_COPY_STR)) { SV *targ = cx->sb_targ; sv_catpvn(dstr, s, cx->sb_strend - s); - TAINT_IF(cx->sb_rxtainted || rx->exec_tainted); + TAINT_IF(cx->sb_rxtainted || RX_MATCH_TAINTED(rx)); (void)SvOOK_off(targ); Safefree(SvPVX(targ)); @@ -158,7 +171,7 @@ PP(pp_substcont) cx->sb_m = m = rx->startp[0]; sv_catpvn(dstr, s, m-s); cx->sb_s = rx->endp[0]; - cx->sb_rxtainted |= rx->exec_tainted; + cx->sb_rxtainted |= RX_MATCH_TAINTED(rx); rxres_save(&cx->sb_rxres, rx); RETURNOP(pm->op_pmreplstart); } @@ -2087,9 +2100,63 @@ docatch(OP *o) return Nullop; } +OP * +sv_compile_2op(SV *sv, OP** startop, char *code, AV** avp) +/* sv Text to convert to OP tree. */ +/* startop op_free() this to undo. */ +/* code Short string id of the caller. */ +{ + dSP; /* Make POPBLOCK work. */ + PERL_CONTEXT *cx; + SV **newsp; + I32 gimme; + I32 optype; + OP dummy; + OP *oop = op, *rop; + char tmpbuf[TYPE_DIGITS(long) + 12 + 10]; + char *safestr; + + ENTER; + lex_start(sv); + SAVETMPS; + /* switch to eval mode */ + + SAVESPTR(compiling.cop_filegv); + SAVEI16(compiling.cop_line); + sprintf(tmpbuf, "_<(%.10s_eval %lu)", code, (unsigned long)++evalseq); + compiling.cop_filegv = gv_fetchfile(tmpbuf+2); + compiling.cop_line = 1; + /* XXX For Cs within BEGIN {} blocks, this ends up + deleting the eval's FILEGV from the stash before gv_check() runs + (i.e. before run-time proper). To work around the coredump that + ensues, we always turn GvMULTI_on for any globals that were + introduced within evals. See force_ident(). GSAR 96-10-12 */ + safestr = savepv(tmpbuf); + SAVEDELETE(defstash, safestr, strlen(safestr)); + SAVEI32(hints); + SAVEPPTR(op); + hints = 0; + + op = &dummy; + op->op_type = 0; /* Avoid uninit warning. */ + op->op_flags = 0; /* Avoid uninit warning. */ + PUSHBLOCK(cx, CXt_EVAL, SP); + PUSHEVAL(cx, 0, compiling.cop_filegv); + rop = doeval(G_SCALAR, startop); + POPBLOCK(cx,curpm); + POPEVAL(cx); + + (*startop)->op_type = OP_NULL; + (*startop)->op_ppaddr = ppaddr[OP_NULL]; + lex_end(); + *avp = (AV*)SvREFCNT_inc(comppad); + LEAVE; + return rop; +} + /* With USE_THREADS, eval_owner must be held on entry to doeval */ static OP * -doeval(int gimme) +doeval(int gimme, OP** startop) { dSP; OP *saveop = op; @@ -2141,7 +2208,7 @@ doeval(int gimme) av_store(comppadlist, 1, (SV*)comppad); CvPADLIST(compcv) = comppadlist; - if (saveop->op_type != OP_REQUIRE) + if (!saveop || saveop->op_type != OP_REQUIRE) CvOUTSIDE(compcv) = (CV*)SvREFCNT_inc(caller); SAVEFREESV(compcv); @@ -2165,7 +2232,7 @@ doeval(int gimme) curcop->cop_arybase = 0; SvREFCNT_dec(rs); rs = newSVpv("\n", 1); - if (saveop->op_flags & OPf_SPECIAL) + if (saveop && saveop->op_flags & OPf_SPECIAL) in_eval |= 4; else sv_setpv(ERRSV,""); @@ -2173,7 +2240,7 @@ doeval(int gimme) SV **newsp; I32 gimme; PERL_CONTEXT *cx; - I32 optype; + I32 optype = 0; /* Might be reset by POPEVAL. */ op = saveop; if (eval_root) { @@ -2181,14 +2248,22 @@ doeval(int gimme) eval_root = Nullop; } SP = stack_base + POPMARK; /* pop original mark */ - POPBLOCK(cx,curpm); - POPEVAL(cx); - pop_return(); + if (!startop) { + POPBLOCK(cx,curpm); + POPEVAL(cx); + pop_return(); + } lex_end(); LEAVE; if (optype == OP_REQUIRE) { char* msg = SvPVx(ERRSV, na); DIE("%s", *msg ? msg : "Compilation failed in require"); + } else if (startop) { + char* msg = SvPVx(ERRSV, na); + + POPBLOCK(cx,curpm); + POPEVAL(cx); + croak("%sCompilation failed in regexp", (*msg ? msg : "Unknown error\n")); } SvREFCNT_dec(rs); rs = SvREFCNT_inc(nrs); @@ -2203,7 +2278,12 @@ doeval(int gimme) SvREFCNT_dec(rs); rs = SvREFCNT_inc(nrs); compiling.cop_line = 0; - SAVEFREEOP(eval_root); + if (startop) { + *startop = eval_root; + SvREFCNT_dec(CvOUTSIDE(compcv)); + CvOUTSIDE(compcv) = Nullcv; + } else + SAVEFREEOP(eval_root); if (gimme & G_VOID) scalarvoid(eval_root); else if (gimme & G_ARRAY) @@ -2229,7 +2309,7 @@ doeval(int gimme) CvDEPTH(compcv) = 1; SP = stack_base + POPMARK; /* pop original mark */ - op = saveop; /* The caller may need it. */ + op = saveop; /* The caller may need it. */ #ifdef USE_THREADS MUTEX_LOCK(&eval_mutex); eval_owner = 0; @@ -2382,7 +2462,7 @@ PP(pp_require) eval_owner = thr; MUTEX_UNLOCK(&eval_mutex); #endif /* USE_THREADS */ - return DOCATCH(doeval(G_SCALAR)); + return DOCATCH(doeval(G_SCALAR, NULL)); } PP(pp_dofile) @@ -2442,7 +2522,7 @@ PP(pp_entereval) eval_owner = thr; MUTEX_UNLOCK(&eval_mutex); #endif /* USE_THREADS */ - ret = doeval(gimme); + ret = doeval(gimme, NULL); if (PERLDB_INTER && was != sub_generation /* Some subs defined here. */ && ret != op->op_next) { /* Successive compilation. */ strcpy(safestr, "_<(eval )"); /* Anything fake and short. */ diff --git a/pp_hot.c b/pp_hot.c index 6dbc259..d05f578 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -754,6 +754,7 @@ PP(pp_match) I32 minmatch = 0; I32 oldsave = savestack_ix; I32 update_minmatch = 1; + SV *screamer; if (op->op_flags & OPf_STACKED) TARG = POPs; @@ -761,6 +762,7 @@ PP(pp_match) TARG = GvSV(defgv); EXTEND(SP,1); } + PUTBACK; /* EVAL blocks need stack_sp. */ s = SvPV(TARG, len); strend = s + len; if (!s) @@ -768,6 +770,7 @@ PP(pp_match) TAINT_NOT; if (pm->op_pmflags & PMf_USED) { + failure: if (gimme == G_ARRAY) RETURN; RETPUSHNO; @@ -777,6 +780,12 @@ PP(pp_match) pm = curpm; rx = pm->op_pmregexp; } + if (rx->minlen > len) goto failure; + + screamer = ( (SvSCREAM(TARG) && rx->check_substr + && SvTYPE(rx->check_substr) == SVt_PVBM + && SvVALID(rx->check_substr)) + ? TARG : Nullsv); truebase = t = s; if (global = pm->op_pmflags & PMf_GLOBAL) { rx->startp[0] = 0; @@ -793,6 +802,7 @@ PP(pp_match) gimme = G_SCALAR; /* accidental array context? */ safebase = (((gimme == G_ARRAY) || global || !rx->nparens) && !sawampersand); + safebase = safebase ? 0 : REXEC_COPY_STR ; if (pm->op_pmflags & (PMf_MULTILINE|PMf_SINGLELINE)) { SAVEINT(multiline); multiline = pm->op_pmflags & PMf_MULTILINE; @@ -806,43 +816,52 @@ play_it_again: if (update_minmatch++) minmatch = (s == rx->startp[0]); } - if (pm->op_pmshort) { - if (pm->op_pmflags & PMf_SCANFIRST) { - if (SvSCREAM(TARG)) { - if (screamfirst[BmRARE(pm->op_pmshort)] < 0) + if (rx->check_substr) { + if (!(rx->reganch & ROPT_NOSCAN)) { /* Floating checkstring. */ + if ( screamer ) { + I32 p = -1; + + if (screamfirst[BmRARE(rx->check_substr)] < 0) goto nope; - else if (!(s = screaminstr(TARG, pm->op_pmshort))) + else if (!(s = screaminstr(TARG, rx->check_substr, + rx->check_offset_min, 0, &p, 0))) goto nope; - else if (pm->op_pmflags & PMf_ALL) + else if ((rx->reganch & ROPT_CHECK_ALL) + && !sawampersand && !SvTAIL(rx->check_substr)) goto yup; } - else if (!(s = fbm_instr((unsigned char*)s, - (unsigned char*)strend, pm->op_pmshort))) + else if (!(s = fbm_instr((unsigned char*)s + rx->check_offset_min, + (unsigned char*)strend, + rx->check_substr))) goto nope; - else if (pm->op_pmflags & PMf_ALL) + else if ((rx->reganch & ROPT_CHECK_ALL) && !sawampersand) goto yup; - if (s && rx->regback >= 0) { - ++BmUSEFUL(pm->op_pmshort); - s -= rx->regback; - if (s < t) - s = t; + if (s && rx->check_offset_max < t - s) { + ++BmUSEFUL(rx->check_substr); + s -= rx->check_offset_max; } else s = t; } - else if (!multiline) { - if (*SvPVX(pm->op_pmshort) != *s - || (pm->op_pmslen > 1 - && memNE(SvPVX(pm->op_pmshort), s, pm->op_pmslen))) + /* Now checkstring is fixed, i.e. at fixed offset from the + beginning of match, and the match is anchored at s. */ + else if (!multiline) { /* Anchored near beginning of string. */ + I32 slen; + if (*SvPVX(rx->check_substr) != s[rx->check_offset_min] + || ((slen = SvCUR(rx->check_substr)) > 1 + && memNE(SvPVX(rx->check_substr), + s + rx->check_offset_min, slen))) goto nope; } - if (!rx->naughty && --BmUSEFUL(pm->op_pmshort) < 0) { - SvREFCNT_dec(pm->op_pmshort); - pm->op_pmshort = Nullsv; /* opt is being useless */ + if (!rx->naughty && --BmUSEFUL(rx->check_substr) < 0 + && rx->check_substr == rx->float_substr) { + SvREFCNT_dec(rx->check_substr); + rx->check_substr = Nullsv; /* opt is being useless */ + rx->float_substr = Nullsv; } } - if (pregexec(rx, s, strend, truebase, minmatch, - SvSCREAM(TARG) ? TARG : Nullsv, safebase)) + if (regexec_flags(rx, s, strend, truebase, minmatch, + screamer, NULL, safebase)) { curpm = pm; if (pm->op_pmflags & PMf_ONCE) @@ -854,7 +873,7 @@ play_it_again: /*NOTREACHED*/ gotcha: - TAINT_IF(rx->exec_tainted); + TAINT_IF(RX_MATCH_TAINTED(rx)); if (gimme == G_ARRAY) { I32 iters, i, len; @@ -863,6 +882,7 @@ play_it_again: i = 1; else i = 0; + SPAGAIN; /* EVAL blocks could move the stack. */ EXTEND(SP, iters + i); EXTEND_MORTAL(iters + i); for (i = !i; i <= iters; i++) { @@ -878,6 +898,7 @@ play_it_again: strend = rx->subend; if (rx->startp[0] && rx->startp[0] == rx->endp[0]) ++rx->endp[0]; + PUTBACK; /* EVAL blocks may use stack */ goto play_it_again; } LEAVE_SCOPE(oldsave); @@ -904,9 +925,9 @@ play_it_again: RETPUSHYES; } -yup: - TAINT_IF(rx->exec_tainted); - ++BmUSEFUL(pm->op_pmshort); +yup: /* Confirmed by check_substr */ + TAINT_IF(RX_MATCH_TAINTED(rx)); + ++BmUSEFUL(rx->check_substr); curpm = pm; if (pm->op_pmflags & PMf_ONCE) pm->op_pmflags |= PMf_USED; @@ -916,7 +937,7 @@ yup: rx->subbeg = truebase; rx->subend = strend; rx->startp[0] = s; - rx->endp[0] = s + SvCUR(pm->op_pmshort); + rx->endp[0] = s + SvCUR(rx->check_substr); goto gotcha; } if (sawampersand) { @@ -926,14 +947,14 @@ yup: rx->subbeg = tmps; rx->subend = tmps + (strend-t); tmps = rx->startp[0] = tmps + (s - t); - rx->endp[0] = tmps + SvCUR(pm->op_pmshort); + rx->endp[0] = tmps + SvCUR(rx->check_substr); } LEAVE_SCOPE(oldsave); RETPUSHYES; nope: - if (pm->op_pmshort) - ++BmUSEFUL(pm->op_pmshort); + if (rx->check_substr) + ++BmUSEFUL(rx->check_substr); ret_no: if (global && !(pm->op_pmflags & PMf_CONTINUE)) { @@ -1403,6 +1424,8 @@ PP(pp_subst) STRLEN len; int force_on_match = 0; I32 oldsave = savestack_ix; + I32 update_minmatch = 1; + SV *screamer; /* known replacement string? */ dstr = (pm->op_pmflags & PMf_CONST) ? POPs : Nullsv; @@ -1432,41 +1455,52 @@ PP(pp_subst) pm = curpm; rx = pm->op_pmregexp; } - safebase = (!rx->nparens && !sawampersand); + screamer = ( (SvSCREAM(TARG) && rx->check_substr + && SvTYPE(rx->check_substr) == SVt_PVBM + && SvVALID(rx->check_substr)) + ? TARG : Nullsv); + safebase = (!rx->nparens && !sawampersand) ? 0 : REXEC_COPY_STR; if (pm->op_pmflags & (PMf_MULTILINE|PMf_SINGLELINE)) { SAVEINT(multiline); multiline = pm->op_pmflags & PMf_MULTILINE; } orig = m = s; - if (pm->op_pmshort) { - if (pm->op_pmflags & PMf_SCANFIRST) { - if (SvSCREAM(TARG)) { - if (screamfirst[BmRARE(pm->op_pmshort)] < 0) + if (rx->check_substr) { + if (!(rx->reganch & ROPT_NOSCAN)) { /* It floats. */ + if (screamer) { + I32 p = -1; + + if (screamfirst[BmRARE(rx->check_substr)] < 0) goto nope; - else if (!(s = screaminstr(TARG, pm->op_pmshort))) + else if (!(s = screaminstr(TARG, rx->check_substr, rx->check_offset_min, 0, &p, 0))) goto nope; } - else if (!(s = fbm_instr((unsigned char*)s, (unsigned char*)strend, - pm->op_pmshort))) + else if (!(s = fbm_instr((unsigned char*)s + rx->check_offset_min, + (unsigned char*)strend, + rx->check_substr))) goto nope; - if (s && rx->regback >= 0) { - ++BmUSEFUL(pm->op_pmshort); - s -= rx->regback; - if (s < m) - s = m; + if (s && rx->check_offset_max < s - m) { + ++BmUSEFUL(rx->check_substr); + s -= rx->check_offset_max; } else s = m; } - else if (!multiline) { - if (*SvPVX(pm->op_pmshort) != *s - || (pm->op_pmslen > 1 - && memNE(SvPVX(pm->op_pmshort), s, pm->op_pmslen))) + /* Now checkstring is fixed, i.e. at fixed offset from the + beginning of match, and the match is anchored at s. */ + else if (!multiline) { /* Anchored at beginning of string. */ + I32 slen; + if (*SvPVX(rx->check_substr) != s[rx->check_offset_min] + || ((slen = SvCUR(rx->check_substr)) > 1 + && memNE(SvPVX(rx->check_substr), + s + rx->check_offset_min, slen))) goto nope; } - if (!rx->naughty && --BmUSEFUL(pm->op_pmshort) < 0) { - SvREFCNT_dec(pm->op_pmshort); - pm->op_pmshort = Nullsv; /* opt is being useless */ + if (!rx->naughty && --BmUSEFUL(rx->check_substr) < 0 + && rx->check_substr == rx->float_substr) { + SvREFCNT_dec(rx->check_substr); + rx->check_substr = Nullsv; /* opt is being useless */ + rx->float_substr = Nullsv; } } @@ -1477,9 +1511,9 @@ PP(pp_subst) c = dstr ? SvPV(dstr, clen) : Nullch; /* can do inplace substitution? */ - if (c && clen <= rx->minlen && safebase) { - if (! pregexec(rx, s, strend, orig, 0, - SvSCREAM(TARG) ? TARG : Nullsv, safebase)) { + if (c && clen <= rx->minlen && (once || !(safebase & REXEC_COPY_STR)) + && !(rx->reganch & ROPT_LOOKBEHIND_SEEN)) { + if (!regexec_flags(rx, s, strend, orig, 0, screamer, NULL, safebase)) { PUSHs(&sv_no); LEAVE_SCOPE(oldsave); RETURN; @@ -1493,9 +1527,14 @@ PP(pp_subst) curpm = pm; SvSCREAM_off(TARG); /* disable possible screamer */ if (once) { - rxtainted = rx->exec_tainted; - m = rx->startp[0]; - d = rx->endp[0]; + rxtainted = RX_MATCH_TAINTED(rx); + if (rx->subbase) { + m = orig + (rx->startp[0] - rx->subbase); + d = orig + (rx->endp[0] - rx->subbase); + } else { + m = rx->startp[0]; + d = rx->endp[0]; + } s = orig; if (m - s > strend - d) { /* faster to shorten from end */ if (clen) { @@ -1537,7 +1576,7 @@ PP(pp_subst) do { if (iters++ > maxiters) DIE("Substitution loop"); - rxtainted |= rx->exec_tainted; + rxtainted |= RX_MATCH_TAINTED(rx); m = rx->startp[0]; /*SUPPRESS 560*/ if (i = m - s) { @@ -1550,8 +1589,8 @@ PP(pp_subst) d += clen; } s = rx->endp[0]; - } while (pregexec(rx, s, strend, orig, s == m, - Nullsv, TRUE)); /* don't match same null twice */ + } while (regexec_flags(rx, s, strend, orig, s == m, + Nullsv, NULL, 0)); /* don't match same null twice */ if (s != d) { i = strend - s; SvCUR_set(TARG, d - SvPVX(TARG) + i); @@ -1567,14 +1606,13 @@ PP(pp_subst) RETURN; } - if (pregexec(rx, s, strend, orig, 0, - SvSCREAM(TARG) ? TARG : Nullsv, safebase)) { + if (regexec_flags(rx, s, strend, orig, 0, screamer, NULL, safebase)) { if (force_on_match) { force_on_match = 0; s = SvPV_force(TARG, len); goto force_it; } - rxtainted = rx->exec_tainted; + rxtainted = RX_MATCH_TAINTED(rx); dstr = NEWSV(25, sv_len(TARG)); sv_setpvn(dstr, m, s-m); curpm = pm; @@ -1586,7 +1624,7 @@ PP(pp_subst) do { if (iters++ > maxiters) DIE("Substitution loop"); - rxtainted |= rx->exec_tainted; + rxtainted |= RX_MATCH_TAINTED(rx); if (rx->subbase && rx->subbase != orig) { m = s; s = orig; @@ -1601,7 +1639,7 @@ PP(pp_subst) sv_catpvn(dstr, c, clen); if (once) break; - } while (pregexec(rx, s, strend, orig, s == m, Nullsv, safebase)); + } while (regexec_flags(rx, s, strend, orig, s == m, Nullsv, NULL, safebase)); sv_catpvn(dstr, s, strend - s); TAINT_IF(rxtainted); @@ -1624,7 +1662,7 @@ PP(pp_subst) goto ret_no; nope: - ++BmUSEFUL(pm->op_pmshort); + ++BmUSEFUL(rx->check_substr); ret_no: PUSHs(&sv_no); diff --git a/proto.h b/proto.h index 2dfe86d..63545ff 100644 --- a/proto.h +++ b/proto.h @@ -163,7 +163,6 @@ void gv_init _((GV* gv, HV* stash, char* name, STRLEN len, int multi)); HV* gv_stashpv _((char* name, I32 create)); HV* gv_stashpvn _((char* name, U32 namelen, I32 create)); HV* gv_stashsv _((SV* sv, I32 create)); -void hoistmust _((PMOP* pm)); void hv_clear _((HV* tb)); void hv_delayfree_ent _((HV* hv, HE* entry)); SV* hv_delete _((HV* tb, char* key, U32 klen, I32 flags)); @@ -208,6 +207,7 @@ int magic_clearpack _((SV* sv, MAGIC* mg)); int magic_clearsig _((SV* sv, MAGIC* mg)); int magic_existspack _((SV* sv, MAGIC* mg)); int magic_freedefelem _((SV* sv, MAGIC* mg)); +int magic_freeregexp _((SV* sv, MAGIC* mg)); int magic_get _((SV* sv, MAGIC* mg)); int magic_getarylen _((SV* sv, MAGIC* mg)); int magic_getdefelem _((SV* sv, MAGIC* mg)); @@ -397,10 +397,11 @@ regexp* pregcomp _((char* exp, char* xend, PMOP* pm)); OP* ref _((OP* o, I32 type)); OP* refkids _((OP* o, I32 type)); void regdump _((regexp* r)); -I32 pregexec _((regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, I32 safebase)); -void pregfree _((struct regexp* r)); -char* regnext _((char* p)); -void regprop _((SV* sv, char* o)); +I32 pregexec _((regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, U32 nosave)); +I32 regexec_flags _((regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, void* data, U32 flags)); + void pregfree _((struct regexp* r)); +regnode*regnext _((regnode* p)); +void regprop _((SV* sv, regnode* o)); void repeatcpy _((char* to, char* from, I32 len, I32 count)); char* rninstr _((char* big, char* bigend, char* little, char* lend)); Sighandler_t rsignal _((int, Sighandler_t)); @@ -451,7 +452,7 @@ UV scan_hex _((char* start, I32 len, I32* retlen)); char* scan_num _((char* s)); UV scan_oct _((char* start, I32 len, I32* retlen)); OP* scope _((OP* o)); -char* screaminstr _((SV* bigsv, SV* littlesv)); +char* screaminstr _((SV* bigsv, SV* littlesv, I32 start_shift, I32 end_shift, I32 *state, I32 last)); #ifndef VMS I32 setenv_getix _((char* nam)); #endif @@ -491,6 +492,7 @@ I32 sv_cmp_locale _((SV* sv1, SV* sv2)); #ifdef USE_LOCALE_COLLATE char* sv_collxfrm _((SV* sv, STRLEN* nxp)); #endif +OP* sv_compile_2op _((SV* sv, OP** startp, char* code, AV** avp)); void sv_dec _((SV* sv)); void sv_dump _((SV* sv)); bool sv_derived_from _((SV* sv, char* name)); diff --git a/regcomp.c b/regcomp.c index c8118f2..6489b78 100644 --- a/regcomp.c +++ b/regcomp.c @@ -56,12 +56,28 @@ #include "EXTERN.h" #include "perl.h" #include "INTERN.h" + +#define REG_COMP_C #include "regcomp.h" #ifdef USE_THREADS #undef op #endif /* USE_THREADS */ +static regnode regdummy; +static char * regparse; /* Input-scan pointer. */ +static char * regxend; /* End of input for compile */ +static regnode * regcode; /* Code-emit pointer; ®dummy = don't. */ +static I32 regnaughty; /* How bad is this pattern? */ +static I32 regsawback; /* Did we see \1, ...? */ + +/* This guys appear both in regcomp.c and regexec.c, but there is no + other reason to have them global. */ +static char * regprecomp; /* uncompiled string. */ +static I32 regnpar; /* () count. */ +static I32 regsize; /* Code size. */ +static U16 regflags; /* are we folding, multilining? */ + #ifdef MSDOS # if defined(BUGGY_MSC6) /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ @@ -100,22 +116,580 @@ * Forward declarations for pregcomp()'s friends. */ -static char *reg _((I32, I32 *)); -static char *reganode _((char, unsigned short)); -static char *regatom _((I32 *)); -static char *regbranch _((I32 *)); -static void regc _((char)); -static char *regclass _((void)); +static regnode *reg _((I32, I32 *)); +static regnode *reganode _((U8, U32)); +static regnode *regatom _((I32 *)); +static regnode *regbranch _((I32 *, I32)); +static void regc _((U8, char *)); +static regnode *regclass _((void)); STATIC I32 regcurly _((char *)); -static char *regnode _((char)); -static char *regpiece _((I32 *)); -static void reginsert _((char, char *)); -static void regoptail _((char *, char *)); +static regnode *reg_node _((U8)); +static regnode *regpiece _((I32 *)); +static void reginsert _((U8, regnode *)); +static void regoptail _((regnode *, regnode *)); static void regset _((char *, I32)); -static void regtail _((char *, char *)); +static void regtail _((regnode *, regnode *)); static char* regwhite _((char *, char *)); static char* nextchar _((void)); +static U32 regseen; +static I32 seen_zerolen; +static regexp *rx; +static I32 extralen; + +#ifdef DEBUGGING +static int colorset; +char *colors[4]; +#endif + +/* Length of a variant. */ + +typedef struct { + I32 len_min; + I32 len_delta; + I32 pos_min; + I32 pos_delta; + SV *last_found; + I32 last_end; /* min value, <0 unless valid. */ + I32 last_start_min; + I32 last_start_max; + SV **longest; /* Either &l_fixed, or &l_float. */ + SV *longest_fixed; + I32 offset_fixed; + SV *longest_float; + I32 offset_float_min; + I32 offset_float_max; + I32 flags; +} scan_data_t; + +static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + +#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) +#define SF_BEFORE_SEOL 0x1 +#define SF_BEFORE_MEOL 0x2 +#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) +#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) + +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) + +#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) +#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) + +#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) +#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ +#define SF_IS_INF 0x40 +#define SF_HAS_PAR 0x80 +#define SF_IN_PAR 0x100 +#define SF_HAS_EVAL 0x200 + +static void +scan_commit(scan_data_t *data) +{ + STRLEN l = SvCUR(data->last_found); + STRLEN old_l = SvCUR(*data->longest); + + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { + sv_setsv(*data->longest, data->last_found); + if (*data->longest == data->longest_fixed) { + data->offset_fixed = l ? data->last_start_min : data->pos_min; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); + else + data->flags &= ~SF_FIX_BEFORE_EOL; + } else { + data->offset_float_min = l ? data->last_start_min : data->pos_min; + data->offset_float_max = (l + ? data->last_start_max + : data->pos_min + data->pos_delta); + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); + else + data->flags &= ~SF_FL_BEFORE_EOL; + } + } + SvCUR_set(data->last_found, 0); + data->last_end = -1; + data->flags &= ~SF_BEFORE_EOL; +} + +#define SCF_DO_SUBSTR 1 + +/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set + to the position after last scanned or to NULL. */ + +static I32 +study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags) + /* scanp: Start here (read-write). */ + /* deltap: Write maxlen-minlen here. */ + /* last: Stop before this one. */ +{ + I32 min = 0, pars = 0, code; + regnode *scan = *scanp, *next; + I32 delta = 0; + int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); + I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; + scan_data_t data_fake; + + while (scan && OP(scan) != END && scan < last) { + /* Peephole optimizer: */ + + if (regkind[(U8)OP(scan)] == EXACT) { + regnode *n = regnext(scan); + U32 stringok = 1; +#ifdef DEBUGGING + regnode *stop = scan; +#endif + + next = scan + (*OPERAND(scan) + 2 - 1)/sizeof(regnode) + 2; + /* Skip NOTHING, merge EXACT*. */ + while (n && + ( regkind[(U8)OP(n)] == NOTHING || + (stringok && (OP(n) == OP(scan)))) + && NEXT_OFF(n) + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { + if (OP(n) == TAIL || n > next) + stringok = 0; + if (regkind[(U8)OP(n)] == NOTHING) { + NEXT_OFF(scan) += NEXT_OFF(n); + next = n + NODE_STEP_REGNODE; +#ifdef DEBUGGING + if (stringok) + stop = n; +#endif + n = regnext(n); + } else { + int oldl = *OPERAND(scan); + regnode *nnext = regnext(n); + + if (oldl + *OPERAND(n) > U8_MAX) + break; + NEXT_OFF(scan) += NEXT_OFF(n); + *OPERAND(scan) += *OPERAND(n); + next = n + (*OPERAND(n) + 2 - 1)/sizeof(regnode) + 2; + /* Now we can overwrite *n : */ + Move(OPERAND(n) + 1, OPERAND(scan) + oldl + 1, + *OPERAND(n) + 1, char); +#ifdef DEBUGGING + if (stringok) + stop = next - 1; +#endif + n = nnext; + } + } +#ifdef DEBUGGING + /* Allow dumping */ + n = scan + (*OPERAND(scan) + 2 - 1)/sizeof(regnode) + 2; + while (n <= stop) { + if (regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) { + OP(n) = OPTIMIZED; + NEXT_OFF(n) = 0; + } + n++; + } +#endif + + } + if (OP(scan) != CURLYX) { + int max = (reg_off_by_arg[OP(scan)] ? I32_MAX : U16_MAX); + int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); + int noff; + regnode *n = scan; + + /* Skip NOTHING and LONGJMP. */ + while ((n = regnext(n)) + && ((regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n)))) + && off + noff < max) + off += noff; + if (reg_off_by_arg[OP(scan)]) + ARG(scan) = off; + else + NEXT_OFF(scan) = off; + } + if (OP(scan) == BRANCH || OP(scan) == BRANCHJ + || OP(scan) == IFTHEN || OP(scan) == SUSPEND) { + next = regnext(scan); + code = OP(scan); + + if (OP(next) == code || code == IFTHEN || code == SUSPEND) { + I32 max1 = 0, min1 = I32_MAX, num = 0; + + if (flags & SCF_DO_SUBSTR) + scan_commit(data); + while (OP(scan) == code) { + I32 deltanext, minnext; + + num++; + data_fake.flags = 0; + next = regnext(scan); + scan = NEXTOPER(scan); + if (code != BRANCH) + scan = NEXTOPER(scan); + /* We suppose the run is continuous, last=next...*/ + minnext = study_chunk(&scan, &deltanext, next, + &data_fake, 0); + if (min1 > minnext) + min1 = minnext; + if (max1 < minnext + deltanext) + max1 = minnext + deltanext; + if (deltanext == I32_MAX) + is_inf = 1; + scan = next; + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + if (code == SUSPEND) + break; + } + if (code == IFTHEN && num < 2) /* Empty ELSE branch */ + min1 = 0; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + delta += max1 - min1; + } else if (code == BRANCHJ) /* single branch is optimized. */ + scan = NEXTOPER(NEXTOPER(scan)); + else /* single branch is optimized. */ + scan = NEXTOPER(scan); + continue; + } else if (OP(scan) == EXACT) { + min += *OPERAND(scan); + if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ + I32 l = *OPERAND(scan); + + /* The code below prefers earlier match for fixed + offset, later match for variable offset. */ + if (data->last_end == -1) { /* Update the start info. */ + data->last_start_min = data->pos_min; + data->last_start_max = is_inf + ? I32_MAX : data->pos_min + data->pos_delta; + } + sv_catpvn(data->last_found, OPERAND(scan)+1, l); + data->last_end = data->pos_min + l; + data->pos_min += l; /* As in the first entry. */ + data->flags &= ~SF_BEFORE_EOL; + } + } else if (regkind[(U8)OP(scan)] == EXACT) { + if (flags & SCF_DO_SUBSTR) + scan_commit(data); + min += *OPERAND(scan); + if (data && (flags & SCF_DO_SUBSTR)) + data->pos_min += *OPERAND(scan); + } else if (strchr(varies,OP(scan))) { + I32 mincount, maxcount, minnext, deltanext, pos_before, fl; + regnode *oscan = scan; + + switch (regkind[(U8)OP(scan)]) { + case WHILEM: + scan = NEXTOPER(scan); + goto finish; + case PLUS: + if (flags & SCF_DO_SUBSTR) { + next = NEXTOPER(scan); + if (OP(next) == EXACT) { + mincount = 1; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + } + if (flags & SCF_DO_SUBSTR) + data->pos_min++; + min++; + /* Fall through. */ + case STAR: + is_inf = 1; + scan = regnext(scan); + if (flags & SCF_DO_SUBSTR) { + scan_commit(data); + data->longest = &(data->longest_float); + } + goto optimize_curly_tail; + case CURLY: + mincount = ARG1(scan); + maxcount = ARG2(scan); + next = regnext(scan); + scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + do_curly: + if (flags & SCF_DO_SUBSTR) { + if (mincount == 0) scan_commit(data); + pos_before = data->pos_min; + } + if (data) { + fl = data->flags; + data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); + if (is_inf) + data->flags |= SF_IS_INF; + } + /* This will finish on WHILEM, setting scan, or on NULL: */ + minnext = study_chunk(&scan, &deltanext, last, data, + mincount == 0 + ? (flags & ~SCF_DO_SUBSTR) : flags); + if (!scan) /* It was not CURLYX, but CURLY. */ + scan = next; + if (dowarn && (minnext + deltanext == 0) + && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))) + warn("Strange *+?{} on zero-length expression"); + min += minnext * mincount; + is_inf |= (maxcount == REG_INFTY && (minnext + deltanext) > 0 + || deltanext == I32_MAX); + delta += (minnext + deltanext) * maxcount - minnext * mincount; + + /* Try powerful optimization CURLYX => CURLYN. */ +#ifdef REGALIGN_STRUCT + if ( OP(oscan) == CURLYX && data + && data->flags & SF_IN_PAR + && !(data->flags & SF_HAS_EVAL) + && !deltanext && minnext == 1 ) { + /* Try to optimize to CURLYN. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; + regnode *nxt1 = nxt, *nxt2; + + /* Skip open. */ + nxt = regnext(nxt); + if (!strchr(simple,OP(nxt)) + && !(regkind[(U8)OP(nxt)] == EXACT + && *OPERAND(nxt) == 1)) + goto nogo; + nxt2 = nxt; + nxt = regnext(nxt); + if (OP(nxt) != CLOSE) + goto nogo; + /* Now we know that nxt2 is the only contents: */ + oscan->flags = ARG(nxt); + OP(oscan) = CURLYN; + OP(nxt1) = NOTHING; /* was OPEN. */ +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */ + NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ +#endif + } +#endif + nogo: + + /* Try optimization CURLYX => CURLYM. */ + if ( OP(oscan) == CURLYX && data +#ifdef REGALIGN_STRUCT + && !(data->flags & SF_HAS_PAR) +#else + && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) +#endif + && !(data->flags & SF_HAS_EVAL) + && !deltanext ) { + /* XXXX How to optimize if data == 0? */ + /* Optimize to a simpler form. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ + regnode *nxt2; + + OP(oscan) = CURLYM; + while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ + && (OP(nxt2) != WHILEM)) + nxt = nxt2; + OP(nxt2) = SUCCEED; /* Whas WHILEM */ +#ifdef REGALIGN_STRUCT + /* Need to optimize away parenths. */ + if (data->flags & SF_IN_PAR) { + /* Set the parenth number. */ + regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ + + if (OP(nxt) != CLOSE) + FAIL("panic opt close"); + oscan->flags = ARG(nxt); + OP(nxt1) = OPTIMIZED; /* was OPEN. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */ + NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */ +#endif +#if 0 + while ( nxt1 && (OP(nxt1) != WHILEM)) { + regnode *nnxt = regnext(nxt1); + + if (nnxt == nxt) { + if (reg_off_by_arg[OP(nxt1)]) + ARG_SET(nxt1, nxt2 - nxt1); + else if (nxt2 - nxt1 < U16_MAX) + NEXT_OFF(nxt1) = nxt2 - nxt1; + else + OP(nxt) = NOTHING; /* Cannot beautify */ + } + nxt1 = nnxt; + } +#endif + /* Optimize again: */ + study_chunk(&nxt1, &deltanext, nxt, NULL, 0); + } else + oscan->flags = 0; +#endif + } + if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (flags & SCF_DO_SUBSTR) { + SV *last_str = Nullsv; + int counted = mincount != 0; + + if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ + I32 b = pos_before >= data->last_start_min + ? pos_before : data->last_start_min; + STRLEN l; + char *s = SvPV(data->last_found, l); + + l -= b - data->last_start_min; + /* Get the added string: */ + last_str = newSVpv(s + b - data->last_start_min, l); + if (deltanext == 0 && pos_before == b) { + /* What was added is a constant string */ + if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); + repeatcpy(SvPVX(last_str) + l, + SvPVX(last_str), l, mincount - 1); + SvCUR(last_str) *= mincount; + /* Add additional parts. */ + SvCUR_set(data->last_found, + SvCUR(data->last_found) - l); + sv_catsv(data->last_found, last_str); + data->last_end += l * (mincount - 1); + } + } + } + /* It is counted once already... */ + data->pos_min += minnext * (mincount - counted); + data->pos_delta += - counted * deltanext + + (minnext + deltanext) * maxcount - minnext * mincount; + if (mincount != maxcount) { + scan_commit(data); + if (mincount && last_str) { + sv_setsv(data->last_found, last_str); + data->last_end = data->pos_min; + data->last_start_min = + data->pos_min - SvCUR(last_str); + data->last_start_max = is_inf + ? I32_MAX + : data->pos_min + data->pos_delta + - SvCUR(last_str); + } + data->longest = &(data->longest_float); + } + } + if (fl & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + optimize_curly_tail: +#ifdef REGALIGN + if (OP(oscan) != CURLYX) { + while (regkind[(U8)OP(next = regnext(oscan))] == NOTHING + && NEXT_OFF(next)) + NEXT_OFF(oscan) += NEXT_OFF(next); + } +#endif + continue; + default: /* REF only? */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(data); + data->longest = &(data->longest_float); + } + is_inf = 1; + break; + } + } else if (strchr(simple,OP(scan))) { + if (flags & SCF_DO_SUBSTR) { + scan_commit(data); + data->pos_min++; + } + min++; + } else if (regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { + data->flags |= (OP(scan) == MEOL + ? SF_BEFORE_MEOL + : SF_BEFORE_SEOL); + } else if (regkind[(U8)OP(scan)] == BRANCHJ + && (scan->flags || data) + && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { + I32 deltanext, minnext; + regnode *nscan; + + data_fake.flags = 0; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + minnext = study_chunk(&nscan, &deltanext, last, &data_fake, 0); + if (scan->flags) { + if (deltanext) { + FAIL("variable length lookbehind not implemented"); + } else if (minnext > U8_MAX) { + FAIL2("lookbehind longer than %d not implemented", U8_MAX); + } + scan->flags = minnext; + } + if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + } else if (OP(scan) == OPEN) { + pars++; + } else if (OP(scan) == CLOSE && ARG(scan) == is_par) { +#ifdef REGALIGN_STRUCT + next = regnext(scan); + + if ( next && (OP(next) != WHILEM) && next < last) +#endif + is_par = 0; /* Disable optimization */ + } else if (OP(scan) == EVAL) { + if (data) + data->flags |= SF_HAS_EVAL; + } + /* Else: zero-length, ignore. */ + scan = regnext(scan); + } + + finish: + *scanp = scan; + *deltap = is_inf ? I32_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) + data->pos_delta = I32_MAX - data->pos_min; + if (is_par > U8_MAX) + is_par = 0; + if (is_par && pars==1 && data) { + data->flags |= SF_IN_PAR; + data->flags &= ~SF_HAS_PAR; + } else if (pars && data) { + data->flags |= SF_HAS_PAR; + data->flags &= ~SF_IN_PAR; + } + return min; +} + +static I32 +add_data(I32 n, char *s) +{ + if (rx->data) { + Renewc(rx->data, + sizeof(*rx->data) + sizeof(void*) * (rx->data->count + n - 1), + char, struct reg_data); + Renew(rx->data->what, rx->data->count + n, U8); + rx->data->count += n; + } else { + Newc(1207, rx->data, sizeof(*rx->data) + sizeof(void*) * (n - 1), + char, struct reg_data); + New(1208, rx->data->what, n, U8); + rx->data->count = n; + } + Copy(s, rx->data->what + rx->data->count - n, n, U8); + return rx->data->count - n; +} + /* - pregcomp - compile a regular expression into internal code * @@ -135,33 +709,29 @@ regexp * pregcomp(char *exp, char *xend, PMOP *pm) { register regexp *r; - register char *scan; - register SV *longish; - SV *longest; - register I32 len; - register char *first; + regnode *scan; + SV **longest; + SV *longest_fixed; + SV *longest_float; + regnode *first; I32 flags; - I32 backish; - I32 backest; - I32 curback; I32 minlen = 0; I32 sawplus = 0; I32 sawopen = 0; -#define MAX_REPEAT_DEPTH 12 - struct { - char *opcode; - I32 count; - } repeat_stack[MAX_REPEAT_DEPTH]; - I32 repeat_depth = 0; - I32 repeat_count = 1; /* We start unmultiplied. */ if (exp == NULL) - croak("NULL regexp argument"); + FAIL("NULL regexp argument"); regprecomp = savepvn(exp, xend - exp); + DEBUG_r(PerlIO_printf(Perl_debug_log, "compiling RE `%*s'\n", + xend - exp, regprecomp)); regflags = pm->op_pmflags; regsawback = 0; + regseen = 0; + seen_zerolen = *exp == '^' ? -1 : 0; + extralen = 0; + /* First pass: determine size, legality. */ regparse = exp; regxend = xend; @@ -169,24 +739,61 @@ pregcomp(char *exp, char *xend, PMOP *pm) regnpar = 1; regsize = 0L; regcode = ®dummy; - regc((char)MAGIC); + regc((U8)MAGIC, (char*)regcode); if (reg(0, &flags) == NULL) { Safefree(regprecomp); regprecomp = Nullch; return(NULL); } + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %d ", regsize)); + + DEBUG_r( + if (!colorset) { + int i = 0; + char *s = getenv("TERMCAP_COLORS"); + + colorset = 1; + if (s) { + colors[0] = s = savepv(s); + while (++i < 4) { + s = strchr(s, '\t'); + if (!s) + FAIL("Not enough TABs in TERMCAP_COLORS"); + *s = '\0'; + colors[i] = ++s; + } + } else { + while (i < 4) + colors[i++] = ""; + } + /* Reset colors: */ + PerlIO_printf(Perl_debug_log, "%s%s%s%s", + colors[0],colors[1],colors[2],colors[3]); + } + ); - /* Small enough for pointer-storage convention? */ - if (regsize >= 32767L) /* Probably could be 65535L. */ + /* Small enough for pointer-storage convention? + If extralen==0, this means that we will not need long jumps. */ +#ifndef REGALIGN_STRUCT + if (regsize >= 0x10000L && extralen) FAIL("regexp too big"); +#else + if (regsize >= 0x10000L && extralen) + regsize += extralen; + else + extralen = 0; +#endif /* Allocate space and initialize. */ - Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp); + Newc(1001, r, sizeof(regexp) + (unsigned)regsize * sizeof(regnode), + char, regexp); if (r == NULL) FAIL("regexp out of space"); + r->refcnt = 1; r->prelen = xend - exp; r->precomp = regprecomp; r->subbeg = r->subbase = NULL; + rx = r; /* Second pass: emit code. */ regparse = exp; @@ -194,23 +801,24 @@ pregcomp(char *exp, char *xend, PMOP *pm) regnaughty = 0; regnpar = 1; regcode = r->program; - regc((char)MAGIC); + regc((U8)MAGIC, (char*) regcode++); + r->data = 0; if (reg(0, &flags) == NULL) return(NULL); /* Dig out information for optimizations. */ pm->op_pmflags = regflags; - r->regstart = Nullsv; /* Worst-case defaults. */ r->reganch = 0; - r->regmust = Nullsv; - r->regback = -1; - r->regstclass = Nullch; + r->regstclass = NULL; r->naughty = regnaughty >= 10; /* Probably an expensive pattern. */ - scan = r->program+1; /* First BRANCH. */ - if (OP(regnext(scan)) == END) {/* Only one top-level choice. */ - scan = NEXTOPER(scan); + scan = r->program + 1; /* First BRANCH. */ + if (OP(scan) != BRANCH) { /* Only one top-level choice. */ + scan_data_t data; + I32 fake; + StructCopy(&zero_scan_data, &data, scan_data_t); first = scan; + /* Skip introductions and multiplicators >= 1. */ while ((OP(first) == OPEN && (sawopen = 1)) || (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || (OP(first) == PLUS) || @@ -225,19 +833,14 @@ pregcomp(char *exp, char *xend, PMOP *pm) /* Starting-point info. */ again: - if (OP(first) == EXACT) { - r->regstart = newSVpv(OPERAND(first)+1,*OPERAND(first)); - if (SvCUR(r->regstart) > !sawstudy) - fbm_compile(r->regstart); - (void)SvUPGRADE(r->regstart, SVt_PVBM); - } + if (OP(first) == EXACT); /* Empty, get anchored substr later. */ else if (strchr(simple+2,OP(first))) r->regstclass = first; else if (regkind[(U8)OP(first)] == BOUND || regkind[(U8)OP(first)] == NBOUND) r->regstclass = first; else if (regkind[(U8)OP(first)] == BOL) { - r->reganch |= ROPT_ANCH_BOL; + r->reganch |= (OP(first) == MBOL ? ROPT_ANCH_MBOL: ROPT_ANCH_BOL); first = NEXTOPER(first); goto again; } @@ -258,8 +861,9 @@ pregcomp(char *exp, char *xend, PMOP *pm) if (sawplus && (!sawopen || !regsawback)) r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */ - DEBUG_r(PerlIO_printf(Perl_debug_log, "first %d next %d offset %ld\n", - OP(first), OP(NEXTOPER(first)), (long)(first - scan))); + /* Scan is after the zeroth branch, first is atomic matcher. */ + DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %d\n", + first - scan + 1)); /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the @@ -268,155 +872,94 @@ pregcomp(char *exp, char *xend, PMOP *pm) * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. * [Now we resolve ties in favor of the earlier string if - * it happens that curback has been invalidated, since the + * it happens that c_offset_min has been invalidated, since the * earlier string may buy us something the later one won't.] */ - longish = newSVpv("",0); - longest = newSVpv("",0); - len = 0; minlen = 0; - curback = 0; - backish = 0; - backest = 0; - while (OP(scan) != END) { - if (OP(scan) == BRANCH) { - if (OP(regnext(scan)) == BRANCH) { - curback = -30000; - while (OP(scan) == BRANCH) - scan = regnext(scan); - } - else /* single branch is ok */ - scan = NEXTOPER(scan); - continue; - } - if (OP(scan) == UNLESSM) { - curback = -30000; - scan = regnext(scan); - continue; - } - if (OP(scan) == EXACT) { - char *t; - - first = scan; - while ((t = regnext(scan)) && OP(t) == CLOSE) - scan = t; - minlen += *OPERAND(first) * repeat_count; - if (curback - backish == len) { - sv_catpvn(longish, OPERAND(first)+1, - *OPERAND(first)); - len += *OPERAND(first); - curback += *OPERAND(first); - first = regnext(scan); - } - else if (*OPERAND(first) >= len + (curback >= 0)) { - len = *OPERAND(first); - sv_setpvn(longish, OPERAND(first)+1,len); - backish = curback; - curback += len; - first = regnext(scan); - } - else - curback += *OPERAND(first); - } - else if (strchr(varies,OP(scan))) { - int tcount; - char *next; - - if (repeat_depth < MAX_REPEAT_DEPTH - && ((OP(scan) == PLUS - && (tcount = 1) - && (next = NEXTOPER(scan))) - || (regkind[(U8)OP(scan)] == CURLY - && (tcount = ARG1(scan)) - && (next = NEXTOPER(scan)+4)))) - { - /* We treat (abc)+ as (abc)(abc)*. */ - - /* Mark the place to return back. */ - repeat_stack[repeat_depth].opcode = regnext(scan); - repeat_stack[repeat_depth].count = repeat_count; - repeat_depth++; - repeat_count *= tcount; - - /* Go deeper: */ - scan = next; - continue; - } - else { - curback = -30000; - len = 0; - if (SvCUR(longish) > SvCUR(longest)) { - sv_setsv(longest,longish); - backest = backish; - } - sv_setpvn(longish,"",0); - } - } - else if (strchr(simple,OP(scan))) { - curback++; - minlen += repeat_count; - len = 0; - if (SvCUR(longish) > SvCUR(longest)) { - sv_setsv(longest,longish); - backest = backish; - } - sv_setpvn(longish,"",0); - } - scan = regnext(scan); - if (!scan) { /* Go up PLUS or CURLY. */ - if (!repeat_depth--) - croak("panic: re scan"); - scan = repeat_stack[repeat_depth].opcode; - repeat_count = repeat_stack[repeat_depth].count; - /* Need to submit the longest string found: */ - curback = -30000; - len = 0; - if (SvCUR(longish) > SvCUR(longest)) { - sv_setsv(longest,longish); - backest = backish; - } - sv_setpvn(longish,"",0); - } - } - /* Prefer earlier on tie, unless we can tail match latter */ - - if (SvCUR(longish) + (first && regkind[(U8)OP(first)] == EOL) - > SvCUR(longest)) - { - sv_setsv(longest,longish); - backest = backish; + data.longest_fixed = newSVpv("",0); + data.longest_float = newSVpv("",0); + data.last_found = newSVpv("",0); + data.longest = &(data.longest_fixed); + first = scan; + + minlen = study_chunk(&first, &fake, scan + regsize, /* Up to end */ + &data, SCF_DO_SUBSTR); + if ( regnpar == 1 && data.longest == &(data.longest_fixed) + && data.last_start_min == 0 && data.last_end > 0 + && !seen_zerolen + && (!(regseen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) + r->reganch |= ROPT_CHECK_ALL; + scan_commit(&data); + SvREFCNT_dec(data.last_found); + + if (SvCUR(data.longest_float) + || (data.flags & SF_FL_BEFORE_EOL + && (!(data.flags & SF_FL_BEFORE_MEOL) + || (regflags & PMf_MULTILINE)))) { + if (SvCUR(data.longest_fixed) + && data.offset_fixed == data.offset_float_min) + goto remove; /* Like in (a)+. */ + + r->float_substr = data.longest_float; + r->float_min_offset = data.offset_float_min; + r->float_max_offset = data.offset_float_max; + fbm_compile(r->float_substr); + BmUSEFUL(r->float_substr) = 100; + if (data.flags & SF_FL_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FL_BEFORE_MEOL) + || (regflags & PMf_MULTILINE))) + SvTAIL_on(r->float_substr); + } else { + remove: + r->float_substr = Nullsv; + SvREFCNT_dec(data.longest_float); } - else - sv_setpvn(longish,"",0); - if (SvCUR(longest) - && (!r->regstart - || !fbm_instr((unsigned char*) SvPVX(r->regstart), - (unsigned char *) (SvPVX(r->regstart) - + SvCUR(r->regstart)), - longest))) - { - r->regmust = longest; - if (backest < 0) - backest = -1; - r->regback = backest; - if (SvCUR(longest) > !(sawstudy || - (first && regkind[(U8)OP(first)] == EOL))) - fbm_compile(r->regmust); - (void)SvUPGRADE(r->regmust, SVt_PVBM); - BmUSEFUL(r->regmust) = 100; - if (first && regkind[(U8)OP(first)] == EOL && SvCUR(longish)) - SvTAIL_on(r->regmust); + + if (SvCUR(data.longest_fixed) + || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (regflags & PMf_MULTILINE)))) { + r->anchored_substr = data.longest_fixed; + r->anchored_offset = data.offset_fixed; + fbm_compile(r->anchored_substr); + BmUSEFUL(r->anchored_substr) = 100; + if (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (regflags & PMf_MULTILINE))) + SvTAIL_on(r->anchored_substr); + } else { + r->anchored_substr = Nullsv; + SvREFCNT_dec(data.longest_fixed); } - else { - SvREFCNT_dec(longest); - longest = Nullsv; + + /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ + if (SvCUR(data.longest_fixed) > SvCUR(data.longest_float)) { + r->check_substr = r->anchored_substr; + r->check_offset_min = r->check_offset_max = r->anchored_offset; + if (r->reganch & ROPT_ANCH_SINGLE) + r->reganch |= ROPT_NOSCAN; + } else { + r->check_substr = r->float_substr; + r->check_offset_min = data.offset_float_min; + r->check_offset_max = data.offset_float_max; } - SvREFCNT_dec(longish); + } else { + /* Several toplevels. Best we can is to set minlen. */ + I32 fake; + + DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); + scan = r->program + 1; + minlen = study_chunk(&scan, &fake, scan + regsize, NULL, 0); + r->check_substr = r->anchored_substr = r->float_substr = Nullsv; } r->nparens = regnpar - 1; r->minlen = minlen; + if (regseen & REG_SEEN_GPOS) + r->reganch |= ROPT_GPOS_SEEN; + if (regseen & REG_SEEN_LOOKBEHIND) + r->reganch |= ROPT_LOOKBEHIND_SEEN; Newz(1002, r->startp, regnpar, char*); Newz(1002, r->endp, regnpar, char*); DEBUG_r(regdump(r)); @@ -432,16 +975,17 @@ pregcomp(char *exp, char *xend, PMOP *pm) * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ -static char * +static regnode * reg(I32 paren, I32 *flagp) - /* Parenthesized? */ - + /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ { - register char *ret; - register char *br; - register char *ender = 0; + register regnode *ret; /* Will be the head of the group. */ + register regnode *br; + register regnode *lastbr; + register regnode *ender = 0; register I32 parno = 0; - I32 flags; + I32 flags, oregflags = regflags, have_branch = 0, open = 0; + char c; *flagp = HASWIDTH; /* Tentatively. */ @@ -450,33 +994,138 @@ reg(I32 paren, I32 *flagp) if (*regparse == '?') { regparse++; paren = *regparse++; - ret = NULL; + ret = NULL; /* For look-ahead/behind. */ switch (paren) { - case ':': + case '<': +#ifndef REGALIGN_STRUCT + FAIL("lookbehind non-implemented without REGALIGN_STRUCT"); +#endif + regseen |= REG_SEEN_LOOKBEHIND; + if (*regparse == '!') + paren = ','; + if (*regparse != '=' && *regparse != '!') + goto unknown; + regparse++; case '=': case '!': + seen_zerolen++; + case ':': + case '>': break; case '$': case '@': - croak("Sequence (?%c...) not implemented", (int)paren); + FAIL2("Sequence (?%c...) not implemented", (int)paren); break; case '#': while (*regparse && *regparse != ')') regparse++; if (*regparse != ')') - croak("Sequence (?#... not terminated"); + FAIL("Sequence (?#... not terminated"); nextchar(); *flagp = TRYAGAIN; return NULL; + case '{': + { + dTHR; + I32 count = 1, n = 0; + char c; + char *s = regparse; + SV *sv; + OP_4tree *sop, *rop; + + seen_zerolen++; + while (count && (c = *regparse)) { + if (c == '\\' && regparse[1]) + regparse++; + else if (c == '{') + count++; + else if (c == '}') + count--; + regparse++; + } + if (*regparse != ')') + FAIL("Sequence (?{...}) not terminated or not {}-balanced"); + if (!SIZE_ONLY) { + AV *av; + + if (regparse - 1 - s) + sv = newSVpv(s, regparse - 1 - s); + else + sv = newSVpv("", 0); + + rop = sv_compile_2op(sv, &sop, "re", &av); + + n = add_data(3, "nso"); + rx->data->data[n] = (void*)rop; + rx->data->data[n+1] = (void*)av; + rx->data->data[n+2] = (void*)sop; + SvREFCNT_dec(sv); + } + + nextchar(); + if (tainted) + FAIL("Eval-group in insecure regular expression"); + return reganode(EVAL, n); + } + case '(': + { + if (regparse[0] == '?') { + if (regparse[1] == '=' || regparse[1] == '!' + || regparse[1] == '<' + || regparse[1] == '{') { /* Lookahead or eval. */ + I32 flag; + + ret = reg_node(LOGICAL); + regtail(ret, reg(1, &flag)); + goto insert_if; + } + } else if (regparse[0] >= '1' && regparse[0] <= '9' ) { + parno = atoi(regparse++); + + while (isDIGIT(*regparse)) + regparse++; + ret = reganode(GROUPP, parno); + if ((c = *nextchar()) != ')') + FAIL2("Switch (?(number%c not recognized", c); + insert_if: + regtail(ret, reganode(IFTHEN, 0)); + br = regbranch(&flags, 1); + if (br == NULL) + br = reganode(LONGJMP, 0); + else + regtail(br, reganode(LONGJMP, 0)); + c = *nextchar(); + if (c == '|') { + lastbr = reganode(IFTHEN, 0); /* Fake one for optimizer. */ + regbranch(&flags, 1); + regtail(ret, lastbr); + c = *nextchar(); + } else + lastbr = NULL; + if (c != ')') + FAIL("Switch (?(condition)... contains too many branches"); + ender = reg_node(TAIL); + regtail(br, ender); + if (lastbr) { + regtail(lastbr, ender); + regtail(NEXTOPER(NEXTOPER(lastbr)), ender); + } else + regtail(ret, ender); + return ret; + } else { + FAIL2("Unknown condition for (?(%.2s", regparse); + } + } case 0: - croak("Sequence (? incomplete"); + FAIL("Sequence (? incomplete"); break; default: --regparse; while (*regparse && strchr("iogcmsx", *regparse)) pmflag(®flags, *regparse++); + unknown: if (*regparse != ')') - croak("Sequence (?%c...) not recognized", *regparse); + FAIL2("Sequence (?%c...) not recognized", *regparse); nextchar(); *flagp = TRYAGAIN; return NULL; @@ -486,62 +1135,99 @@ reg(I32 paren, I32 *flagp) parno = regnpar; regnpar++; ret = reganode(OPEN, parno); + open = 1; } } else ret = NULL; /* Pick up the branches, linking them together. */ - br = regbranch(&flags); + br = regbranch(&flags, 1); if (br == NULL) return(NULL); - if (ret != NULL) - regtail(ret, br); /* OPEN -> first. */ - else + if (*regparse == '|') { + if (!SIZE_ONLY && extralen) { + reginsert(BRANCHJ, br); + } else + reginsert(BRANCH, br); + have_branch = 1; + if (SIZE_ONLY) + extralen += 1; /* For BRANCHJ-BRANCH. */ + } else if (paren == ':') { + *flagp |= flags&SIMPLE; + } + if (open) { /* Starts with OPEN. */ + regtail(ret, br); /* OPEN -> first. */ + } else if (paren != '?') /* Not Conditional */ ret = br; if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; + lastbr = br; while (*regparse == '|') { + if (!SIZE_ONLY && extralen) { + ender = reganode(LONGJMP,0); + regtail(NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + } + if (SIZE_ONLY) + extralen += 2; /* Account for LONGJMP. */ nextchar(); - br = regbranch(&flags); + br = regbranch(&flags, 0); if (br == NULL) return(NULL); - regtail(ret, br); /* BRANCH -> BRANCH. */ + regtail(lastbr, br); /* BRANCH -> BRANCH. */ + lastbr = br; if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; } - /* Make a closing node, and hook it on the end. */ - switch (paren) { - case ':': - ender = regnode(NOTHING); - break; - case 1: - ender = reganode(CLOSE, parno); - break; - case '=': - case '!': - ender = regnode(SUCCEED); - *flagp &= ~HASWIDTH; - break; - case 0: - ender = regnode(END); - break; - } - regtail(ret, ender); - - /* Hook the tails of the branches to the closing node. */ - for (br = ret; br != NULL; br = regnext(br)) - regoptail(br, ender); + if (have_branch || paren != ':') { + /* Make a closing node, and hook it on the end. */ + switch (paren) { + case ':': + ender = reg_node(TAIL); + break; + case 1: + ender = reganode(CLOSE, parno); + break; + case '<': + case '>': + case ',': + case '=': + case '!': + ender = reg_node(SUCCEED); + *flagp &= ~HASWIDTH; + break; + case 0: + ender = reg_node(END); + break; + } + regtail(lastbr, ender); - if (paren == '=') { - reginsert(IFMATCH,ret); - regtail(ret, regnode(NOTHING)); + if (have_branch) { + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br != NULL; br = regnext(br)) { + regoptail(br, ender); + } + } } - else if (paren == '!') { - reginsert(UNLESSM,ret); - regtail(ret, regnode(NOTHING)); + + { + char *p; + static char parens[] = "=!<,>"; + + if (paren && (p = strchr(parens, paren))) { + int node = ((p - parens) % 2) ? UNLESSM : IFMATCH; + int flag = (p - parens) > 1; + + if (paren == '>') + node = SUSPEND, flag = 0; + reginsert(node,ret); +#ifdef REGALIGN_STRUCT + ret->flags = flag; +#endif + regtail(ret, reg_node(TAIL)); + } } /* Check for proper termination. */ @@ -554,6 +1240,9 @@ reg(I32 paren, I32 *flagp) FAIL("junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } + if (paren != 0) { + regflags = oregflags; + } return(ret); } @@ -563,18 +1252,28 @@ reg(I32 paren, I32 *flagp) * * Implements the concatenation operator. */ -static char * -regbranch(I32 *flagp) +static regnode * +regbranch(I32 *flagp, I32 first) { - register char *ret; - register char *chain; - register char *latest; - I32 flags = 0; + register regnode *ret; + register regnode *chain = NULL; + register regnode *latest; + I32 flags = 0, c = 0; - *flagp = WORST; /* Tentatively. */ + if (first) + ret = NULL; + else { + if (!SIZE_ONLY && extralen) + ret = reganode(BRANCHJ,0); + else + ret = reg_node(BRANCH); + } + + if (!first && SIZE_ONLY) + extralen += 1; /* BRANCHJ */ + + *flagp = WORST; /* Tentatively. */ - ret = regnode(BRANCH); - chain = NULL; regparse--; nextchar(); while (regparse < regxend && *regparse != '|' && *regparse != ')') { @@ -584,18 +1283,26 @@ regbranch(I32 *flagp) if (flags & TRYAGAIN) continue; return(NULL); - } + } else if (ret == NULL) + ret = latest; *flagp |= flags&HASWIDTH; - if (chain == NULL) /* First piece. */ + if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { regnaughty++; regtail(chain, latest); } chain = latest; + c++; + } + if (chain == NULL) { /* Loop ran zero times. */ + chain = reg_node(NOTHING); + if (ret == NULL) + ret = chain; + } + if (c == 1) { + *flagp |= flags&SIMPLE; } - if (chain == NULL) /* Loop ran zero times. */ - (void) regnode(NOTHING); return(ret); } @@ -609,17 +1316,17 @@ regbranch(I32 *flagp) * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ -static char * +static regnode * regpiece(I32 *flagp) { - register char *ret; + register regnode *ret; register char op; register char *next; I32 flags; char *origparse = regparse; char *maxpos; I32 min; - I32 max = 32767; + I32 max = REG_INFTY; ret = regatom(&flags); if (ret == NULL) { @@ -629,14 +1336,6 @@ regpiece(I32 *flagp) } op = *regparse; - if (op == '(' && regparse[1] == '?' && regparse[2] == '#') { - while (op && op != ')') - op = *++regparse; - if (op) { - nextchar(); - op = *regparse; - } - } if (op == '{' && regcurly(regparse)) { next = regparse + 1; @@ -661,7 +1360,9 @@ regpiece(I32 *flagp) maxpos = regparse; max = atoi(maxpos); if (!max && *maxpos != '0') - max = 32767; /* meaning "infinity" */ + max = REG_INFTY; /* meaning "infinity" */ + else if (max >= REG_INFTY) + FAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); regparse = next; nextchar(); @@ -672,23 +1373,30 @@ regpiece(I32 *flagp) } else { regnaughty += 4 + regnaughty; /* compound interest */ - regtail(ret, regnode(WHILEM)); + regtail(ret, reg_node(WHILEM)); + if (!SIZE_ONLY && extralen) { + reginsert(LONGJMP,ret); + reginsert(NOTHING,ret); + NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ + } reginsert(CURLYX,ret); - regtail(ret, regnode(NOTHING)); + if (!SIZE_ONLY && extralen) + NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ + regtail(ret, reg_node(NOTHING)); + if (SIZE_ONLY) + extralen += 3; } +#ifdef REGALIGN_STRUCT + ret->flags = 0; +#endif if (min > 0) *flagp = (WORST|HASWIDTH); if (max && max < min) - croak("Can't do {n,m} with n > m"); - if (regcode != ®dummy) { -#ifdef REGALIGN - *(unsigned short *)(ret+3) = min; - *(unsigned short *)(ret+5) = max; -#else - ret[3] = min >> 8; ret[4] = min & 0377; - ret[5] = max >> 8; ret[6] = max & 0377; -#endif + FAIL("Can't do {n,m} with n > m"); + if (!SIZE_ONLY) { + ARG1_SET(ret, min); + ARG2_SET(ret, max); } goto nest_check; @@ -700,8 +1408,10 @@ regpiece(I32 *flagp) return(ret); } +#if 0 /* Now runtime fix should be reliable. */ if (!(flags&HASWIDTH) && op != '?') - FAIL("regexp *+ operand could be empty"); /* else may core dump */ + FAIL("regexp *+ operand could be empty"); +#endif nextchar(); @@ -709,6 +1419,9 @@ regpiece(I32 *flagp) if (op == '*' && (flags&SIMPLE)) { reginsert(STAR, ret); +#ifdef REGALIGN_STRUCT + ret->flags = 0; +#endif regnaughty += 4; } else if (op == '*') { @@ -716,6 +1429,9 @@ regpiece(I32 *flagp) goto do_curly; } else if (op == '+' && (flags&SIMPLE)) { reginsert(PLUS, ret); +#ifdef REGALIGN_STRUCT + ret->flags = 0; +#endif regnaughty += 3; } else if (op == '+') { @@ -726,7 +1442,7 @@ regpiece(I32 *flagp) goto do_curly; } nest_check: - if (dowarn && regcode != ®dummy && !(flags&HASWIDTH) && max > 10000) { + if (dowarn && !SIZE_ONLY && !(flags&HASWIDTH) && max > 10000) { warn("%.*s matches null string many times", regparse - origparse, origparse); } @@ -735,7 +1451,7 @@ regpiece(I32 *flagp) nextchar(); reginsert(MINMOD, ret); #ifdef REGALIGN - regtail(ret, ret + 4); + regtail(ret, ret + NODE_STEP_REGNODE); #else regtail(ret, ret + 3); #endif @@ -756,10 +1472,10 @@ regpiece(I32 *flagp) * * [Yes, it is worth fixing, some scripts can run twice the speed.] */ -static char * +static regnode * regatom(I32 *flagp) { - register char *ret = 0; + register regnode *ret = 0; I32 flags; *flagp = WORST; /* Tentatively. */ @@ -767,29 +1483,32 @@ regatom(I32 *flagp) tryagain: switch (*regparse) { case '^': + seen_zerolen++; nextchar(); if (regflags & PMf_MULTILINE) - ret = regnode(MBOL); + ret = reg_node(MBOL); else if (regflags & PMf_SINGLELINE) - ret = regnode(SBOL); + ret = reg_node(SBOL); else - ret = regnode(BOL); + ret = reg_node(BOL); break; case '$': + if (regparse[1]) + seen_zerolen++; nextchar(); if (regflags & PMf_MULTILINE) - ret = regnode(MEOL); + ret = reg_node(MEOL); else if (regflags & PMf_SINGLELINE) - ret = regnode(SEOL); + ret = reg_node(SEOL); else - ret = regnode(EOL); + ret = reg_node(EOL); break; case '.': nextchar(); if (regflags & PMf_SINGLELINE) - ret = regnode(SANY); + ret = reg_node(SANY); else - ret = regnode(ANY); + ret = reg_node(ANY); regnaughty++; *flagp |= HASWIDTH|SIMPLE; break; @@ -806,7 +1525,7 @@ tryagain: goto tryagain; return(NULL); } - *flagp |= flags&(HASWIDTH|SPSTART); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); break; case '|': case ')': @@ -814,7 +1533,7 @@ tryagain: *flagp |= TRYAGAIN; return NULL; } - croak("internal urp in regexp at /%s/", regparse); + FAIL2("internal urp in regexp at /%s/", regparse); /* Supposed to be caught earlier. */ break; case '{': @@ -831,57 +1550,61 @@ tryagain: case '\\': switch (*++regparse) { case 'A': - ret = regnode(SBOL); + seen_zerolen++; + ret = reg_node(SBOL); *flagp |= SIMPLE; nextchar(); break; case 'G': - ret = regnode(GPOS); + ret = reg_node(GPOS); + regseen |= REG_SEEN_GPOS; *flagp |= SIMPLE; nextchar(); break; case 'Z': - ret = regnode(SEOL); + ret = reg_node(SEOL); *flagp |= SIMPLE; nextchar(); break; case 'w': - ret = regnode((regflags & PMf_LOCALE) ? ALNUML : ALNUM); + ret = reg_node((regflags & PMf_LOCALE) ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'W': - ret = regnode((regflags & PMf_LOCALE) ? NALNUML : NALNUM); + ret = reg_node((regflags & PMf_LOCALE) ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'b': - ret = regnode((regflags & PMf_LOCALE) ? BOUNDL : BOUND); + seen_zerolen++; + ret = reg_node((regflags & PMf_LOCALE) ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(); break; case 'B': - ret = regnode((regflags & PMf_LOCALE) ? NBOUNDL : NBOUND); + seen_zerolen++; + ret = reg_node((regflags & PMf_LOCALE) ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(); break; case 's': - ret = regnode((regflags & PMf_LOCALE) ? SPACEL : SPACE); + ret = reg_node((regflags & PMf_LOCALE) ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'S': - ret = regnode((regflags & PMf_LOCALE) ? NSPACEL : NSPACE); + ret = reg_node((regflags & PMf_LOCALE) ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'd': - ret = regnode(DIGIT); + ret = reg_node(DIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'D': - ret = regnode(NDIGIT); + ret = reg_node(NDIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; @@ -934,18 +1657,19 @@ tryagain: default: { register I32 len; - register char ender; + register U8 ender; register char *p; - char *oldp; + char *oldp, *s; I32 numlen; regparse++; defchar: - ret = regnode((regflags & PMf_FOLD) + ret = reg_node((regflags & PMf_FOLD) ? ((regflags & PMf_LOCALE) ? EXACTFL : EXACTF) : EXACT); - regc(0); /* save spot for len */ + s = OPERAND(ret); + regc(0, s++); /* save spot for len */ for (len = 0, p = regparse - 1; len < 127 && p < regxend; len++) @@ -1043,11 +1767,11 @@ tryagain: p = oldp; else { len++; - regc(ender); + regc(ender, s++); } break; } - regc(ender); + regc(ender, s++); } loopdone: regparse = p - 1; @@ -1058,9 +1782,16 @@ tryagain: *flagp |= HASWIDTH; if (len == 1) *flagp |= SIMPLE; - if (regcode != ®dummy) + if (!SIZE_ONLY) *OPERAND(ret) = len; - regc('\0'); + regc('\0', s++); + if (SIZE_ONLY) { +#ifdef REGALIGN_STRUCT + regsize += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode); +#endif + } else { + regcode += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode); + } } break; } @@ -1088,38 +1819,41 @@ regwhite(char *p, char *e) static void regset(char *opnd, register I32 c) { - if (opnd == ®dummy) + if (SIZE_ONLY) return; c &= 0xFF; opnd[1 + (c >> 3)] |= (1 << (c & 7)); } -static char * +static regnode * regclass(void) { - register char *opnd; + register char *opnd, *s; register I32 Class; register I32 lastclass = 1234; register I32 range = 0; - register char *ret; + register regnode *ret; register I32 def; I32 numlen; - ret = regnode(ANYOF); - opnd = regcode; + s = opnd = OPERAND(regcode); + ret = reg_node(ANYOF); for (Class = 0; Class < 33; Class++) - regc(0); + regc(0, s++); if (*regparse == '^') { /* Complement of range. */ regnaughty++; regparse++; - if (opnd != ®dummy) + if (!SIZE_ONLY) *opnd |= ANYOF_INVERT; } - if (opnd != ®dummy) { + if (!SIZE_ONLY) { + regcode += ANY_SKIP; if (regflags & PMf_FOLD) *opnd |= ANYOF_FOLD; if (regflags & PMf_LOCALE) *opnd |= ANYOF_LOCALE; + } else { + regsize += ANY_SKIP; } if (*regparse == ']' || *regparse == '-') goto skipcond; /* allow 1st char to be ] or - */ @@ -1131,7 +1865,7 @@ regclass(void) switch (Class) { case 'w': if (regflags & PMf_LOCALE) { - if (opnd != ®dummy) + if (!SIZE_ONLY) *opnd |= ANYOF_ALNUML; } else { @@ -1143,7 +1877,7 @@ regclass(void) continue; case 'W': if (regflags & PMf_LOCALE) { - if (opnd != ®dummy) + if (!SIZE_ONLY) *opnd |= ANYOF_NALNUML; } else { @@ -1155,7 +1889,7 @@ regclass(void) continue; case 's': if (regflags & PMf_LOCALE) { - if (opnd != ®dummy) + if (!SIZE_ONLY) *opnd |= ANYOF_SPACEL; } else { @@ -1167,7 +1901,7 @@ regclass(void) continue; case 'S': if (regflags & PMf_LOCALE) { - if (opnd != ®dummy) + if (!SIZE_ONLY) *opnd |= ANYOF_NSPACEL; } else { @@ -1279,40 +2013,33 @@ nextchar(void) } /* -- regnode - emit a node +- reg_node - emit a node */ +static regnode * /* Location. */ #ifdef CAN_PROTOTYPE -static char * /* Location. */ -regnode(char op) +reg_node(U8 op) #else -static char * /* Location. */ -regnode(op) -char op; +reg_node(op) +U8 op; #endif { - register char *ret; - register char *ptr; + register regnode *ret; + register regnode *ptr; ret = regcode; - if (ret == ®dummy) { -#ifdef REGALIGN - if (!(regsize & 1)) - regsize++; -#endif + if (SIZE_ONLY) { + SIZE_ALIGN(regsize); +#ifdef REGALIGN_STRUCT + regsize += 1; +#else regsize += 3; +#endif return(ret); } -#ifdef REGALIGN -#ifndef lint - if (!((long)ret & 1)) - *ret++ = 127; -#endif -#endif + NODE_ALIGN_FILL(ret); ptr = ret; - *ptr++ = op; - *ptr++ = '\0'; /* Null "next" pointer. */ - *ptr++ = '\0'; + FILL_ADVANCE_NODE(ptr, op); regcode = ptr; return(ret); @@ -1321,45 +2048,32 @@ char op; /* - reganode - emit a node with an argument */ +static regnode * /* Location. */ #ifdef CAN_PROTOTYPE -static char * /* Location. */ -reganode(char op, unsigned short arg) +reganode(U8 op, U32 arg) #else -static char * /* Location. */ reganode(op, arg) -char op; -unsigned short arg; +U8 op; +U32 arg; #endif { - register char *ret; - register char *ptr; + register regnode *ret; + register regnode *ptr; ret = regcode; - if (ret == ®dummy) { + if (SIZE_ONLY) { + SIZE_ALIGN(regsize); #ifdef REGALIGN - if (!(regsize & 1)) - regsize++; -#endif + regsize += 2; +#else regsize += 5; +#endif return(ret); } -#ifdef REGALIGN -#ifndef lint - if (!((long)ret & 1)) - *ret++ = 127; -#endif -#endif + NODE_ALIGN_FILL(ret); ptr = ret; - *ptr++ = op; - *ptr++ = '\0'; /* Null "next" pointer. */ - *ptr++ = '\0'; -#ifdef REGALIGN - *(unsigned short *)(ret+3) = arg; -#else - ret[3] = arg >> 8; ret[4] = arg & 0377; -#endif - ptr += 2; + FILL_ADVANCE_NODE_ARG(ptr, op, arg); regcode = ptr; return(ret); @@ -1370,17 +2084,16 @@ unsigned short arg; */ #ifdef CAN_PROTOTYPE static void -regc(char b) +regc(U8 b, char* s) #else static void -regc(b) -char b; +regc(b, s) +U8 b; +char *s; #endif { - if (regcode != ®dummy) - *regcode++ = b; - else - regsize++; + if (!SIZE_ONLY) + *s = b; } /* @@ -1390,60 +2103,52 @@ char b; */ #ifdef CAN_PROTOTYPE static void -reginsert(char op, char *opnd) +reginsert(U8 op, regnode *opnd) #else static void reginsert(op, opnd) -char op; -char *opnd; +U8 op; +regnode *opnd; #endif { - register char *src; - register char *dst; - register char *place; - register int offset = (regkind[(U8)op] == CURLY ? 4 : 0); - - if (regcode == ®dummy) { -#ifdef REGALIGN - regsize += 4 + offset; -#else - regsize += 3 + offset; -#endif + register regnode *src; + register regnode *dst; + register regnode *place; + register int offset = regarglen[(U8)op]; + +/* (regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ + + if (SIZE_ONLY) { + regsize += NODE_STEP_REGNODE + offset; return; } src = regcode; -#ifdef REGALIGN - regcode += 4 + offset; -#else - regcode += 3 + offset; -#endif + regcode += NODE_STEP_REGNODE + offset; dst = regcode; while (src > opnd) - *--dst = *--src; + StructCopy(--src, --dst, regnode); place = opnd; /* Op node, where operand used to be. */ - *place++ = op; - *place++ = '\0'; - *place++ = '\0'; - while (offset-- > 0) - *place++ = '\0'; -#ifdef REGALIGN - *place++ = '\177'; + src = NEXTOPER(place); + FILL_ADVANCE_NODE(place, op); + Zero(src, offset, regnode); +#if defined(REGALIGN) && !defined(REGALIGN_STRUCT) + src[offset + 1] = '\177'; #endif } /* -- regtail - set the next-pointer at the end of a node chain +- regtail - set the next-pointer at the end of a node chain of p to val. */ static void -regtail(char *p, char *val) +regtail(regnode *p, regnode *val) { - register char *scan; - register char *temp; + register regnode *scan; + register regnode *temp; register I32 offset; - if (p == ®dummy) + if (SIZE_ONLY) return; /* Find last node. */ @@ -1456,12 +2161,18 @@ regtail(char *p, char *val) } #ifdef REGALIGN +# ifdef REGALIGN_STRUCT + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } else { + NEXT_OFF(scan) = val - scan; + } +# else offset = val - scan; -#ifndef lint +# ifndef lint *(short*)(scan+1) = offset; -#else - offset = offset; -#endif +# endif +#endif #else if (OP(scan) == BACK) offset = scan - val; @@ -1476,12 +2187,17 @@ regtail(char *p, char *val) - regoptail - regtail on operand of first argument; nop if operandless */ static void -regoptail(char *p, char *val) +regoptail(regnode *p, regnode *val) { /* "Operandless" and "op != BRANCH" are synonymous in practice. */ - if (p == NULL || p == ®dummy || regkind[(U8)OP(p)] != BRANCH) + if (p == NULL || SIZE_ONLY) + return; + if (regkind[(U8)OP(p)] == BRANCH) { + regtail(NEXTOPER(p), val); + } else if ( regkind[(U8)OP(p)] == BRANCHJ) { + regtail(NEXTOPER(NEXTOPER(p)), val); + } else return; - regtail(NEXTOPER(p), val); } /* @@ -1507,55 +2223,104 @@ regcurly(register char *s) #ifdef DEBUGGING +static regnode * +dumpuntil(regnode *start, regnode *node, regnode *last, SV* sv, I32 l) +{ + register char op = EXACT; /* Arbitrary non-END op. */ + register regnode *next, *onode; + + while (op != END && (!last || node < last)) { + /* While that wasn't END last time... */ + + NODE_ALIGN(node); + op = OP(node); + if (op == CLOSE) + l--; + next = regnext(node); + /* Where, what. */ + if (OP(node) == OPTIMIZED) + goto after_print; + regprop(sv, node); + PerlIO_printf(Perl_debug_log, "%4d%*s%s", node - start, + 2*l + 1, "", SvPVX(sv)); + if (next == NULL) /* Next ptr. */ + PerlIO_printf(Perl_debug_log, "(0)"); + else + PerlIO_printf(Perl_debug_log, "(%d)", next - start); + (void)PerlIO_putc(Perl_debug_log, '\n'); + after_print: + if (regkind[(U8)op] == BRANCHJ) { + register regnode *nnode = (OP(next) == LONGJMP + ? regnext(next) + : next); + if (last && nnode > last) + nnode = last; + node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1); + } else if (regkind[(U8)op] == BRANCH) { + node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1); + } else if ( op == CURLY) { /* `next' might be very big: optimizer */ + node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS, + NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1); + } else if (regkind[(U8)op] == CURLY && op != CURLYX) { + node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS, + next, sv, l + 1); + } else if ( op == PLUS || op == STAR) { + node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1); + } else if (op == ANYOF) { + node = NEXTOPER(node); + node += ANY_SKIP; + } else if (regkind[(U8)op] == EXACT) { + /* Literal string, where present. */ + node += ((*OPERAND(node)) + 2 + sizeof(regnode) - 1) / sizeof(regnode); + node = NEXTOPER(node); + } else { + node = NEXTOPER(node); + node += regarglen[(U8)op]; + } + if (op == CURLYX || op == OPEN) + l++; + else if (op == WHILEM) + l--; + } + return node; +} + /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ void regdump(regexp *r) { - register char *s; - register char op = EXACT; /* Arbitrary non-END op. */ - register char *next; SV *sv = sv_newmortal(); - s = r->program + 1; - while (op != END) { /* While that wasn't END last time... */ -#ifdef REGALIGN - if (!((long)s & 1)) - s++; -#endif - op = OP(s); - /* where, what */ - regprop(sv, s); - PerlIO_printf(Perl_debug_log, "%2ld%s", (long)(s - r->program), SvPVX(sv)); - next = regnext(s); - s += regarglen[(U8)op]; - if (next == NULL) /* Next ptr. */ - PerlIO_printf(Perl_debug_log, "(0)"); - else - PerlIO_printf(Perl_debug_log, "(%ld)", (long)(s-r->program)+(next-s)); - s += 3; - if (op == ANYOF) { - s += 33; - } - if (regkind[(U8)op] == EXACT) { - /* Literal string, where present. */ - s++; - (void)PerlIO_putc(Perl_debug_log, ' '); - (void)PerlIO_putc(Perl_debug_log, '<'); - while (*s != '\0') { - (void)PerlIO_putc(Perl_debug_log,*s); - s++; - } - (void)PerlIO_putc(Perl_debug_log, '>'); - s++; - } - (void)PerlIO_putc(Perl_debug_log, '\n'); - } + (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0); /* Header fields of interest. */ - if (r->regstart) - PerlIO_printf(Perl_debug_log, "start `%s' ", SvPVX(r->regstart)); + if (r->anchored_substr) + PerlIO_printf(Perl_debug_log, "anchored `%s%s%s'%s at %d ", + colors[0], + SvPVX(r->anchored_substr), + colors[1], + SvTAIL(r->anchored_substr) ? "$" : "", + r->anchored_offset); + if (r->float_substr) + PerlIO_printf(Perl_debug_log, "floating `%s%s%s'%s at %d..%u ", + colors[0], + SvPVX(r->float_substr), + colors[1], + SvTAIL(r->float_substr) ? "$" : "", + r->float_min_offset, r->float_max_offset); + if (r->check_substr) + PerlIO_printf(Perl_debug_log, + r->check_substr == r->float_substr + ? "(checking floating" : "(checking anchored"); + if (r->reganch & ROPT_NOSCAN) + PerlIO_printf(Perl_debug_log, " noscan"); + if (r->reganch & ROPT_CHECK_ALL) + PerlIO_printf(Perl_debug_log, " isall"); + if (r->check_substr) + PerlIO_printf(Perl_debug_log, ") "); + if (r->regstclass) { regprop(sv, r->regstclass); PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv)); @@ -1564,17 +2329,18 @@ regdump(regexp *r) PerlIO_printf(Perl_debug_log, "anchored"); if (r->reganch & ROPT_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); + if (r->reganch & ROPT_ANCH_MBOL) + PerlIO_printf(Perl_debug_log, "(MBOL)"); if (r->reganch & ROPT_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } + if (r->reganch & ROPT_GPOS_SEEN) + PerlIO_printf(Perl_debug_log, "GPOS "); if (r->reganch & ROPT_SKIP) PerlIO_printf(Perl_debug_log, "plus "); if (r->reganch & ROPT_IMPLICIT) PerlIO_printf(Perl_debug_log, "implicit "); - if (r->regmust != NULL) - PerlIO_printf(Perl_debug_log, "must have \"%s\" back %ld ", SvPVX(r->regmust), - (long) r->regback); PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen); PerlIO_printf(Perl_debug_log, "\n"); } @@ -1583,7 +2349,7 @@ regdump(regexp *r) - regprop - printable representation of opcode */ void -regprop(SV *sv, char *o) +regprop(SV *sv, regnode *o) { register char *p = 0; @@ -1620,17 +2386,20 @@ regprop(SV *sv, char *o) p = "BRANCH"; break; case EXACT: - p = "EXACT"; + sv_catpvf(sv, "EXACT <%s%s%s>", colors[0], OPERAND(o) + 1, colors[1]); break; case EXACTF: - p = "EXACTF"; + sv_catpvf(sv, "EXACTF <%s%s%s>", colors[0], OPERAND(o) + 1, colors[1]); break; case EXACTFL: - p = "EXACTFL"; + sv_catpvf(sv, "EXACTFL <%s%s%s>", colors[0], OPERAND(o) + 1, colors[1]); break; case NOTHING: p = "NOTHING"; break; + case TAIL: + p = "TAIL"; + break; case BACK: p = "BACK"; break; @@ -1652,23 +2421,37 @@ regprop(SV *sv, char *o) case CURLY: sv_catpvf(sv, "CURLY {%d,%d}", ARG1(o), ARG2(o)); break; + case CURLYM: +#ifdef REGALIGN_STRUCT + sv_catpvf(sv, "CURLYM[%d] {%d,%d}", o->flags, ARG1(o), ARG2(o)); +#else + sv_catpvf(sv, "CURLYM {%d,%d}", ARG1(o), ARG2(o)); +#endif + break; + case CURLYN: +#ifdef REGALIGN_STRUCT + sv_catpvf(sv, "CURLYN[%d] {%d,%d}", o->flags, ARG1(o), ARG2(o)); +#else + sv_catpvf(sv, "CURLYN {%d,%d}", ARG1(o), ARG2(o)); +#endif + break; case CURLYX: sv_catpvf(sv, "CURLYX {%d,%d}", ARG1(o), ARG2(o)); break; case REF: - sv_catpvf(sv, "REF%d", ARG1(o)); + sv_catpvf(sv, "REF%d", ARG(o)); break; case REFF: - sv_catpvf(sv, "REFF%d", ARG1(o)); + sv_catpvf(sv, "REFF%d", ARG(o)); break; case REFFL: - sv_catpvf(sv, "REFFL%d", ARG1(o)); + sv_catpvf(sv, "REFFL%d", ARG(o)); break; case OPEN: - sv_catpvf(sv, "OPEN%d", ARG1(o)); + sv_catpvf(sv, "OPEN%d", ARG(o)); break; case CLOSE: - sv_catpvf(sv, "CLOSE%d", ARG1(o)); + sv_catpvf(sv, "CLOSE%d", ARG(o)); p = NULL; break; case STAR: @@ -1684,10 +2467,18 @@ regprop(SV *sv, char *o) p = "GPOS"; break; case UNLESSM: +#ifdef REGALIGN_STRUCT + sv_catpvf(sv, "UNLESSM[-%d]", o->flags); +#else p = "UNLESSM"; +#endif break; case IFMATCH: +#ifdef REGALIGN_STRUCT + sv_catpvf(sv, "IFMATCH[-%d]", o->flags); +#else p = "IFMATCH"; +#endif break; case SUCCEED: p = "SUCCEED"; @@ -1725,6 +2516,33 @@ regprop(SV *sv, char *o) case NSPACEL: p = "NSPACEL"; break; + case EVAL: + p = "EVAL"; + break; + case LONGJMP: + p = "LONGJMP"; + break; + case BRANCHJ: + p = "BRANCHJ"; + break; + case IFTHEN: + p = "IFTHEN"; + break; + case GROUPP: + sv_catpvf(sv, "GROUPP%d", ARG(o)); + break; + case LOGICAL: + p = "LOGICAL"; + break; + case SUSPEND: + p = "SUSPEND"; + break; + case RENUM: + p = "RENUM"; + break; + case OPTIMIZED: + p = "OPTIMIZED"; + break; default: FAIL("corrupted regexp opcode"); } @@ -1736,25 +2554,106 @@ regprop(SV *sv, char *o) void pregfree(struct regexp *r) { - if (!r) + if (!r || (--r->refcnt > 0)) return; - if (r->precomp) { + if (r->precomp) Safefree(r->precomp); - r->precomp = Nullch; - } - if (r->subbase) { + if (r->subbase) Safefree(r->subbase); - r->subbase = Nullch; - } - if (r->regmust) { - SvREFCNT_dec(r->regmust); - r->regmust = Nullsv; - } - if (r->regstart) { - SvREFCNT_dec(r->regstart); - r->regstart = Nullsv; + if (r->anchored_substr) + SvREFCNT_dec(r->anchored_substr); + if (r->float_substr) + SvREFCNT_dec(r->float_substr); + if (r->data) { + int n = r->data->count; + while (--n >= 0) { + switch (r->data->what[n]) { + case 's': + SvREFCNT_dec((SV*)r->data->data[n]); + break; + case 'o': + op_free((OP_4tree*)r->data->data[n]); + break; + case 'n': + break; + default: + FAIL2("panic: regfree data code '%c'", r->data->what[n]); + } + } + Safefree(r->data->what); + Safefree(r->data); } Safefree(r->startp); Safefree(r->endp); Safefree(r); } + +/* + - regnext - dig the "next" pointer out of a node + * + * [Note, when REGALIGN is defined there are two places in regmatch() + * that bypass this code for speed.] + */ +regnode * +regnext(register regnode *p) +{ + register I32 offset; + + if (p == ®dummy) + return(NULL); + + offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); + if (offset == 0) + return(NULL); + +#ifdef REGALIGN + return(p+offset); +#else + if (OP(p) == BACK) + return(p-offset); + else + return(p+offset); +#endif +} + +#ifdef I_STDARG +void +re_croak2(const char* pat1,const char* pat2,...) +#else +/*VARARGS0*/ +void +re_croak2(const char* pat1,const char* pat2, va_alist) + const char* pat1; + const char* pat2; + va_dcl +#endif +{ + va_list args; + STRLEN l1 = strlen(pat1); + STRLEN l2 = strlen(pat2); + char buf[512]; + char *message; + + if (l1 > 510) + l1 = 510; + if (l1 + l2 > 510) + l2 = 510 - l1; + Copy(pat1, buf, l1 , char); + Copy(pat2, buf + l1, l2 , char); + buf[l1 + l2 + 1] = '\n'; + buf[l1 + l2 + 2] = '\0'; +#ifdef I_STDARG + va_start(args, pat2); +#else + va_start(args); +#endif + message = mess(buf, &args); + va_end(args); + l1 = strlen(message); + if (l1 > 512) + l1 = 512; + Copy(message, buf, l1 , char); + buf[l1] = '\0'; /* Overwrite \n */ + croak("%s", buf); +} + diff --git a/regcomp.h b/regcomp.h index 5915086..dec5ac3 100644 --- a/regcomp.h +++ b/regcomp.h @@ -1,6 +1,8 @@ /* regcomp.h */ +typedef OP OP_4tree; /* Will be redefined later. */ + /* * The "internal use only" fields in regexp.h are present to pass info from * compile to execute that permits the execute phase to run lots faster on @@ -31,6 +33,18 @@ * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.] */ +/* #ifndef gould */ +/* #ifndef cray */ +/* #ifndef eta10 */ +#define REGALIGN +/* #endif */ +/* #endif */ +/* #endif */ + +#ifdef REGALIGN +# define REGALIGN_STRUCT +#endif + /* * Structure for regexp "program". This is essentially a linear encoding * of a nondeterministic finite-state machine (aka syntax charts or @@ -72,27 +86,40 @@ #define BOUNDL 21 /* no Match "" at any word boundary */ #define NBOUND 22 /* no Match "" at any word non-boundary */ #define NBOUNDL 23 /* no Match "" at any word non-boundary */ -#define REF 24 /* num Match already matched string */ -#define REFF 25 /* num Match already matched string, folded */ -#define REFFL 26 /* num Match already matched string, folded in loc. */ -#define OPEN 27 /* num Mark this point in input as start of #n. */ -#define CLOSE 28 /* num Analogous to OPEN. */ -#define MINMOD 29 /* no Next operator is not greedy. */ -#define GPOS 30 /* no Matches where last m//g left off. */ -#define IFMATCH 31 /* no Succeeds if the following matches. */ -#define UNLESSM 32 /* no Fails if the following matches. */ -#define SUCCEED 33 /* no Return from a subroutine, basically. */ -#define WHILEM 34 /* no Do curly processing and see if rest matches. */ -#define ALNUM 35 /* no Match any alphanumeric character */ -#define ALNUML 36 /* no Match any alphanumeric char in locale */ -#define NALNUM 37 /* no Match any non-alphanumeric character */ -#define NALNUML 38 /* no Match any non-alphanumeric char in locale */ -#define SPACE 39 /* no Match any whitespace character */ -#define SPACEL 40 /* no Match any whitespace char in locale */ -#define NSPACE 41 /* no Match any non-whitespace character */ -#define NSPACEL 42 /* no Match any non-whitespace char in locale */ -#define DIGIT 43 /* no Match any numeric character */ -#define NDIGIT 44 /* no Match any non-numeric character */ +#define REF 24 /* num Match some already matched string */ +#define OPEN 25 /* num Mark this point in input as start of #n. */ +#define CLOSE 26 /* num Analogous to OPEN. */ +#define MINMOD 27 /* no Next operator is not greedy. */ +#define GPOS 28 /* no Matches where last m//g left off. */ +#define IFMATCH 29 /* off Succeeds if the following matches. */ +#define UNLESSM 30 /* off Fails if the following matches. */ +#define SUCCEED 31 /* no Return from a subroutine, basically. */ +#define WHILEM 32 /* no Do curly processing and see if rest matches. */ +#define ALNUM 33 /* no Match any alphanumeric character */ +#define ALNUML 34 /* no Match any alphanumeric char in locale */ +#define NALNUM 35 /* no Match any non-alphanumeric character */ +#define NALNUML 36 /* no Match any non-alphanumeric char in locale */ +#define SPACE 37 /* no Match any whitespace character */ +#define SPACEL 38 /* no Match any whitespace char in locale */ +#define NSPACE 39 /* no Match any non-whitespace character */ +#define NSPACEL 40 /* no Match any non-whitespace char in locale */ +#define DIGIT 41 /* no Match any numeric character */ +#define NDIGIT 42 /* no Match any non-numeric character */ +#define CURLYM 43 /* no Match this medium-complex thing {n,m} times. */ +#define CURLYN 44 /* no Match next-after-this simple thing + {n,m} times, set parenths. */ +#define TAIL 45 /* no Match empty string. Can jump here from outside. */ +#define REFF 46 /* num Match already matched string, folded */ +#define REFFL 47 /* num Match already matched string, folded in loc. */ +#define EVAL 48 /* evl Execute some Perl code. */ +#define LONGJMP 49 /* off Jump far away, requires REGALIGN_STRUCT. */ +#define BRANCHJ 50 /* off BRANCH with long offset, requires REGALIGN_STRUCT. */ +#define IFTHEN 51 /* off Switch, should be preceeded by switcher . */ +#define GROUPP 52 /* num Whether the group matched. */ +#define LOGICAL 53 /* no Next opcode should set the flag only. */ +#define SUSPEND 54 /* off "Independent" sub-RE. */ +#define RENUM 55 /* off Group with independently numbered parens. */ +#define OPTIMIZED 56 /* off Placeholder for dump. */ /* * Opcode notes: @@ -113,25 +140,13 @@ * per match) are implemented with STAR and PLUS for speed * and to minimize recursive plunges. * - * OPEN,CLOSE ...are numbered at compile time. + * OPEN,CLOSE,GROUPP ...are numbered at compile time. */ #ifndef DOINIT -EXT char regarglen[]; +EXT const U8 regkind[]; #else -EXT char regarglen[] = { - 0,0,0,0,0,0,0,0,0,0, - /*CURLY*/ 4, /*CURLYX*/ 4, - 0,0,0,0,0,0,0,0,0,0,0,0, - /*REF*/ 2, 2, 2, /*OPEN*/ 2, /*CLOSE*/ 2, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -}; -#endif - -#ifndef DOINIT -EXT char regkind[]; -#else -EXT char regkind[] = { +EXT const U8 regkind[] = { END, BOL, BOL, @@ -157,14 +172,12 @@ EXT char regkind[] = { NBOUND, NBOUND, REF, - REF, - REF, OPEN, CLOSE, MINMOD, GPOS, - BRANCH, - BRANCH, + BRANCHJ, + BRANCHJ, END, WHILEM, ALNUM, @@ -177,23 +190,38 @@ EXT char regkind[] = { NSPACE, DIGIT, NDIGIT, + CURLY, + CURLY, + NOTHING, + REF, + REF, + EVAL, + LONGJMP, + BRANCHJ, + BRANCHJ, + GROUPP, + LOGICAL, + BRANCHJ, + BRANCHJ, + NOTHING, }; #endif -/* The following have no fixed length. */ +/* The following have no fixed length. char* since we do strchr on it. */ #ifndef DOINIT -EXT char varies[]; +EXT const char varies[]; #else -EXT char varies[] = { - BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, WHILEM, 0 +EXT const char varies[] = { + BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, + WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, 0 }; #endif -/* The following always have a length of 1. */ +/* The following always have a length of 1. char* since we do strchr on it. */ #ifndef DOINIT -EXT char simple[]; +EXT const char simple[]; #else -EXT char simple[] = { +EXT const char simple[] = { ANY, SANY, ANYOF, ALNUM, ALNUML, NALNUM, NALNUML, SPACE, SPACEL, NSPACE, NSPACEL, @@ -201,8 +229,6 @@ EXT char simple[] = { }; #endif -EXT char regdummy; - /* * A node is one char of opcode followed by two chars of "next" pointer. * "Next" pointers are stored as two 8-bit pieces, high order first. The @@ -219,42 +245,118 @@ EXT char regdummy; * stored negative.] */ -#ifndef gould -#ifndef cray -#ifndef eta10 -#define REGALIGN -#endif -#endif -#endif +#ifdef REGALIGN_STRUCT + +struct regnode_string { + U8 flags; + U8 type; + U16 next_off; + U8 string[1]; +}; -#define OP(p) (*(p)) +struct regnode_1 { + U8 flags; + U8 type; + U16 next_off; + U32 arg1; +}; + +struct regnode_2 { + U8 flags; + U8 type; + U16 next_off; + U16 arg1; + U16 arg2; +}; + +#endif + +#define REG_INFTY I16_MAX -#ifndef lint #ifdef REGALIGN -#define NEXT(p) (*(short*)(p+1)) -#define ARG1(p) (*(unsigned short*)(p+3)) -#define ARG2(p) (*(unsigned short*)(p+5)) +# define ARG_VALUE(arg) (arg) +# define ARG__SET(arg,val) ((arg) = (val)) #else -#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) -#define ARG1(p) (((*((p)+3)&0377)<<8) + (*((p)+4)&0377)) -#define ARG2(p) (((*((p)+5)&0377)<<8) + (*((p)+6)&0377)) +# define ARG_VALUE(arg) (((*((char*)&arg)&0377)<<8) + (*(((char*)&arg)+1)&0377)) +# define ARG__SET(arg,val) (((char*)&arg)[0] = (val) >> 8; ((char*)&arg)[1] = (val) & 0377;) #endif + +#define ARG(p) ARG_VALUE(ARG_LOC(p)) +#define ARG1(p) ARG_VALUE(ARG1_LOC(p)) +#define ARG2(p) ARG_VALUE(ARG2_LOC(p)) +#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) +#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) +#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) + +#ifndef lint +# ifdef REGALIGN +# ifdef REGALIGN_STRUCT +# define NEXT_OFF(p) ((p)->next_off) +# define NODE_ALIGN(node) +# define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */ +# else +# define NEXT_OFF(p) (*(short*)(p+1)) +# define NODE_ALIGN(node) ((!((long)node & 1)) ? node++ : 0) +# define NODE_ALIGN_FILL(node) ((!((long)node & 1)) ? *node++ = 127 : 0) +# endif +# else +# define NEXT_OFF(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) +# define NODE_ALIGN(node) +# define NODE_ALIGN_FILL(node) +# endif #else /* lint */ -#define NEXT(p) 0 +# define NEXT_OFF(p) 0 +# define NODE_ALIGN(node) +# define NODE_ALIGN_FILL(node) #endif /* lint */ -#define OPERAND(p) ((p) + 3) +#define SIZE_ALIGN NODE_ALIGN + +#ifdef REGALIGN_STRUCT +# define OP(p) ((p)->type) +# define OPERAND(p) (((struct regnode_string *)p)->string) +# define NODE_ALIGN(node) +# define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) +# define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) +# define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) +# define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ +# define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) +#else +# define OP(p) (*(p)) +# define OPERAND(p) ((p) + 3) +# define ARG_LOC(p) (*(unsigned short*)(p+3)) +# define ARG1_LOC(p) (*(unsigned short*)(p+3)) +# define ARG2_LOC(p) (*(unsigned short*)(p+5)) +typedef char* regnode; +# define NODE_STEP_REGNODE NODE_STEP_B +# define EXTRA_STEP_2ARGS 4 +#endif #ifdef REGALIGN -#define NEXTOPER(p) ((p) + 4) -#define PREVOPER(p) ((p) - 4) +# define NODE_STEP_B 4 #else -#define NEXTOPER(p) ((p) + 3) -#define PREVOPER(p) ((p) - 3) +# define NODE_STEP_B 3 +#endif + +#define NEXTOPER(p) ((p) + NODE_STEP_REGNODE) +#define PREVOPER(p) ((p) - NODE_STEP_REGNODE) + +#ifdef REGALIGN_STRUCT +# define FILL_ADVANCE_NODE(ptr, op) STMT_START { \ + (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END +# define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \ + ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END +#else +# define FILL_ADVANCE_NODE(ptr, op) STMT_START { \ + *(ptr)++ = op; *(ptr)++ = '\0'; *(ptr)++ = '\0'; } STMT_END +# define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \ + ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 2; } STMT_END #endif #define MAGIC 0234 +#define SIZE_ONLY (regcode == ®dummy) + /* Flags for first parameter byte of ANYOF */ #define ANYOF_INVERT 0x40 #define ANYOF_FOLD 0x20 @@ -265,6 +367,13 @@ EXT char regdummy; #define ANYOF_SPACEL 0x02 #define ANYOF_NSPACEL 0x01 +#ifdef REGALIGN_STRUCT +#define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1) +#else +#define ANY_SKIP 32 /* overwrite the first byte of + * the next guy. */ +#endif + /* * Utility definitions. */ @@ -278,4 +387,71 @@ EXT char regdummy; #define UCHARAT(p) regdummy #endif /* lint */ -#define FAIL(m) croak("/%.127s/: %s",regprecomp,m) +#define FAIL(m) croak ("/%.127s/: %s", regprecomp,m) +#define FAIL2(pat,m) re_croak2("/%.127s/: ",pat,regprecomp,m) + +#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode)) + +#ifdef REG_COMP_C +const static U8 regarglen[] = { +# ifdef REGALIGN_STRUCT + 0,0,0,0,0,0,0,0,0,0, + /*CURLY*/ EXTRA_SIZE(struct regnode_2), + /*CURLYX*/ EXTRA_SIZE(struct regnode_2), + 0,0,0,0,0,0,0,0,0,0,0,0, + /*REF*/ EXTRA_SIZE(struct regnode_1), + /*OPEN*/ EXTRA_SIZE(struct regnode_1), + /*CLOSE*/ EXTRA_SIZE(struct regnode_1), + 0,0, + /*IFMATCH*/ EXTRA_SIZE(struct regnode_1), + /*UNLESSM*/ EXTRA_SIZE(struct regnode_1), + 0,0,0,0,0,0,0,0,0,0,0,0, + /*CURLYM*/ EXTRA_SIZE(struct regnode_2), + /*CURLYN*/ EXTRA_SIZE(struct regnode_2), + 0, + /*REFF*/ EXTRA_SIZE(struct regnode_1), + /*REFFL*/ EXTRA_SIZE(struct regnode_1), + /*EVAL*/ EXTRA_SIZE(struct regnode_1), + /*LONGJMP*/ EXTRA_SIZE(struct regnode_1), + /*BRANCHJ*/ EXTRA_SIZE(struct regnode_1), + /*IFTHEN*/ EXTRA_SIZE(struct regnode_1), + /*GROUPP*/ EXTRA_SIZE(struct regnode_1), + /*LOGICAL*/ 0, + /*SUSPEND*/ EXTRA_SIZE(struct regnode_1), + /*RENUM*/ EXTRA_SIZE(struct regnode_1), 0, +# else + 0,0,0,0,0,0,0,0,0,0, + /*CURLY*/ 4, /*CURLYX*/ 4, + 0,0,0,0,0,0,0,0,0,0,0,0, + /*REF*/ 2, /*OPEN*/ 2, /*CLOSE*/ 2, + 0,0, /*IFMATCH*/ 2, /*UNLESSM*/ 2, + 0,0,0,0,0,0,0,0,0,0,0,0,/*CURLYM*/ 4,/*CURLYN*/ 4, + 0, /*REFF*/ 2, /*REFFL*/ 2, /*EVAL*/ 2, /*LONGJMP*/ 2, /*BRANCHJ*/ 2, + /*IFTHEN*/ 2, /*GROUPP*/ 2, /*LOGICAL*/ 0, /*RENUM*/ 2, /*RENUM*/ 2, 0, +# endif +}; + +const static char reg_off_by_arg[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0 .. 15 */ + 0,0,0,0,0,0,0,0,0,0,0,0,0, /*IFMATCH*/ 2, /*UNLESSM*/ 2, 0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 32 .. 47 */ + 0, /*LONGJMP*/ 1, /*BRANCHJ*/ 1, /*IFTHEN*/ 1, 0, 0, + /*RENUM*/ 1, /*RENUM*/ 1,0, +}; +#endif + +struct reg_data { + U32 count; + U8 *what; + void* data[1]; +}; + +#define REG_SEEN_ZERO_LEN 1 +#define REG_SEEN_LOOKBEHIND 2 +#define REG_SEEN_GPOS 4 + +#ifdef DEBUGGING +extern char *colors[4]; +#endif + +void re_croak2 _((const char* pat1,const char* pat2,...)) __attribute__((noreturn)); diff --git a/regexec.c b/regexec.c index 9e8d451..d74c234 100644 --- a/regexec.c +++ b/regexec.c @@ -55,13 +55,37 @@ #include "perl.h" #include "regcomp.h" +static char * reginput; /* String-input pointer. */ +static char * regbol; /* Beginning of input, for ^ check. */ +static char * regeol; /* End of input, for $ check. */ +static char ** regstartp; /* Pointer to startp array. */ +static char ** regendp; /* Ditto for endp. */ +static U32 * reglastparen; /* Similarly for lastparen. */ +static char * regtill; /* How far we are required to go. */ +static char regprev; /* char before regbol, \n if none */ + +static char * regprecomp; /* uncompiled string. */ +static I32 regnpar; /* () count. */ +static I32 regsize; /* Largest OPEN seens. */ +static char ** reg_start_tmp; +static U32 reg_start_tmpl; +static struct reg_data *data; +static char *bostr; + +static U32 reg_flags; /* tainted/warned */ +static I32 reg_eval_set; + +#define RF_tainted 1 /* tainted information used? */ +#define RF_warned 2 /* warned about big count? */ +#define RF_evaled 4 /* Did an EVAL? */ + #ifndef STATIC #define STATIC static #endif #ifdef DEBUGGING -static I32 regnarrate = 0; -static char* regprogram = 0; +static I32 regnarrate = 0; +static regnode* regprogram = 0; #endif /* Current curly descriptor */ @@ -72,8 +96,8 @@ struct curcur { int min; /* the minimal number of scans to match */ int max; /* the maximal number of scans to match */ int minmod; /* whether to work our way up or down */ - char * scan; /* the thing to match */ - char * next; /* what has to match after it */ + regnode * scan; /* the thing to match */ + regnode * next; /* what has to match after it */ char * lastloc; /* where we started matching this scan */ CURCUR * oldcc; /* current curly before we started this one */ }; @@ -82,6 +106,15 @@ static CURCUR* regcc; typedef I32 CHECKPOINT; +/* + * Forwards. + */ + +static I32 regmatch _((regnode *prog)); +static I32 regrepeat _((regnode *p, I32 max)); +static I32 regrepeat_hard _((regnode *p, I32 max, I32 *lp)); +static I32 regtry _((regexp *prog, char *startpos)); +static bool reginclass _((char *p, I32 c)); static CHECKPOINT regcppush _((I32 parenfloor)); static char * regcppop _((void)); @@ -90,13 +123,14 @@ regcppush(I32 parenfloor) { dTHR; int retval = savestack_ix; - int i = (regsize - parenfloor) * 3; + int i = (regsize - parenfloor) * 4; int p; SSCHECK(i + 5); for (p = regsize; p > parenfloor; p--) { SSPUSHPTR(regendp[p]); SSPUSHPTR(regstartp[p]); + SSPUSHINT(reg_start_tmp[p]); SSPUSHINT(p); } SSPUSHINT(regsize); @@ -107,6 +141,10 @@ regcppush(I32 parenfloor) return retval; } +/* These are needed since we do not localize EVAL nodes: */ +# define REGCP_SET DEBUG_r(PerlIO_printf(Perl_debug_log, " Setting an EVAL scope, savestack=%i\n", savestack_ix)); lastcp = savestack_ix +# define REGCP_UNWIND DEBUG_r(lastcp != savestack_ix ? PerlIO_printf(Perl_debug_log," Clearing an EVAL scope, savestack=%i..%i\n", lastcp, savestack_ix) : 0); regcpblow(lastcp) + static char * regcppop(void) { @@ -120,13 +158,27 @@ regcppop(void) input = (char *) SSPOPPTR; *reglastparen = SSPOPINT; regsize = SSPOPINT; - for (i -= 3; i > 0; i -= 3) { + for (i -= 3; i > 0; i -= 4) { paren = (U32)SSPOPINT; + reg_start_tmp[paren] = (char *) SSPOPPTR; regstartp[paren] = (char *) SSPOPPTR; tmps = (char*)SSPOPPTR; if (paren <= *reglastparen) regendp[paren] = tmps; + DEBUG_r( + PerlIO_printf(Perl_debug_log, " restoring \\%d to %d(%d)..%d%s\n", + paren, regstartp[paren] - regbol, + reg_start_tmp[paren] - regbol, + regendp[paren] - regbol, + (paren > *reglastparen ? "(no)" : "")); + ); } + DEBUG_r( + if (*reglastparen + 1 <= regnpar) { + PerlIO_printf(Perl_debug_log, " restoring \\%d..\\%d to undef\n", + *reglastparen + 1, regnpar); + } + ); for (paren = *reglastparen + 1; paren <= regnpar; paren++) { if (paren > regsize) regstartp[paren] = Nullch; @@ -135,78 +187,56 @@ regcppop(void) return input; } -/* After a successful match in WHILEM, we want to restore paren matches - * that have been overwritten by a failed match attempt in the process - * of reaching this success. We do this by restoring regstartp[i] - * wherever regendp[i] has not changed; if OPEN is changed to modify - * regendp[], the '== endp' test below should be changed to match. - * This corrects the error of: - * 0 > length [ "foobar" =~ / ( (foo) | (bar) )* /x ]->[1] - */ -static void -regcppartblow(I32 base) -{ - dTHR; - I32 i = SSPOPINT; - U32 paren; - char *startp; - char *endp; - assert(i == SAVEt_REGCONTEXT); - i = SSPOPINT; - /* input, lastparen, size */ - SSPOPPTR; SSPOPINT; SSPOPINT; - for (i -= 3; i > 0; i -= 3) { - paren = (U32)SSPOPINT; - startp = (char *) SSPOPPTR; - endp = (char *) SSPOPPTR; - if (paren <= *reglastparen && regendp[paren] == endp) - regstartp[paren] = startp; - } - assert(savestack_ix == base); -} - -#define regcpblow(cp) leave_scope(cp) +#define regcpblow(cp) LEAVE_SCOPE(cp) /* * pregexec and friends */ /* - * Forwards. + - pregexec - match a regexp against a string */ - -static I32 regmatch _((char *prog)); -static I32 regrepeat _((char *p, I32 max)); -static I32 regtry _((regexp *prog, char *startpos)); -static bool reginclass _((char *p, I32 c)); - -static bool regtainted; /* tainted information used? */ - +I32 +pregexec(register regexp *prog, char *stringarg, register char *strend, char *strbeg, I32 minend, SV *screamer, U32 nosave) +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >=minend after stringarg. */ +/* nosave: For optimizations. */ +{ + return + regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, + nosave ? 0 : REXEC_COPY_STR); +} + /* - - pregexec - match a regexp against a string + - regexec_flags - match a regexp against a string */ I32 -pregexec(register regexp *prog, char *stringarg, register char *strend, char *strbeg, I32 minend, SV *screamer, I32 safebase) - - - /* pointer to null at end of string */ - /* real beginning of string */ - /* end of match must be at least minend after stringarg */ - - /* no need to remember string in subbase */ +regexec_flags(register regexp *prog, char *stringarg, register char *strend, char *strbeg, I32 minend, SV *screamer, void *data, U32 flags) +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >=minend after stringarg. */ +/* data: May be used for some additional optimizations. */ +/* nosave: For optimizations. */ { register char *s; - register char *c; + register regnode *c; register char *startpos = stringarg; register I32 tmp; - I32 minlen = 0; /* must match at least this many chars */ + I32 minlen; /* must match at least this many chars */ I32 dontbother = 0; /* how many characters not to try at end */ CURCUR cc; + I32 start_shift = 0; /* Offset of the start to find + constant substr. */ + I32 end_shift = 0; /* Same for the end. */ + I32 scream_pos = -1; /* Internal iterator of scream. */ + char *scream_olds; cc.cur = 0; cc.oldcc = 0; regcc = &cc; + regprecomp = prog->precomp; /* Needed for error messages. */ #ifdef DEBUGGING regnarrate = debug & 512; regprogram = prog->program; @@ -218,6 +248,9 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st return 0; } + minlen = prog->minlen; + if (strend - startpos < minlen) goto phooey; + if (startpos == strbeg) /* is ^ valid at stringarg? */ regprev = '\n'; else { @@ -226,54 +259,58 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st regprev = '\0'; /* force ^ to NOT match */ } - regprecomp = prog->precomp; /* Check validity of program. */ if (UCHARAT(prog->program) != MAGIC) { FAIL("corrupted regexp program"); } regnpar = prog->nparens; - regtainted = FALSE; + reg_flags = 0; + reg_eval_set = 0; /* If there is a "must appear" string, look for it. */ s = startpos; - if (prog->regmust != Nullsv && + if (!(flags & REXEC_CHECKED) + && prog->check_substr != Nullsv && !(prog->reganch & ROPT_ANCH_GPOS) && - (!(prog->reganch & ROPT_ANCH_BOL) - || (multiline && prog->regback >= 0)) ) + (!(prog->reganch & (ROPT_ANCH_BOL | ROPT_ANCH_MBOL)) + || (multiline && prog->check_substr == prog->anchored_substr)) ) { - if (stringarg == strbeg && screamer) { - if (screamfirst[BmRARE(prog->regmust)] >= 0) - s = screaminstr(screamer,prog->regmust); + start_shift = prog->check_offset_min; + /* Should be nonnegative! */ + end_shift = minlen - start_shift - SvCUR(prog->check_substr); + if (screamer) { + if (screamfirst[BmRARE(prog->check_substr)] >= 0) + s = screaminstr(screamer, prog->check_substr, + start_shift + (stringarg - strbeg), + end_shift, &scream_pos, 0); else s = Nullch; + scream_olds = s; } else - s = fbm_instr((unsigned char*)s, (unsigned char*)strend, - prog->regmust); + s = fbm_instr((unsigned char*)s + start_shift, + (unsigned char*)strend - end_shift, + prog->check_substr); if (!s) { - ++BmUSEFUL(prog->regmust); /* hooray */ + ++BmUSEFUL(prog->check_substr); /* hooray */ goto phooey; /* not present */ - } - else if (prog->regback >= 0) { - s -= prog->regback; - if (s < startpos) - s = startpos; - minlen = prog->regback + SvCUR(prog->regmust); - } - else if (!prog->naughty && --BmUSEFUL(prog->regmust) < 0) { /* boo */ - SvREFCNT_dec(prog->regmust); - prog->regmust = Nullsv; /* disable regmust */ - s = startpos; - } - else { + } else if ((s - stringarg) > prog->check_offset_max) { + ++BmUSEFUL(prog->check_substr); /* hooray/2 */ + s -= prog->check_offset_max; + } else if (!prog->naughty + && --BmUSEFUL(prog->check_substr) < 0 + && prog->check_substr == prog->float_substr) { /* boo */ + SvREFCNT_dec(prog->check_substr); + prog->check_substr = Nullsv; /* disable */ + prog->float_substr = Nullsv; /* clear */ s = startpos; - minlen = SvCUR(prog->regmust); - } + } else s = startpos; } - /* Mark beginning of line for ^ . */ + /* Mark beginning of line for ^ and lookbehind. */ regbol = startpos; + bostr = strbeg; /* Mark end of line for $ (and such) */ regeol = strend; @@ -281,13 +318,24 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st /* see how far we have to get to not match where we matched before */ regtill = startpos+minend; + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "Matching `%.60s%s' against `%.*s%s'\n", + prog->precomp, + (strlen(prog->precomp) > 60 ? "..." : ""), + (strend - startpos > 60 ? 60 : strend - startpos), + startpos, + (strend - startpos > 60 ? "..." : "")) + ); + /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ if (prog->reganch & ROPT_ANCH) { if (regtry(prog, startpos)) goto got_it; else if (!(prog->reganch & ROPT_ANCH_GPOS) && - (multiline || (prog->reganch & ROPT_IMPLICIT))) + (multiline || (prog->reganch & ROPT_IMPLICIT) + || (prog->reganch & ROPT_ANCH_MBOL))) { if (minlen) dontbother = minlen - 1; @@ -306,45 +354,64 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st } /* Messy cases: unanchored match. */ - if (prog->regstart) { - if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */ - /* it must be a one character string */ - char ch = SvPVX(prog->regstart)[0]; - while (s < strend) { - if (*s == ch) { - if (regtry(prog, s)) - goto got_it; - s++; - while (s < strend && *s == ch) - s++; - } + if (prog->anchored_substr && prog->reganch & ROPT_SKIP) { + /* we have /x+whatever/ */ + /* it must be a one character string */ + char ch = SvPVX(prog->anchored_substr)[0]; + while (s < strend) { + if (*s == ch) { + if (regtry(prog, s)) goto got_it; s++; + while (s < strend && *s == ch) + s++; } + s++; } - else if (SvTYPE(prog->regstart) == SVt_PVBM) { - /* We know what string it must start with. */ - while ((s = fbm_instr((unsigned char*)s, - (unsigned char*)strend, prog->regstart)) != NULL) - { - if (regtry(prog, s)) - goto got_it; - s++; + } + /*SUPPRESS 560*/ + else if (prog->anchored_substr != Nullsv + || (prog->float_substr != Nullsv + && prog->float_max_offset < strend - s)) { + SV *must = prog->anchored_substr + ? prog->anchored_substr : prog->float_substr; + I32 back_max = + prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset; + I32 back_min = + prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset; + I32 delta = back_max - back_min; + char *last = strend - SvCUR(must) - back_min; /* Cannot start after this */ + char *last1 = s - 1; /* Last position checked before */ + + /* XXXX check_substr already used to find `s', can optimize if + check_substr==must. */ + scream_pos = -1; + dontbother = end_shift; + strend -= dontbother; + while ( (s <= last) && + (screamer + ? (s = screaminstr(screamer, must, s + back_min - strbeg, + end_shift, &scream_pos, 0)) + : (s = fbm_instr((unsigned char*)s + back_min, + (unsigned char*)strend, must))) ) { + if (s - back_max > last1) { + last1 = s - back_min; + s = s - back_max; + } else { + char *t = last1 + 1; + + last1 = s - back_min; + s = t; } - } - else { /* Optimized fbm_instr: */ - c = SvPVX(prog->regstart); - while ((s = ninstr(s, strend, c, c + SvCUR(prog->regstart))) != NULL) - { + while (s <= last1) { if (regtry(prog, s)) goto got_it; s++; } } goto phooey; - } - /*SUPPRESS 560*/ - if (c = prog->regstclass) { + } else if (c = prog->regstclass) { I32 doevery = (prog->reganch & ROPT_SKIP) == 0; + char *class; if (minlen) dontbother = minlen - 1; @@ -353,9 +420,9 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st /* We know what class it must start with. */ switch (OP(c)) { case ANYOF: - c = OPERAND(c); + class = OPERAND(c); while (s < strend) { - if (reginclass(c, *s)) { + if (reginclass(class, *s)) { if (tmp && regtry(prog, s)) goto got_it; else @@ -367,7 +434,7 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st } break; case BOUNDL: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: if (minlen) @@ -386,7 +453,7 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st goto got_it; break; case NBOUNDL: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUND: if (minlen) @@ -417,7 +484,7 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st } break; case ALNUML: - regtainted = TRUE; + reg_flags |= RF_tainted; while (s < strend) { if (isALNUM_LC(*s)) { if (tmp && regtry(prog, s)) @@ -444,7 +511,7 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st } break; case NALNUML: - regtainted = TRUE; + reg_flags |= RF_tainted; while (s < strend) { if (!isALNUM_LC(*s)) { if (tmp && regtry(prog, s)) @@ -471,7 +538,7 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st } break; case SPACEL: - regtainted = TRUE; + reg_flags |= RF_tainted; while (s < strend) { if (isSPACE_LC(*s)) { if (tmp && regtry(prog, s)) @@ -498,7 +565,7 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st } break; case NSPACEL: - regtainted = TRUE; + reg_flags |= RF_tainted; while (s < strend) { if (!isSPACE_LC(*s)) { if (tmp && regtry(prog, s)) @@ -540,7 +607,26 @@ pregexec(register regexp *prog, char *stringarg, register char *strend, char *st } } else { - if (minlen) + dontbother = 0; + if (prog->float_substr != Nullsv) { /* Trim the end. */ + char *last; + I32 oldpos = scream_pos; + + if (screamer) { + last = screaminstr(screamer, prog->float_substr, s - strbeg, + end_shift, &scream_pos, 1); /* last one */ + if (!last) { + last = scream_olds; /* Only one occurence. */ + } + } else { + STRLEN len; + char *little = SvPV(prog->float_substr, len); + last = rninstr(s, strend, little, little + len); + } + if (last == NULL) goto phooey; /* Should not happen! */ + dontbother = strend - last - 1; + } + if (minlen && (dontbother < minlen)) dontbother = minlen - 1; strend -= dontbother; /* We don't know much -- general case. */ @@ -557,11 +643,11 @@ got_it: strend += dontbother; /* uncheat */ prog->subbeg = strbeg; prog->subend = strend; - prog->exec_tainted = regtainted; + RX_MATCH_TAINTED_SET(prog, reg_flags & RF_tainted); /* make sure $`, $&, $', and $digit will work later */ - if (strbeg != prog->subbase) { - if (safebase) { + if (strbeg != prog->subbase) { /* second+ //g match. */ + if (!(flags & REXEC_COPY_STR)) { if (prog->subbase) { Safefree(prog->subbase); prog->subbase = Nullch; @@ -595,9 +681,11 @@ phooey: static I32 /* 0 failure, 1 success */ regtry(regexp *prog, char *startpos) { + dTHR; register I32 i; register char **sp; register char **ep; + CHECKPOINT lastcp; reginput = startpos; regstartp = prog->startp; @@ -605,22 +693,31 @@ regtry(regexp *prog, char *startpos) reglastparen = &prog->lastparen; prog->lastparen = 0; regsize = 0; + if (reg_start_tmpl <= prog->nparens) { + reg_start_tmpl = prog->nparens*3/2 + 3; + if(reg_start_tmp) + Renew(reg_start_tmp, reg_start_tmpl, char*); + else + New(22,reg_start_tmp, reg_start_tmpl, char*); + } sp = prog->startp; ep = prog->endp; + data = prog->data; if (prog->nparens) { for (i = prog->nparens; i >= 0; i--) { *sp++ = NULL; *ep++ = NULL; } } + REGCP_SET; if (regmatch(prog->program + 1) && reginput >= regtill) { prog->startp[0] = startpos; prog->endp[0] = reginput; return 1; } - else - return 0; + REGCP_UNWIND; + return 0; } /* @@ -638,17 +735,19 @@ regtry(regexp *prog, char *startpos) * advantage of machines that use a register save mask on subroutine entry. */ static I32 /* 0 failure, 1 success */ -regmatch(char *prog) +regmatch(regnode *prog) { - register char *scan; /* Current node. */ - char *next; /* Next node. */ + dTHR; + register regnode *scan; /* Current node. */ + regnode *next; /* Next node. */ + regnode *inner; /* Next node in internal branch. */ register I32 nextchar; register I32 n; /* no or next */ register I32 ln; /* len or last */ register char *s; /* operand or save */ register char *locinput = reginput; - register I32 c1, c2; /* case fold search */ - int minmod = 0; + register I32 c1, c2, paren; /* case fold search, parenth */ + int minmod = 0, sw = 0, logical = 0; #ifdef DEBUGGING static int regindent = 0; regindent++; @@ -657,25 +756,43 @@ regmatch(char *prog) nextchar = UCHARAT(locinput); scan = prog; while (scan != NULL) { +#define sayNO_L (logical ? (logical = 0, sw = 0, goto cont) : sayNO) #ifdef DEBUGGING -#define sayYES goto yes -#define sayNO goto no -#define saySAME(x) if (x) goto yes; else goto no - if (regnarrate) { - SV *prop = sv_newmortal(); - regprop(prop, scan); - PerlIO_printf(Perl_debug_log, "%*s%2ld%-8.8s\t<%.10s>\n", - regindent*2, "", (long)(scan - regprogram), - SvPVX(prop), locinput); - } +# define sayYES goto yes +# define sayNO goto no +# define saySAME(x) if (x) goto yes; else goto no +# define REPORT_CODE_OFF 24 #else -#define sayYES return 1 -#define sayNO return 0 -#define saySAME(x) return x +# define sayYES return 1 +# define sayNO return 0 +# define saySAME(x) return x #endif + DEBUG_r( { + SV *prop = sv_newmortal(); + int docolor = *colors[0]; + int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ + int l = (regeol - locinput > taill ? taill : regeol - locinput); + int pref_len = (locinput - bostr > (5 + taill) - l + ? (5 + taill) - l : locinput - bostr); + + if (l + pref_len < (5 + taill) && l < regeol - locinput) + l = ( regeol - locinput > (5 + taill) - pref_len + ? (5 + taill) - pref_len : regeol - locinput); + regprop(prop, scan); + PerlIO_printf(Perl_debug_log, + "%4i <%s%.*s%s%s%s%.*s%s>%*s|%*s%2d%s\n", + locinput - bostr, + colors[2], pref_len, locinput - pref_len, colors[3], + (docolor ? "" : "> <"), + colors[0], l, locinput, colors[1], + 15 - l - pref_len + 1, + "", + regindent*2, "", scan - regprogram, + SvPVX(prop)); + } ); #ifdef REGALIGN - next = scan + NEXT(scan); + next = scan + NEXT_OFF(scan); if (next == scan) next = NULL; #else @@ -686,7 +803,8 @@ regmatch(char *prog) case BOL: if (locinput == regbol ? regprev == '\n' - : ((nextchar || locinput < regeol) && locinput[-1] == '\n') ) + : (multiline && + (nextchar || locinput < regeol) && locinput[-1] == '\n') ) { /* regtill = regbol; */ break; @@ -737,7 +855,7 @@ regmatch(char *prog) break; case EXACT: s = OPERAND(scan); - ln = *s++; + ln = UCHARAT(s++); /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchar) sayNO; @@ -749,11 +867,11 @@ regmatch(char *prog) nextchar = UCHARAT(locinput); break; case EXACTFL: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ case EXACTF: s = OPERAND(scan); - ln = *s++; + ln = UCHARAT(s++); /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchar && UCHARAT(s) != ((OP(scan) == EXACTF) @@ -779,7 +897,7 @@ regmatch(char *prog) nextchar = UCHARAT(++locinput); break; case ALNUML: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ case ALNUM: if (!nextchar) @@ -790,7 +908,7 @@ regmatch(char *prog) nextchar = UCHARAT(++locinput); break; case NALNUML: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ case NALNUM: if (!nextchar && locinput >= regeol) @@ -802,7 +920,7 @@ regmatch(char *prog) break; case BOUNDL: case NBOUNDL: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: case NBOUND: @@ -820,7 +938,7 @@ regmatch(char *prog) sayNO; break; case SPACEL: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ case SPACE: if (!nextchar && locinput >= regeol) @@ -831,7 +949,7 @@ regmatch(char *prog) nextchar = UCHARAT(++locinput); break; case NSPACEL: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ case NSPACE: if (!nextchar) @@ -854,23 +972,21 @@ regmatch(char *prog) nextchar = UCHARAT(++locinput); break; case REFFL: - regtainted = TRUE; + reg_flags |= RF_tainted; /* FALL THROUGH */ - case REF: + case REF: case REFF: - n = ARG1(scan); /* which paren pair */ + n = ARG(scan); /* which paren pair */ s = regstartp[n]; - if (!s) - sayNO; - if (!regendp[n]) - sayNO; + if (*reglastparen < n || !s) + break; /* Zero length always matches */ if (s == regendp[n]) break; /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchar && (OP(scan) == REF || (UCHARAT(s) != ((OP(scan) == REFF - ? fold : fold_locale)[nextchar])))) + ? fold : fold_locale)[nextchar])))) sayNO; ln = regendp[n] - s; if (locinput + ln > regeol) @@ -886,32 +1002,95 @@ regmatch(char *prog) break; case NOTHING: + case TAIL: break; case BACK: break; + case EVAL: + { + dSP; + OP_4tree *oop = op; + COP *ocurcop = curcop; + SV **ocurpad = curpad; + SV *ret; + + n = ARG(scan); + op = (OP_4tree*)data->data[n]; + DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%x\n", op) ); + curpad = AvARRAY((AV*)data->data[n + 1]); + if (!reg_eval_set) { + /* Preserve whatever is on stack now, otherwise + OP_NEXTSTATE will overwrite it. */ + SAVEINT(reg_eval_set); /* Protect against unwinding. */ + reg_eval_set = 1; + DEBUG_r(DEBUG_s( + PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %i\n", stack_sp - stack_base); + )); + SAVEINT(cxstack[cxstack_ix].blk_oldsp); + cxstack[cxstack_ix].blk_oldsp = stack_sp - stack_base; + /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ + SAVETMPS; + /* Apparently this is not needed, judging by wantarray. */ + /* SAVEINT(cxstack[cxstack_ix].blk_gimme); + cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ + } + + runops(); /* Scalar context. */ + SPAGAIN; + ret = POPs; + PUTBACK; + + if (logical) { + logical = 0; + sw = SvTRUE(ret); + } + op = oop; + curpad = ocurpad; + curcop = ocurcop; + break; + } case OPEN: - n = ARG1(scan); /* which paren pair */ - regstartp[n] = locinput; + n = ARG(scan); /* which paren pair */ + reg_start_tmp[n] = locinput; if (n > regsize) regsize = n; break; case CLOSE: - n = ARG1(scan); /* which paren pair */ + n = ARG(scan); /* which paren pair */ + regstartp[n] = reg_start_tmp[n]; regendp[n] = locinput; if (n > *reglastparen) *reglastparen = n; break; + case GROUPP: + n = ARG(scan); /* which paren pair */ + sw = (*reglastparen >= n && regendp[n] != NULL); + break; + case IFTHEN: + if (sw) + next = NEXTOPER(NEXTOPER(scan)); + else { + next = scan + ARG(scan); + if (OP(next) == IFTHEN) /* Fake one. */ + next = NEXTOPER(NEXTOPER(next)); + } + break; + case LOGICAL: + logical = 1; + break; case CURLYX: { - dTHR; CURCUR cc; CHECKPOINT cp = savestack_ix; + + if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ + next += ARG(next); cc.oldcc = regcc; regcc = &cc; cc.parenfloor = *reglastparen; cc.cur = -1; cc.min = ARG1(scan); cc.max = ARG2(scan); - cc.scan = NEXTOPER(scan) + 4; + cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; cc.next = next; cc.minmod = minmod; cc.lastloc = 0; @@ -932,24 +1111,34 @@ regmatch(char *prog) * that we can try again after backing off. */ - CHECKPOINT cp; + CHECKPOINT cp, lastcp; CURCUR* cc = regcc; + char *lastloc = cc->lastloc; /* Detection of 0-len. */ + n = cc->cur + 1; /* how many we know we matched */ reginput = locinput; -#ifdef DEBUGGING - if (regnarrate) - PerlIO_printf(Perl_debug_log, "%*s %ld %lx\n", regindent*2, "", - (long)n, (long)cc); -#endif + DEBUG_r( + PerlIO_printf(Perl_debug_log, + "%*s %ld out of %ld..%ld cc=%lx\n", + REPORT_CODE_OFF+regindent*2, "", + (long)n, (long)cc->min, + (long)cc->max, (long)cc) + ); /* If degenerate scan matches "", assume scan done. */ if (locinput == cc->lastloc && n >= cc->min) { regcc = cc->oldcc; ln = regcc->cur; + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s empty match detected, try continuation...\n", REPORT_CODE_OFF+regindent*2, "") + ); if (regmatch(cc->next)) sayYES; + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s failed...\n", REPORT_CODE_OFF+regindent*2, "") + ); regcc->cur = ln; regcc = cc; sayNO; @@ -963,6 +1152,10 @@ regmatch(char *prog) if (regmatch(cc->scan)) sayYES; cc->cur = n - 1; + cc->lastloc = lastloc; + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s failed...\n", REPORT_CODE_OFF+regindent*2, "") + ); sayNO; } @@ -972,28 +1165,45 @@ regmatch(char *prog) regcc = cc->oldcc; ln = regcc->cur; cp = regcppush(cc->parenfloor); + REGCP_SET; if (regmatch(cc->next)) { - regcppartblow(cp); + regcpblow(cp); sayYES; /* All done. */ } + REGCP_UNWIND; regcppop(); regcc->cur = ln; regcc = cc; - if (n >= cc->max) /* Maximum greed exceeded? */ + if (n >= cc->max) { /* Maximum greed exceeded? */ + if (dowarn && n >= REG_INFTY + && !(reg_flags & RF_warned)) { + reg_flags |= RF_warned; + warn("count exceeded %d", REG_INFTY - 1); + } sayNO; + } + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s trying longer...\n", REPORT_CODE_OFF+regindent*2, "") + ); /* Try scanning more and see if it helps. */ reginput = locinput; cc->cur = n; cc->lastloc = locinput; cp = regcppush(cc->parenfloor); + REGCP_SET; if (regmatch(cc->scan)) { - regcppartblow(cp); + regcpblow(cp); sayYES; } + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s failed...\n", REPORT_CODE_OFF+regindent*2, "") + ); + REGCP_UNWIND; regcppop(); cc->cur = n - 1; + cc->lastloc = lastloc; sayNO; } @@ -1003,12 +1213,21 @@ regmatch(char *prog) cp = regcppush(cc->parenfloor); cc->cur = n; cc->lastloc = locinput; + REGCP_SET; if (regmatch(cc->scan)) { - regcppartblow(cp); + regcpblow(cp); sayYES; } + REGCP_UNWIND; regcppop(); /* Restore some previous $s? */ reginput = locinput; + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s failed, try continuation...\n", REPORT_CODE_OFF+regindent*2, "") + ); + } + if (dowarn && n >= REG_INFTY && !(reg_flags & RF_warned)) { + reg_flags |= RF_warned; + warn("count exceeded %d", REG_INFTY - 1); } /* Failed deeper matches of scan, so see if this one works. */ @@ -1016,35 +1235,57 @@ regmatch(char *prog) ln = regcc->cur; if (regmatch(cc->next)) sayYES; + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s failed...\n", REPORT_CODE_OFF+regindent*2, "") + ); regcc->cur = ln; regcc = cc; cc->cur = n - 1; + cc->lastloc = lastloc; sayNO; } /* NOT REACHED */ - case BRANCH: { - if (OP(next) != BRANCH) /* No choice. */ - next = NEXTOPER(scan);/* Avoid recursion. */ + case BRANCHJ: + next = scan + ARG(scan); + if (next == scan) + next = NULL; + inner = NEXTOPER(NEXTOPER(scan)); + goto do_branch; + case BRANCH: + inner = NEXTOPER(scan); + do_branch: + { + CHECKPOINT lastcp; + c1 = OP(scan); + if (OP(next) != c1) /* No choice. */ + next = inner; /* Avoid recursion. */ else { int lastparen = *reglastparen; + + REGCP_SET; do { reginput = locinput; - if (regmatch(NEXTOPER(scan))) + if (regmatch(inner)) sayYES; + REGCP_UNWIND; for (n = *reglastparen; n > lastparen; n--) regendp[n] = 0; *reglastparen = n; - + scan = next; #ifdef REGALIGN /*SUPPRESS 560*/ - if (n = NEXT(scan)) - scan += n; + if (n = (c1 == BRANCH ? NEXT_OFF(next) : ARG(next))) + next += n; else - scan = NULL; + next = NULL; #else - scan = regnext(scan); + next = regnext(next); #endif - } while (scan != NULL && OP(scan) == BRANCH); + inner = NEXTOPER(scan); + if (c1 == BRANCHJ) { + inner = NEXTOPER(inner); + } + } while (scan != NULL && OP(scan) == c1); sayNO; /* NOTREACHED */ } @@ -1053,25 +1294,163 @@ regmatch(char *prog) case MINMOD: minmod = 1; break; + case CURLYM: + { + I32 l; + CHECKPOINT lastcp; + + /* We suppose that the next guy does not need + backtracking: in particular, it is of constant length, + and has no parenths to influence future backrefs. */ + ln = ARG1(scan); /* min to match */ + n = ARG2(scan); /* max to match */ +#ifdef REGALIGN_STRUCT + paren = scan->flags; + if (paren) { + if (paren > regsize) + regsize = paren; + if (paren > *reglastparen) + *reglastparen = paren; + } +#endif + scan = NEXTOPER(scan) + 4/sizeof(regnode); + if (paren) + scan += NEXT_OFF(scan); /* Skip former OPEN. */ + reginput = locinput; + if (minmod) { + minmod = 0; + if (ln && regrepeat_hard(scan, ln, &l) < ln) + sayNO; + if (l == 0 && n >= ln + /* In fact, this is tricky. If paren, then the + fact that we did/didnot match may influence + future execution. */ + && !(paren && ln == 0)) + ln = n; + locinput = reginput; + if (regkind[(U8)OP(next)] == EXACT) { + c1 = UCHARAT(OPERAND(next) + 1); + if (OP(next) == EXACTF) + c2 = fold[c1]; + else if (OP(next) == EXACTFL) + c2 = fold_locale[c1]; + else + c2 = c1; + } else + c1 = c2 = -1000; + REGCP_SET; + while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */ + /* If it could work, try it. */ + if (c1 == -1000 || + UCHARAT(reginput) == c1 || + UCHARAT(reginput) == c2) + { + if (paren) { + if (n) { + regstartp[paren] = reginput - l; + regendp[paren] = reginput; + } else + regendp[paren] = NULL; + } + if (regmatch(next)) + sayYES; + REGCP_UNWIND; + } + /* Couldn't or didn't -- move forward. */ + reginput = locinput; + if (regrepeat_hard(scan, 1, &l)) { + ln++; + locinput = reginput; + } + else + sayNO; + } + } else { + n = regrepeat_hard(scan, n, &l); + if (n != 0 && l == 0 + /* In fact, this is tricky. If paren, then the + fact that we did/didnot match may influence + future execution. */ + && !(paren && ln == 0)) + ln = n; + locinput = reginput; + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s matched %ld times, len=%ld...\n", REPORT_CODE_OFF+regindent*2, "", n, l) + ); + if (n >= ln) { + if (regkind[(U8)OP(next)] == EXACT) { + c1 = UCHARAT(OPERAND(next) + 1); + if (OP(next) == EXACTF) + c2 = fold[c1]; + else if (OP(next) == EXACTFL) + c2 = fold_locale[c1]; + else + c2 = c1; + } else + c1 = c2 = -1000; + } + REGCP_SET; + while (n >= ln) { + /* If it could work, try it. */ + if (c1 == -1000 || + UCHARAT(reginput) == c1 || + UCHARAT(reginput) == c2) + { + DEBUG_r( + PerlIO_printf(Perl_debug_log, "%*s trying tail with n=%ld...\n", REPORT_CODE_OFF+regindent*2, "", n) + ); + if (paren) { + if (n) { + regstartp[paren] = reginput - l; + regendp[paren] = reginput; + } else + regendp[paren] = NULL; + } + if (regmatch(next)) + sayYES; + REGCP_UNWIND; + } + /* Couldn't or didn't -- back up. */ + n--; + locinput -= l; + reginput = locinput; + } + } + sayNO; + break; + } + case CURLYN: + paren = scan->flags; /* Which paren to set */ + if (paren > regsize) + regsize = paren; + if (paren > *reglastparen) + *reglastparen = paren; + ln = ARG1(scan); /* min to match */ + n = ARG2(scan); /* max to match */ + scan = regnext(NEXTOPER(scan) + 4/sizeof(regnode)); + goto repeat; case CURLY: + paren = 0; ln = ARG1(scan); /* min to match */ n = ARG2(scan); /* max to match */ - scan = NEXTOPER(scan) + 4; + scan = NEXTOPER(scan) + 4/sizeof(regnode); goto repeat; case STAR: ln = 0; - n = 32767; + n = REG_INFTY; scan = NEXTOPER(scan); + paren = 0; goto repeat; case PLUS: + ln = 1; + n = REG_INFTY; + scan = NEXTOPER(scan); + paren = 0; + repeat: /* * Lookahead to avoid useless match attempts * when we know what character comes next. */ - ln = 1; - n = 32767; - scan = NEXTOPER(scan); - repeat: if (regkind[(U8)OP(next)] == EXACT) { c1 = UCHARAT(OPERAND(next) + 1); if (OP(next) == EXACTF) @@ -1085,67 +1464,130 @@ regmatch(char *prog) c1 = c2 = -1000; reginput = locinput; if (minmod) { + CHECKPOINT lastcp; minmod = 0; if (ln && regrepeat(scan, ln) < ln) sayNO; - while (n >= ln || (n == 32767 && ln > 0)) { /* ln overflow ? */ + REGCP_SET; + while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */ /* If it could work, try it. */ if (c1 == -1000 || UCHARAT(reginput) == c1 || UCHARAT(reginput) == c2) { + if (paren) { + if (n) { + regstartp[paren] = reginput - 1; + regendp[paren] = reginput; + } else + regendp[paren] = NULL; + } if (regmatch(next)) sayYES; + REGCP_UNWIND; } - /* Couldn't or didn't -- back up. */ + /* Couldn't or didn't -- move forward. */ reginput = locinput + ln; if (regrepeat(scan, 1)) { ln++; reginput = locinput + ln; - } - else + } else sayNO; } } else { + CHECKPOINT lastcp; n = regrepeat(scan, n); if (ln < n && regkind[(U8)OP(next)] == EOL && - (!multiline || OP(next) == SEOL)) + (!multiline || OP(next) == SEOL)) ln = n; /* why back off? */ - while (n >= ln) { - /* If it could work, try it. */ - if (c1 == -1000 || - UCHARAT(reginput) == c1 || - UCHARAT(reginput) == c2) - { - if (regmatch(next)) - sayYES; + REGCP_SET; + if (paren) { + while (n >= ln) { + /* If it could work, try it. */ + if (c1 == -1000 || + UCHARAT(reginput) == c1 || + UCHARAT(reginput) == c2) + { + if (paren && n) { + if (n) { + regstartp[paren] = reginput - 1; + regendp[paren] = reginput; + } else + regendp[paren] = NULL; + } + if (regmatch(next)) + sayYES; + REGCP_UNWIND; + } + /* Couldn't or didn't -- back up. */ + n--; + reginput = locinput + n; + } + } else { + while (n >= ln) { + /* If it could work, try it. */ + if (c1 == -1000 || + UCHARAT(reginput) == c1 || + UCHARAT(reginput) == c2) + { + if (regmatch(next)) + sayYES; + REGCP_UNWIND; + } + /* Couldn't or didn't -- back up. */ + n--; + reginput = locinput + n; } - /* Couldn't or didn't -- back up. */ - n--; - reginput = locinput + n; } } sayNO; + break; case SUCCEED: case END: reginput = locinput; /* put where regtry can find it */ sayYES; /* Success! */ - case IFMATCH: - reginput = locinput; - scan = NEXTOPER(scan); - if (!regmatch(scan)) - sayNO; - break; + case SUSPEND: + n = 1; + goto do_ifmatch; case UNLESSM: - reginput = locinput; - scan = NEXTOPER(scan); - if (regmatch(scan)) - sayNO; + n = 0; + if (locinput < bostr + scan->flags) + goto say_yes; + goto do_ifmatch; + case IFMATCH: + n = 1; + if (locinput < bostr + scan->flags) + goto say_no; + do_ifmatch: + reginput = locinput - scan->flags; + inner = NEXTOPER(NEXTOPER(scan)); + if (regmatch(inner) != n) { + say_no: + if (logical) { + logical = 0; + sw = 0; + goto do_longjump; + } else + sayNO; + } + say_yes: + if (logical) { + logical = 0; + sw = 1; + } + if (OP(scan) == SUSPEND) + locinput = reginput; + /* FALL THROUGH. */ + case LONGJMP: + do_longjump: + next = scan + ARG(scan); + if (next == scan) + next = NULL; break; default: PerlIO_printf(PerlIO_stderr(), "%lx %d\n", - (unsigned long)scan, scan[1]); + (unsigned long)scan, OP(scan)); FAIL("regexp memory corruption"); } scan = next; @@ -1181,7 +1623,7 @@ no: * rather than incrementing count on every character.] */ static I32 -regrepeat(char *p, I32 max) +regrepeat(regnode *p, I32 max) { register char *scan; register char *opnd; @@ -1189,7 +1631,7 @@ regrepeat(char *p, I32 max) register char *loceol = regeol; scan = reginput; - if (max != 32767 && max < loceol - scan) + if (max != REG_INFTY && max < loceol - scan) loceol = scan + max; opnd = OPERAND(p); switch (OP(p)) { @@ -1212,7 +1654,7 @@ regrepeat(char *p, I32 max) scan++; break; case EXACTFL: /* length of string is 1 */ - regtainted = TRUE; + reg_flags |= RF_tainted; c = UCHARAT(++opnd); while (scan < loceol && (UCHARAT(scan) == c || UCHARAT(scan) == fold_locale[c])) @@ -1227,7 +1669,7 @@ regrepeat(char *p, I32 max) scan++; break; case ALNUML: - regtainted = TRUE; + reg_flags |= RF_tainted; while (scan < loceol && isALNUM_LC(*scan)) scan++; break; @@ -1236,7 +1678,7 @@ regrepeat(char *p, I32 max) scan++; break; case NALNUML: - regtainted = TRUE; + reg_flags |= RF_tainted; while (scan < loceol && !isALNUM_LC(*scan)) scan++; break; @@ -1245,7 +1687,7 @@ regrepeat(char *p, I32 max) scan++; break; case SPACEL: - regtainted = TRUE; + reg_flags |= RF_tainted; while (scan < loceol && isSPACE_LC(*scan)) scan++; break; @@ -1254,7 +1696,7 @@ regrepeat(char *p, I32 max) scan++; break; case NSPACEL: - regtainted = TRUE; + reg_flags |= RF_tainted; while (scan < loceol && !isSPACE_LC(*scan)) scan++; break; @@ -1273,10 +1715,53 @@ regrepeat(char *p, I32 max) c = scan - reginput; reginput = scan; + DEBUG_r( + { + SV *prop = sv_newmortal(); + + regprop(prop, p); + PerlIO_printf(Perl_debug_log, + "%*s %s can match %ld times out of %ld...\n", + REPORT_CODE_OFF+1, "", SvPVX(prop),c,max); + }); + return(c); } /* + - regrepeat_hard - repeatedly match something, report total lenth and length + * + * The repeater is supposed to have constant length. + */ + +static I32 +regrepeat_hard(regnode *p, I32 max, I32 *lp) +{ + register char *scan; + register char *start; + register char *loceol = regeol; + I32 l = -1; + + start = reginput; + while (reginput < loceol && (scan = reginput, regmatch(p))) { + if (l == -1) { + *lp = l = reginput - start; + if (max != REG_INFTY && l*max < loceol - scan) + loceol = scan + l*max; + if (l == 0) { + return max; + } + } + } + if (reginput < loceol) + reginput = scan; + else + scan = reginput; + + return (scan - start)/l; +} + +/* - regclass - determine if a character falls into a character class */ @@ -1292,7 +1777,7 @@ reginclass(register char *p, register I32 c) else if (flags & ANYOF_FOLD) { I32 cf; if (flags & ANYOF_LOCALE) { - regtainted = TRUE; + reg_flags |= RF_tainted; cf = fold_locale[c]; } else @@ -1302,7 +1787,7 @@ reginclass(register char *p, register I32 c) } if (!match && (flags & ANYOF_ISA)) { - regtainted = TRUE; + reg_flags |= RF_tainted; if (((flags & ANYOF_ALNUML) && isALNUM_LC(c)) || ((flags & ANYOF_NALNUML) && !isALNUM_LC(c)) || @@ -1316,30 +1801,3 @@ reginclass(register char *p, register I32 c) return match ^ ((flags & ANYOF_INVERT) != 0); } -/* - - regnext - dig the "next" pointer out of a node - * - * [Note, when REGALIGN is defined there are two places in regmatch() - * that bypass this code for speed.] - */ -char * -regnext(register char *p) -{ - register I32 offset; - - if (p == ®dummy) - return(NULL); - - offset = NEXT(p); - if (offset == 0) - return(NULL); - -#ifdef REGALIGN - return(p+offset); -#else - if (OP(p) == BACK) - return(p-offset); - else - return(p+offset); -#endif -} diff --git a/regexp.h b/regexp.h index 684851c..2f7aa02 100644 --- a/regexp.h +++ b/regexp.h @@ -9,13 +9,19 @@ */ +struct regnode { + U8 flags; + U8 type; + U16 next_off; +}; + +typedef struct regnode regnode; + typedef struct regexp { + I32 refcnt; char **startp; char **endp; - SV *regstart; /* Internal use only. */ - char *regstclass; - SV *regmust; /* Internal use only. */ - I32 regback; /* Can regmust locate first try? */ + regnode *regstclass; I32 minlen; /* mininum possible length of $& */ I32 prelen; /* length of precomp */ U32 nparens; /* number of parentheses */ @@ -25,13 +31,41 @@ typedef struct regexp { char *subbeg; /* same, but not responsible for allocation */ char *subend; /* end of subbase */ U16 naughty; /* how exponential is this pattern? */ - char reganch; /* Internal use only. */ - char exec_tainted; /* Tainted information used by regexec? */ - char program[1]; /* Unwarranted chumminess with compiler. */ + U16 reganch; /* Internal use only + + Tainted information used by regexec? */ + SV *anchored_substr; /* Substring at fixed position wrt start. */ + I32 anchored_offset; /* Position of it. */ + SV *float_substr; /* Substring at variable position wrt start. */ + I32 float_min_offset; /* Minimal position of it. */ + I32 float_max_offset; /* Maximal position of it. */ + SV *check_substr; /* Substring to check before matching. */ + I32 check_offset_min; /* Offset of the above. */ + I32 check_offset_max; /* Offset of the above. */ + struct reg_data *data; /* Additional data. */ + regnode program[1]; /* Unwarranted chumminess with compiler. */ } regexp; -#define ROPT_ANCH 3 -#define ROPT_ANCH_BOL 1 -#define ROPT_ANCH_GPOS 2 -#define ROPT_SKIP 4 -#define ROPT_IMPLICIT 8 +#define ROPT_ANCH (ROPT_ANCH_BOL|ROPT_ANCH_MBOL|ROPT_ANCH_GPOS) +#define ROPT_ANCH_SINGLE (ROPT_ANCH_BOL|ROPT_ANCH_GPOS) +#define ROPT_ANCH_BOL 1 +#define ROPT_ANCH_MBOL 2 +#define ROPT_ANCH_GPOS 4 +#define ROPT_SKIP 8 +#define ROPT_IMPLICIT 0x10 /* Converted .* to ^.* */ +#define ROPT_NOSCAN 0x20 /* Check-string always at start. */ +#define ROPT_GPOS_SEEN 0x40 +#define ROPT_CHECK_ALL 0x80 +#define ROPT_LOOKBEHIND_SEEN 0x100 + +#define ROPT_TAINTED_SEEN 0x8000 + +#define RX_MATCH_TAINTED(prog) ((prog)->reganch & ROPT_TAINTED_SEEN) +#define RX_MATCH_TAINTED_SET(prog, t) ((t) \ + ? ((prog)->reganch |= ROPT_TAINTED_SEEN) \ + : ((prog)->reganch &= ~ROPT_TAINTED_SEEN)) + +#define REXEC_COPY_STR 1 /* Need to copy the string. */ +#define REXEC_CHECKED 2 /* check_substr already checked. */ + +#define ReREFCNT_inc(re) ((re && re->refcnt++), re) +#define ReREFCNT_dec(re) pregfree(re) diff --git a/sv.c b/sv.c index aeb2055..408cc77 100644 --- a/sv.c +++ b/sv.c @@ -2355,7 +2355,7 @@ sv_magic(register SV *sv, SV *obj, int how, char *name, I32 namlen) mg->mg_moremagic = SvMAGIC(sv); SvMAGIC(sv) = mg; - if (!obj || obj == sv || how == '#') + if (!obj || obj == sv || how == '#' || how == 'r') mg->mg_obj = obj; else { dTHR; @@ -2435,6 +2435,9 @@ sv_magic(register SV *sv, SV *obj, int how, char *name, I32 namlen) case 'q': mg->mg_virtual = &vtbl_packelem; break; + case 'r': + mg->mg_virtual = &vtbl_regexp; + break; case 'S': mg->mg_virtual = &vtbl_sig; break; @@ -4657,6 +4660,10 @@ sv_dump(SV *sv) sv_catpv(d, " ),"); } } + case SVt_PVBM: + if (SvTAIL(sv)) sv_catpv(d, "TAIL,"); + if (SvCOMPILED(sv)) sv_catpv(d, "COMPILED,"); + break; } if (*(SvEND(d) - 1) == ',') diff --git a/t/op/misc.t b/t/op/misc.t index 5a61acd..c529830 100755 --- a/t/op/misc.t +++ b/t/op/misc.t @@ -335,3 +335,13 @@ print "eat flaming death\n" unless ($s == 7); sub foo { local $_ = shift; split; @_ } @x = foo(' x y z '); print "you die joe!\n" unless "@x" eq 'x y z'; +######## +/(?{"{"})/ # Check it outside of eval too +EXPECT +/(?{"{"})/: Sequence (?{...}) not terminated or not {}-balanced at - line 1. +######## +/(?{"{"}})/ # Check it outside of eval too +EXPECT +Unmatched right bracket at (re_eval 1) line 1, at end of line +syntax error at (re_eval 1) line 1, near ""{"}" +Compilation failed in regexp at - line 1. diff --git a/t/op/pat.t b/t/op/pat.t index 0478911..03af122 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -2,7 +2,7 @@ # $RCSfile: pat.t,v $$Revision: 4.1 $$Date: 92/08/07 18:28:12 $ -print "1..62\n"; +print "1..97\n"; $x = "abc\ndef\n"; @@ -217,3 +217,114 @@ print "ok 61\n"; /\Gc/g; print "not " if defined pos $_; print "ok 62\n"; + +$out = 1; +'abc' =~ m'a(?{ $out = 2 })b'; +print "not " if $out != 2; +print "ok 63\n"; + +$out = 1; +'abc' =~ m'a(?{ $out = 3 })c'; +print "not " if $out != 1; +print "ok 64\n"; + +$_ = 'foobar1 bar2 foobar3 barfoobar5 foobar6'; +@out = /(? 1, + 'ax13876y25677mcb' => 0, # not b. + 'ax13876y35677nbc' => 0, # Num too big + 'ax13876y25677y21378obc' => 1, + 'ax13876y25677y21378zbc' => 0, # Not followed by [k-o] + 'ax13876y25677y21378y21378kbc' => 1, + 'ax13876y25677y21378y21378kcb' => 0, # Not b. + 'ax13876y25677y21378y21378y21378kbc' => 0, # 5 runs + ); + +for ( keys %ans ) { + print "# const-len `$_' not => $ans{$_}\nnot " + if $ans{$_} xor /a(?=([yx]($long_constant_len)){2,4}[k-o]).*b./o; + print "ok $test\n"; + $test++; + print "# var-len `$_' not => $ans{$_}\nnot " + if $ans{$_} xor /a(?=([yx]($long_var_len)){2,4}[k-o]).*b./o; + print "ok $test\n"; + $test++; +} + +$_ = " a (bla()) and x(y b((l)u((e))) and b(l(e)e)e"; +$expect = "(bla()) ((l)u((e))) (l(e)e)"; + +sub matchit { + m' + ( + \( + (?{ $c = 1 }) # Initialize + (?: + (?(?{ $c == 0 }) # PREVIOUS iteration was OK, stop the loop + (?! + ) # Fail: will unwind one iteration back + ) + (?: + [^()]+ # Match a big chunk + (?= + [()] + ) # Do not try to match subchunks + | + \( + (?{ ++$c }) + | + \) + (?{ --$c }) + ) + )+ # This may not match with different subblocks + ) + (?(?{ $c != 0 }) + (?! + ) # Fail + ) # Otherwise the chunk 1 may succeed with $c>0 + 'xg; +} + +push @ans, $res while $res = matchit; + +print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne "1 1 1"; +print "ok $test\n"; +$test++; + +@ans = matchit; + +print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne $expect; +print "ok $test\n"; +$test++; + +@ans = ('a/b' =~ m%(.*/)?(.*)%); # Stack may be bad +print "not " if "@ans" ne 'a/ b'; +print "ok $test\n"; +$test++; + +$code = '$blah = 45'; +$blah = 12; +/(?{$code})/; +print "not " if $blah != 45; +print "ok $test\n"; +$test++; + diff --git a/t/op/re_tests b/t/op/re_tests index ce4c5a5..29a6518 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -8,6 +8,8 @@ ab*c abc y $& abc ab*bc abc y $& abc ab*bc abbc y $& abbc ab*bc abbbbc y $& abbbbc +.{1} abbbbc y $& a +.{3,4} abbbbc y $& abbb ab{0,}bc abbbbc y $& abbbbc ab+bc abbc y $& abbc ab+bc abc n - - @@ -29,6 +31,7 @@ ab{0,1}c abc y $& abc ^abc abcc y $& abc ^abc$ aabc n - - abc$ aabc y $& abc +abc$ aabcd n - - ^ abc y $& $ abc y $& a.c abc y $& abc @@ -299,10 +302,132 @@ a(?=c|d). abad y $& ad a(?:b|c|d)(.) ace y $1 e a(?:b|c|d)*(.) ace y $1 e a(?:b|c|d)+?(.) ace y $1 e +a(?:b|c|d)+?(.) acdbcdbe y $1 d +a(?:b|c|d)+(.) acdbcdbe y $1 e +a(?:b|c|d){2}(.) acdbcdbe y $1 b +a(?:b|c|d){4,5}(.) acdbcdbe y $1 b +a(?:b|c|d){4,5}?(.) acdbcdbe y $1 d +((foo)|(bar))* foobar y $1-$2-$3 bar-foo-bar +:(?: - c - /(?/: Sequence (? incomplete +a(?:b|c|d){6,7}(.) acdbcdbe y $1 e +a(?:b|c|d){6,7}?(.) acdbcdbe y $1 e +a(?:b|c|d){5,6}(.) acdbcdbe y $1 e +a(?:b|c|d){5,6}?(.) acdbcdbe y $1 b +a(?:b|c|d){5,7}(.) acdbcdbe y $1 e +a(?:b|c|d){5,7}?(.) acdbcdbe y $1 b a(?:b|(c|e){1,2}?|d)+?(.) ace y $1$2 ce ^(.+)?B AB y $1 A -'([a-z]+)\s\1'i Aa aa y $&-$1 Aa aa-Aa -'([a-z]+)\s\1'i Ab ab y $&-$1 Ab ab-Ab +^([^a-z])|(\^)$ . y $1 . +^[<>]& <&OUT y $& <& +^(a\1?){4}$ aaaaaaaaaa y $1 aaaa +^(a\1?){4}$ aaaaaaaaa n - - +^(a\1?){4}$ aaaaaaaaaaa n - - +^(a\1){4}$ aaaaaaaaaa y $1 aaaa +^(a\1){4}$ aaaaaaaaa n - - +^(a\1){4}$ aaaaaaaaaaa n - - +(?:(f)(o)(o)|(b)(a)(r))* foobar y $1:$2:$3:$4:$5:$6 f:o:o:b:a:r +(?<=a)b ab y $& b +(?<=a)b cb n - - +(?<=a)b b n - - +(?a+)ab aaab n - - +((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x y $& abc(ade)ufh()()x diff --git a/t/op/regexp.t b/t/op/regexp.t index 803f1d0..2736084 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -19,7 +19,11 @@ # Column 4 contains a string, usually C<$&>. # # Column 5 contains the expected result of double-quote -# interpolating that string after the match. +# interpolating that string after the match, or start of error message. +# +# Columns 1, 2 and 5 are \n-interpolated. + +$iters = shift || 1; # Poor man performance suite, 10000 is OK. open(TESTS,'op/re_tests') || open(TESTS,'t/op/re_tests') || die "Can't open re_tests"; @@ -30,24 +34,33 @@ seek(TESTS,0,0); $. = 0; $| = 1; -print "1..$numtests\n"; +print "1..$numtests\n# $iters iterations\n"; TEST: while () { ($pat, $subject, $result, $repl, $expect) = split(/[\t\n]/,$_); $input = join(':',$pat,$subject,$result,$repl,$expect); $pat = "'$pat'" unless $pat =~ /^[:']/; + $pat =~ s/\\n/\n/g; + $subject =~ s/\\n/\n/g; + $expect =~ s/\\n/\n/g; + $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; for $study ("", "study \$subject") { - eval "$study; \$match = (\$subject =~ m$pat); \$got = \"$repl\";"; + $c = $iters; + eval "$study; \$match = (\$subject =~ m$pat) while \$c--; \$got = \"$repl\";"; + chomp( $err = $@ ); if ($result eq 'c') { - if ($@ !~ m!^\Q$expect!) { print "not ok $.\n"; next TEST } + if ($err !~ m!^\Q$expect!) { print "not ok $. (compile) $input => `$err'\n"; next TEST } last; # no need to study a syntax error } + elsif ($@) { + print "not ok $. $input => error `$err'\n"; next TEST; + } elsif ($result eq 'n') { - if ($match) { print "not ok $. $input => $got\n"; next TEST } + if ($match) { print "not ok $. ($study) $input => false positive\n"; next TEST } } else { if (!$match || $got ne $expect) { - print "not ok $. $input => $got\n"; + print "not ok $. ($study) $input => `$got', match=$match\n"; next TEST; } } diff --git a/t/op/split.t b/t/op/split.t index 0724652..7f0acce 100755 --- a/t/op/split.t +++ b/t/op/split.t @@ -2,7 +2,7 @@ # $RCSfile: split.t,v $$Revision: 4.1 $$Date: 92/08/07 18:28:26 $ -print "1..20\n"; +print "1..25\n"; $FS = ':'; @@ -90,3 +90,24 @@ print $_ eq "Z" ? "ok 19\n" : "#$_\nnot ok 19\n"; $_ = join('|', split(/.?/, '',-1), 'Z'); print $_ eq "Z" ? "ok 20\n" : "#$_\nnot ok 20\n"; + +# Are /^/m patterns scanned? +$_ = join '|', split(/^a/m, "a b a\na d a", 20); +print $_ eq "| b a\n| d a" ? "ok 21\n" : "not ok 21\n# `$_'\n"; + +# Are /$/m patterns scanned? +$_ = join '|', split(/a$/m, "a b a\na d a", 20); +print $_ eq "a b |\na d |" ? "ok 22\n" : "not ok 22\n# `$_'\n"; + +# Are /^/m patterns scanned? +$_ = join '|', split(/^aa/m, "aa b aa\naa d aa", 20); +print $_ eq "| b aa\n| d aa" ? "ok 23\n" : "not ok 23\n# `$_'\n"; + +# Are /$/m patterns scanned? +$_ = join '|', split(/aa$/m, "aa b aa\naa d aa", 20); +print $_ eq "aa b |\naa d |" ? "ok 24\n" : "not ok 24\n# `$_'\n"; + +# Greedyness: +$_ = "a : b :c: d"; +@ary = split(/\s*:\s*/); +if (($res = join(".",@ary)) eq "a.b.c.d") {print "ok 25\n";} else {print "not ok 25\n# res=`$res' != `a.b.c.d'\n";} diff --git a/t/op/subst.t b/t/op/subst.t index efea970..c6cfb8c 100755 --- a/t/op/subst.t +++ b/t/op/subst.t @@ -2,7 +2,7 @@ # $RCSfile: s.t,v $$Revision: 4.1 $$Date: 92/08/07 18:28:22 $ -print "1..62\n"; +print "1..67\n"; $x = 'foo'; $_ = "x"; @@ -157,11 +157,11 @@ $x ne $x || s/bb/x/; print $_ eq 'aaaXXXXxb' ? "ok 39\n" : "not ok 39\n"; $_ = 'abc123xyz'; -s/\d+/$&*2/e; # yields 'abc246xyz' +s/(\d+)/$1*2/e; # yields 'abc246xyz' print $_ eq 'abc246xyz' ? "ok 40\n" : "not ok 40\n"; -s/\d+/sprintf("%5d",$&)/e; # yields 'abc 246xyz' +s/(\d+)/sprintf("%5d",$1)/e; # yields 'abc 246xyz' print $_ eq 'abc 246xyz' ? "ok 41\n" : "not ok 41\n"; -s/\w/$& x 2/eg; # yields 'aabbcc 224466xxyyzz' +s/(\w)/$1 x 2/eg; # yields 'aabbcc 224466xxyyzz' print $_ eq 'aabbcc 224466xxyyzz' ? "ok 42\n" : "not ok 42\n"; $_ = "aaaaa"; @@ -232,10 +232,32 @@ print exp_vars('foo $(DIR)/yyy bar',0) eq 'foo $(UNDEFINEDNAME)/xxx/yyy bar' # a match nested in the RHS of a substitution: $_ = "abcd"; -s/../$x = $&, m#.#/eg; +s/(..)/$x = $1, m#.#/eg; print $x eq "cd" ? "ok 61\n" : "not ok 61\n"; +# Subst and lookbehind + +$_="ccccc"; +s/(?op_pmshort && pm->op_pmregexp->regstart && - (!pm->op_pmregexp->regmust || pm->op_pmregexp->reganch & ROPT_ANCH) - ) { - if (!(pm->op_pmregexp->reganch & ROPT_ANCH)) - pm->op_pmflags |= PMf_SCANFIRST; - pm->op_pmshort = SvREFCNT_inc(pm->op_pmregexp->regstart); - pm->op_pmslen = SvCUR(pm->op_pmshort); - } - else if (pm->op_pmregexp->regmust) {/* is there a better short-circuit? */ - if (pm->op_pmshort && - sv_eq(pm->op_pmshort,pm->op_pmregexp->regmust)) - { - if (pm->op_pmflags & PMf_SCANFIRST) { - SvREFCNT_dec(pm->op_pmshort); - pm->op_pmshort = Nullsv; - } - else { - SvREFCNT_dec(pm->op_pmregexp->regmust); - pm->op_pmregexp->regmust = Nullsv; - return; - } - } - /* promote the better string */ - if ((!pm->op_pmshort && - !(pm->op_pmregexp->reganch & ROPT_ANCH_GPOS)) || - ((pm->op_pmflags & PMf_SCANFIRST) && - (SvCUR(pm->op_pmshort) < SvCUR(pm->op_pmregexp->regmust)))) { - SvREFCNT_dec(pm->op_pmshort); /* ok if null */ - pm->op_pmshort = pm->op_pmregexp->regmust; - pm->op_pmslen = SvCUR(pm->op_pmshort); - pm->op_pmregexp->regmust = Nullsv; - pm->op_pmflags |= PMf_SCANFIRST; - } - } -} - static char * scan_trans(char *start) { diff --git a/util.c b/util.c index 767082c..bc90b55 100644 --- a/util.c +++ b/util.c @@ -819,7 +819,8 @@ fbm_compile(SV *sv) I32 rarest = 0; U32 frequency = 256; - if (len > 255) + sv_upgrade(sv, SVt_PVBM); + if (len > 255 || len == 0) /* TAIL might be on on a zero-length string. */ return; /* can't have offsets that big */ Sv_Grow(sv,len+258); table = (unsigned char*)(SvPVX(sv) + len + 1); @@ -834,7 +835,6 @@ fbm_compile(SV *sv) table[*s] = i; s--,i++; } - sv_upgrade(sv, SVt_PVBM); sv_magic(sv, Nullsv, 'B', Nullch, 0); /* deep magic */ SvVALID_on(sv); @@ -864,8 +864,15 @@ fbm_instr(unsigned char *big, register unsigned char *bigend, SV *littlestr) if (SvTYPE(littlestr) != SVt_PVBM || !SvVALID(littlestr)) { STRLEN len; char *l = SvPV(littlestr,len); - if (!len) + if (!len) { + if (SvTAIL(littlestr)) { + if (bigend > big && bigend[-1] == '\n') + return bigend - 1; + else + return bigend; + } return (char*)big; + } return ninstr((char*)big,(char*)bigend, l, l + len); } @@ -911,20 +918,35 @@ fbm_instr(unsigned char *big, register unsigned char *bigend, SV *littlestr) while (tmp--) { if (*--s == *--little) continue; + differ: s = olds + 1; /* here we pay the price for failure */ little = oldlittle; if (s < bigend) /* fake up continue to outer loop */ goto top2; return Nullch; } + if (SvTAIL(littlestr) /* automatically multiline */ + && olds + 1 != bigend + && olds[1] != '\n') + goto differ; return (char *)s; } } return Nullch; } +/* start_shift, end_shift are positive quantities which give offsets + of ends of some substring of bigstr. + If `last' we want the last occurence. + old_posp is the way of communication between consequent calls if + the next call needs to find the . + The initial *old_posp should be -1. + Note that we do not take into account SvTAIL, so it may give wrong + positives if _ALL flag is set. + */ + char * -screaminstr(SV *bigstr, SV *littlestr) +screaminstr(SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift, I32 *old_posp, I32 last) { register unsigned char *s, *x; register unsigned char *big; @@ -932,54 +954,65 @@ screaminstr(SV *bigstr, SV *littlestr) register I32 previous; register I32 first; register unsigned char *little; - register unsigned char *bigend; + register I32 stop_pos; register unsigned char *littleend; + I32 found = 0; - if ((pos = screamfirst[BmRARE(littlestr)]) < 0) + if (*old_posp == -1 + ? (pos = screamfirst[BmRARE(littlestr)]) < 0 + : (((pos = *old_posp), pos += screamnext[pos]) == 0)) return Nullch; little = (unsigned char *)(SvPVX(littlestr)); littleend = little + SvCUR(littlestr); first = *little++; + /* The value of pos we can start at: */ previous = BmPREVIOUS(littlestr); big = (unsigned char *)(SvPVX(bigstr)); - bigend = big + SvCUR(bigstr); - while (pos < previous) { + /* The value of pos we can stop at: */ + stop_pos = SvCUR(bigstr) - end_shift - (SvCUR(littlestr) - 1 - previous); + if (previous + start_shift > stop_pos) return Nullch; + while (pos < previous + start_shift) { if (!(pos += screamnext[pos])) return Nullch; } #ifdef POINTERRIGOR do { + if (pos >= stop_pos) return Nullch; if (big[pos-previous] != first) continue; for (x=big+pos+1-previous,s=little; s < littleend; /**/ ) { - if (x >= bigend) - return Nullch; if (*s++ != *x++) { s--; break; } } - if (s == littleend) - return (char *)(big+pos-previous); + if (s == littleend) { + *old_posp = pos; + if (!last) return (char *)(big+pos-previous); + found = 1; + } } while ( pos += screamnext[pos] ); + return (last && found) ? (char *)(big+(*old_posp)-previous) : Nullch; #else /* !POINTERRIGOR */ big -= previous; do { + if (pos >= stop_pos) return Nullch; if (big[pos] != first) continue; for (x=big+pos+1,s=little; s < littleend; /**/ ) { - if (x >= bigend) - return Nullch; if (*s++ != *x++) { s--; break; } } - if (s == littleend) - return (char *)(big+pos); + if (s == littleend) { + *old_posp = pos; + if (!last) return (char *)(big+pos); + found = 1; + } } while ( pos += screamnext[pos] ); + return (last && found) ? (char *)(big+(*old_posp)) : Nullch; #endif /* POINTERRIGOR */ - return Nullch; } I32