X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regexec.c;h=e8434784518c2d4ffb8797ffaedbc6bf6545b399;hb=e77e2f143f073d08d6764e30771960b2bccde0db;hp=ebe7883b6091d1ee5533814aecd11ee102b63402;hpb=faf82a0b75a45f1e4dbb7ad8cecdfaf9a30a643d;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regexec.c b/regexec.c index ebe7883..e843478 100644 --- a/regexec.c +++ b/regexec.c @@ -5,6 +5,11 @@ * "One Ring to rule them all, One Ring to find them..." */ +/* This file contains functions for executing a regular expression. See + * also regcomp.c which funnily enough, contains functions for compiling + * a regular expression. + */ + /* NOTE: this is derived from Henry Spencer's regexp code, and should not * confused with the original package (see point 3 below). Thanks, Henry! */ @@ -67,7 +72,8 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-2002, Larry Wall + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -86,6 +92,7 @@ #define RF_warned 2 /* warned about big count? */ #define RF_evaled 4 /* Did an EVAL with setting? */ #define RF_utf8 8 /* String contains multibyte chars? */ +#define RF_false 16 /* odd number of nested negatives */ #define UTF ((PL_reg_flags & RF_utf8) != 0) @@ -171,7 +178,7 @@ S_regcppush(pTHX_ I32 parenfloor) Perl_croak(aTHX_ "panic: paren_elems_to_push < 0"); #define REGCP_OTHER_ELEMS 6 - SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS); + SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS); for (p = PL_regsize; p > parenfloor; p--) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ SSPUSHINT(PL_regendp[p]); @@ -544,7 +551,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, goto fail_finish; /* we may be pointing at the wrong string */ if (s && RX_MATCH_COPIED(prog)) - s = prog->subbeg + (s - SvPVX(sv)); + s = strbeg + (s - SvPVX(sv)); if (data) *data->scream_olds = s; } @@ -852,10 +859,6 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, char *startpos = strbeg; t = s; - if (prog->reganch & ROPT_UTF8) { - PL_regdata = prog->data; - PL_bostr = startpos; - } cache_re(prog); s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1); if (!s) { @@ -955,6 +958,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta I32 doevery = (prog->reganch & ROPT_SKIP) == 0; char *m; STRLEN ln; + STRLEN lnc; + register STRLEN uskip; unsigned int c1; unsigned int c2; char *e; @@ -965,7 +970,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta switch (OP(c)) { case ANYOF: if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) || !UTF8_IS_INVARIANT((U8)s[0]) ? reginclass(c, (U8*)s, 0, do_utf8) : @@ -977,7 +982,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1010,10 +1015,12 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } break; case EXACTF: - m = STRING(c); - ln = STR_LEN(c); + m = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + lnc = (I32) ln; /* length to match in characters */ if (UTF) { STRLEN ulen1, ulen2; + U8 *sm = (U8 *) m; U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; @@ -1024,6 +1031,11 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + lnc = 0; + while (sm < ((U8 *) m + ln)) { + lnc++; + sm += UTF8SKIP(sm); + } } else { c1 = *(U8*)m; @@ -1031,12 +1043,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } goto do_exactf; case EXACTFL: - m = STRING(c); - ln = STR_LEN(c); + m = STRING(c); + ln = STR_LEN(c); + lnc = (I32) ln; c1 = *(U8*)m; c2 = PL_fold_locale[c1]; do_exactf: - e = HOP3c(strend, -(I32)ln, s); + e = HOP3c(strend, -((I32)lnc), s); if (norun && e < s) e = s; /* Due to minlen logic of intuit() */ @@ -1059,6 +1072,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta STRLEN len, foldlen; if (c1 == c2) { + /* Upper and lower of 1st char are equal - + * probably not a "letter". */ while (s <= e) { c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, ckWARN(WARN_UTF8) ? @@ -1163,7 +1178,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (tmp == !(OP(c) == BOUND ? swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : isALNUM_LC_utf8((U8*)s))) @@ -1172,7 +1187,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if ((norun || regtry(prog, s))) goto got_it; } - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1206,14 +1221,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (tmp == !(OP(c) == NBOUND ? swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : isALNUM_LC_utf8((U8*)s))) tmp = !tmp; else if ((norun || regtry(prog, s))) goto got_it; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1235,7 +1250,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case ALNUM: if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1244,7 +1259,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1264,7 +1279,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case ALNUML: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (isALNUM_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1273,7 +1288,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1293,7 +1308,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NALNUM: if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1302,7 +1317,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1322,7 +1337,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NALNUML: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!isALNUM_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1331,7 +1346,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1351,7 +1366,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case SPACE: if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1360,7 +1375,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1380,7 +1395,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case SPACEL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1389,7 +1404,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1409,7 +1424,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NSPACE: if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1418,7 +1433,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1438,7 +1453,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NSPACEL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1447,7 +1462,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1467,7 +1482,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case DIGIT: if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1476,7 +1491,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1496,7 +1511,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case DIGITL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (isDIGIT_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1505,7 +1520,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1525,7 +1540,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NDIGIT: if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1534,7 +1549,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1554,7 +1569,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NDIGITL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!isDIGIT_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1563,7 +1578,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1866,7 +1881,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * PL_multiline ? FBMrf_MULTILINE : 0))) ) { /* we may be pointing at the wrong string */ if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog)) - s = prog->subbeg + (s - SvPVX(sv)); + s = strbeg + (s - SvPVX(sv)); DEBUG_r( did_match = 1 ); if (HOPc(s, -back_max) > last1) { last1 = HOPc(s, -back_min); @@ -1955,7 +1970,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * last = scream_olds; /* Only one occurrence. */ /* we may be pointing at the wrong string */ else if (RX_MATCH_COPIED(prog)) - s = prog->subbeg + (s - SvPVX(sv)); + s = strbeg + (s - SvPVX(sv)); } else { STRLEN len; @@ -2095,8 +2110,7 @@ S_regtry(pTHX_ regexp *prog, char *startpos) if (PL_reg_sv) { /* Make $_ available to executed code. */ if (PL_reg_sv != DEFSV) { - /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */ - SAVESPTR(DEFSV); + SAVE_DEFSV; DEFSV = PL_reg_sv; } @@ -2838,6 +2852,7 @@ S_regmatch(pTHX_ regnode *prog) COP *ocurcop = PL_curcop; PAD *old_comppad; SV *ret; + struct regexp *oreg = PL_reg_re; n = ARG(scan); PL_op = (OP_4tree*)PL_regdata->data[n]; @@ -2970,8 +2985,10 @@ S_regmatch(pTHX_ regnode *prog) sw = SvTRUE(ret); logical = 0; } - else + else { sv_setsv(save_scalar(PL_replgv), ret); + cache_re(oreg); + } break; } case OPEN: @@ -3204,7 +3221,10 @@ S_regmatch(pTHX_ regnode *prog) "%*s already tried at this position...\n", REPORT_CODE_OFF+PL_regindent*2, "") ); - sayNO_SILENT; + if (PL_reg_flags & RF_false) + sayYES; + else + sayNO_SILENT; } PL_reg_poscache[o] |= (1<= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */ + while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */ /* If it could work, try it. */ if (c1 == -1000 || UCHARAT(PL_reginput) == c1 || @@ -3449,13 +3448,6 @@ S_regmatch(pTHX_ regnode *prog) } else { n = regrepeat_hard(scan, n, &l); - /* if we matched something zero-length we don't need to - backtrack, unless the minimum count is zero and we - are capturing the result - in that case the capture - being defined or not may affect later execution - */ - if (n != 0 && l == 0 && !(paren && ln == 0)) - ln = n; /* don't backtrack */ locinput = PL_reginput; DEBUG_r( PerlIO_printf(Perl_debug_log, @@ -3472,19 +3464,8 @@ S_regmatch(pTHX_ regnode *prog) if (! HAS_TEXT(text_node)) c1 = c2 = -1000; else { if (PL_regkind[(U8)OP(text_node)] == REF) { - I32 n, ln; - n = ARG(text_node); /* which paren pair */ - ln = PL_regstartp[n]; - /* assume yes if we haven't seen CLOSEn */ - if ( - (I32)*PL_reglastparen < n || - ln == -1 || - ln == PL_regendp[n] - ) { - c1 = c2 = -1000; - goto assume_ok_REG; - } - c1 = *(PL_bostr + ln); + c1 = c2 = -1000; + goto assume_ok_REG; } else { c1 = (U8)*STRING(text_node); } @@ -3581,19 +3562,8 @@ S_regmatch(pTHX_ regnode *prog) if (! HAS_TEXT(text_node)) c1 = c2 = -1000; else { if (PL_regkind[(U8)OP(text_node)] == REF) { - I32 n, ln; - n = ARG(text_node); /* which paren pair */ - ln = PL_regstartp[n]; - /* assume yes if we haven't seen CLOSEn */ - if ( - (I32)*PL_reglastparen < n || - ln == -1 || - ln == PL_regendp[n] - ) { - c1 = c2 = -1000; - goto assume_ok_easy; - } - s = (U8*)PL_bostr + ln; + c1 = c2 = -1000; + goto assume_ok_easy; } else { s = (U8*)STRING(text_node); } @@ -3887,6 +3857,7 @@ S_regmatch(pTHX_ regnode *prog) } else PL_reginput = locinput; + PL_reg_flags ^= RF_false; goto do_ifmatch; case IFMATCH: n = 1; @@ -3902,6 +3873,8 @@ S_regmatch(pTHX_ regnode *prog) do_ifmatch: inner = NEXTOPER(NEXTOPER(scan)); if (regmatch(inner) != n) { + if (n == 0) + PL_reg_flags ^= RF_false; say_no: if (logical) { logical = 0; @@ -3911,6 +3884,8 @@ S_regmatch(pTHX_ regnode *prog) else sayNO; } + if (n == 0) + PL_reg_flags ^= RF_false; say_yes: if (logical) { logical = 0; @@ -4277,7 +4252,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) /* - regrepeat_hard - repeatedly match something, report total lenth and length * - * The repeater is supposed to have constant length. + * The repeater is supposed to have constant non-zero length. */ STATIC I32