X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regexec.c;h=e8434784518c2d4ffb8797ffaedbc6bf6545b399;hb=e77e2f143f073d08d6764e30771960b2bccde0db;hp=961611b710825188c41b8ae2a755a3fa61517db9;hpb=b0e70d55183e72f46527dd222872eff0e4a92c42;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regexec.c b/regexec.c index 961611b..e843478 100644 --- a/regexec.c +++ b/regexec.c @@ -5,6 +5,11 @@ * "One Ring to rule them all, One Ring to find them..." */ +/* This file contains functions for executing a regular expression. See + * also regcomp.c which funnily enough, contains functions for compiling + * a regular expression. + */ + /* NOTE: this is derived from Henry Spencer's regexp code, and should not * confused with the original package (see point 3 below). Thanks, Henry! */ @@ -68,7 +73,7 @@ **** Alterations to Henry's code are... **** **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, - **** 2000, 2001, 2002, 2003, by Larry Wall and others + **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -87,6 +92,7 @@ #define RF_warned 2 /* warned about big count? */ #define RF_evaled 4 /* Did an EVAL with setting? */ #define RF_utf8 8 /* String contains multibyte chars? */ +#define RF_false 16 /* odd number of nested negatives */ #define UTF ((PL_reg_flags & RF_utf8) != 0) @@ -952,6 +958,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta I32 doevery = (prog->reganch & ROPT_SKIP) == 0; char *m; STRLEN ln; + STRLEN lnc; + register STRLEN uskip; unsigned int c1; unsigned int c2; char *e; @@ -962,7 +970,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta switch (OP(c)) { case ANYOF: if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) || !UTF8_IS_INVARIANT((U8)s[0]) ? reginclass(c, (U8*)s, 0, do_utf8) : @@ -974,7 +982,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1007,10 +1015,12 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } break; case EXACTF: - m = STRING(c); - ln = STR_LEN(c); + m = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + lnc = (I32) ln; /* length to match in characters */ if (UTF) { STRLEN ulen1, ulen2; + U8 *sm = (U8 *) m; U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; @@ -1021,6 +1031,11 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + lnc = 0; + while (sm < ((U8 *) m + ln)) { + lnc++; + sm += UTF8SKIP(sm); + } } else { c1 = *(U8*)m; @@ -1028,12 +1043,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } goto do_exactf; case EXACTFL: - m = STRING(c); - ln = STR_LEN(c); + m = STRING(c); + ln = STR_LEN(c); + lnc = (I32) ln; c1 = *(U8*)m; c2 = PL_fold_locale[c1]; do_exactf: - e = HOP3c(strend, -(I32)ln, s); + e = HOP3c(strend, -((I32)lnc), s); if (norun && e < s) e = s; /* Due to minlen logic of intuit() */ @@ -1056,6 +1072,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta STRLEN len, foldlen; if (c1 == c2) { + /* Upper and lower of 1st char are equal - + * probably not a "letter". */ while (s <= e) { c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, ckWARN(WARN_UTF8) ? @@ -1160,7 +1178,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (tmp == !(OP(c) == BOUND ? swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : isALNUM_LC_utf8((U8*)s))) @@ -1169,7 +1187,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if ((norun || regtry(prog, s))) goto got_it; } - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1203,14 +1221,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (tmp == !(OP(c) == NBOUND ? swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : isALNUM_LC_utf8((U8*)s))) tmp = !tmp; else if ((norun || regtry(prog, s))) goto got_it; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1232,7 +1250,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case ALNUM: if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1241,7 +1259,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1261,7 +1279,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case ALNUML: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (isALNUM_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1270,7 +1288,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1290,7 +1308,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NALNUM: if (do_utf8) { LOAD_UTF8_CHARCLASS(alnum,"a"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1299,7 +1317,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1319,7 +1337,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NALNUML: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!isALNUM_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1328,7 +1346,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1348,7 +1366,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case SPACE: if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1357,7 +1375,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1377,7 +1395,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case SPACEL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1386,7 +1404,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1406,7 +1424,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NSPACE: if (do_utf8) { LOAD_UTF8_CHARCLASS(space," "); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1415,7 +1433,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1435,7 +1453,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NSPACEL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1444,7 +1462,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1464,7 +1482,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case DIGIT: if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1473,7 +1491,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1493,7 +1511,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case DIGITL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (isDIGIT_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1502,7 +1520,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1522,7 +1540,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NDIGIT: if (do_utf8) { LOAD_UTF8_CHARCLASS(digit,"0"); - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1531,7 +1549,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -1551,7 +1569,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case NDIGITL: PL_reg_flags |= RF_tainted; if (do_utf8) { - while (s < strend) { + while (s + (uskip = UTF8SKIP(s)) <= strend) { if (!isDIGIT_LC_utf8((U8*)s)) { if (tmp && (norun || regtry(prog, s))) goto got_it; @@ -1560,7 +1578,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s += UTF8SKIP(s); + s += uskip; } } else { @@ -2092,8 +2110,7 @@ S_regtry(pTHX_ regexp *prog, char *startpos) if (PL_reg_sv) { /* Make $_ available to executed code. */ if (PL_reg_sv != DEFSV) { - /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */ - SAVESPTR(DEFSV); + SAVE_DEFSV; DEFSV = PL_reg_sv; } @@ -3204,7 +3221,10 @@ S_regmatch(pTHX_ regnode *prog) "%*s already tried at this position...\n", REPORT_CODE_OFF+PL_regindent*2, "") ); - sayNO_SILENT; + if (PL_reg_flags & RF_false) + sayYES; + else + sayNO_SILENT; } PL_reg_poscache[o] |= (1<= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */ + while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */ /* If it could work, try it. */ if (c1 == -1000 || UCHARAT(PL_reginput) == c1 || @@ -3436,8 +3434,6 @@ S_regmatch(pTHX_ regnode *prog) } if (regmatch(next)) sayYES; - /* t/op/regexp.t test 885 fails if this is performed */ - /* *PL_reglastparen = lparen; */ REGCP_UNWIND(lastcp); } /* Couldn't or didn't -- move forward. */ @@ -3452,13 +3448,6 @@ S_regmatch(pTHX_ regnode *prog) } else { n = regrepeat_hard(scan, n, &l); - /* if we matched something zero-length we don't need to - backtrack, unless the minimum count is zero and we - are capturing the result - in that case the capture - being defined or not may affect later execution - */ - if (n != 0 && l == 0 && !(paren && ln == 0)) - ln = n; /* don't backtrack */ locinput = PL_reginput; DEBUG_r( PerlIO_printf(Perl_debug_log, @@ -3475,19 +3464,8 @@ S_regmatch(pTHX_ regnode *prog) if (! HAS_TEXT(text_node)) c1 = c2 = -1000; else { if (PL_regkind[(U8)OP(text_node)] == REF) { - I32 n, ln; - n = ARG(text_node); /* which paren pair */ - ln = PL_regstartp[n]; - /* assume yes if we haven't seen CLOSEn */ - if ( - (I32)*PL_reglastparen < n || - ln == -1 || - ln == PL_regendp[n] - ) { - c1 = c2 = -1000; - goto assume_ok_REG; - } - c1 = *(PL_bostr + ln); + c1 = c2 = -1000; + goto assume_ok_REG; } else { c1 = (U8)*STRING(text_node); } @@ -3525,7 +3503,6 @@ S_regmatch(pTHX_ regnode *prog) } if (regmatch(next)) sayYES; - *PL_reglastparen = lparen; REGCP_UNWIND(lastcp); } /* Couldn't or didn't -- back up. */ @@ -3585,19 +3562,8 @@ S_regmatch(pTHX_ regnode *prog) if (! HAS_TEXT(text_node)) c1 = c2 = -1000; else { if (PL_regkind[(U8)OP(text_node)] == REF) { - I32 n, ln; - n = ARG(text_node); /* which paren pair */ - ln = PL_regstartp[n]; - /* assume yes if we haven't seen CLOSEn */ - if ( - (I32)*PL_reglastparen < n || - ln == -1 || - ln == PL_regendp[n] - ) { - c1 = c2 = -1000; - goto assume_ok_easy; - } - s = (U8*)PL_bostr + ln; + c1 = c2 = -1000; + goto assume_ok_easy; } else { s = (U8*)STRING(text_node); } @@ -3638,7 +3604,6 @@ S_regmatch(pTHX_ regnode *prog) PL_reginput = locinput; if (minmod) { CHECKPOINT lastcp; - I32 lparen = *PL_reglastparen; minmod = 0; if (ln && regrepeat(scan, ln) < ln) sayNO; @@ -3745,7 +3710,6 @@ S_regmatch(pTHX_ regnode *prog) if (c == (UV)c1 || c == (UV)c2) { TRYPAREN(paren, ln, PL_reginput); - *PL_reglastparen = lparen; REGCP_UNWIND(lastcp); } } @@ -3753,7 +3717,6 @@ S_regmatch(pTHX_ regnode *prog) else if (c1 == -1000) { TRYPAREN(paren, ln, PL_reginput); - *PL_reglastparen = lparen; REGCP_UNWIND(lastcp); } /* Couldn't or didn't -- move forward. */ @@ -3768,7 +3731,6 @@ S_regmatch(pTHX_ regnode *prog) } else { CHECKPOINT lastcp; - I32 lparen = *PL_reglastparen; n = regrepeat(scan, n); locinput = PL_reginput; if (ln < n && PL_regkind[(U8)OP(next)] == EOL && @@ -3799,7 +3761,6 @@ S_regmatch(pTHX_ regnode *prog) if (c1 == -1000 || c == (UV)c1 || c == (UV)c2) { TRYPAREN(paren, n, PL_reginput); - *PL_reglastparen = lparen; REGCP_UNWIND(lastcp); } /* Couldn't or didn't -- back up. */ @@ -3823,7 +3784,6 @@ S_regmatch(pTHX_ regnode *prog) if (c1 == -1000 || c == (UV)c1 || c == (UV)c2) { TRYPAREN(paren, n, PL_reginput); - *PL_reglastparen = lparen; REGCP_UNWIND(lastcp); } /* Couldn't or didn't -- back up. */ @@ -3897,6 +3857,7 @@ S_regmatch(pTHX_ regnode *prog) } else PL_reginput = locinput; + PL_reg_flags ^= RF_false; goto do_ifmatch; case IFMATCH: n = 1; @@ -3912,6 +3873,8 @@ S_regmatch(pTHX_ regnode *prog) do_ifmatch: inner = NEXTOPER(NEXTOPER(scan)); if (regmatch(inner) != n) { + if (n == 0) + PL_reg_flags ^= RF_false; say_no: if (logical) { logical = 0; @@ -3921,6 +3884,8 @@ S_regmatch(pTHX_ regnode *prog) else sayNO; } + if (n == 0) + PL_reg_flags ^= RF_false; say_yes: if (logical) { logical = 0; @@ -4287,7 +4252,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) /* - regrepeat_hard - repeatedly match something, report total lenth and length * - * The repeater is supposed to have constant length. + * The repeater is supposed to have constant non-zero length. */ STATIC I32