X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regexec.c;h=4f403a044377a62b97ceaca1761bdeb9bf6b6ea5;hb=9497474043a29ae84d941e84594160f022e3f31c;hp=5dbe166af7a0daaa23e95782351c9c0647fa8a89;hpb=d2212429110ea5c7ffe992aa2cb18bde7ce6e83d;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regexec.c b/regexec.c index 5dbe166..4f403a0 100644 --- a/regexec.c +++ b/regexec.c @@ -87,7 +87,7 @@ #define RF_evaled 4 /* Did an EVAL with setting? */ #define RF_utf8 8 /* String contains multibyte chars? */ -#define UTF (PL_reg_flags & RF_utf8) +#define UTF ((PL_reg_flags & RF_utf8) != 0) #define RS_init 1 /* eval environment created */ #define RS_set 2 /* replsv value is set */ @@ -239,7 +239,7 @@ S_regcppop(pTHX) ); } DEBUG_r( - if (*PL_reglastparen + 1 <= PL_regnpar) { + if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) { PerlIO_printf(Perl_debug_log, " restoring \\%"IVdf"..\\%"IVdf" to undef\n", (IV)(*PL_reglastparen + 1), (IV)PL_regnpar); @@ -256,8 +256,8 @@ S_regcppop(pTHX) * building DynaLoader will fail: * "Error: '*' not in typemap in DynaLoader.xs, line 164" * --jhi */ - for (paren = *PL_reglastparen + 1; paren <= PL_regnpar; paren++) { - if (paren > PL_regsize) + for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) { + if ((I32)paren > PL_regsize) PL_regstartp[paren] = -1; PL_regendp[paren] = -1; } @@ -431,7 +431,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, ); }); - if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) { + /* CHR_DIST() would be more correct here but it makes things slow. */ + if (prog->minlen > strend - strpos) { DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short... [re_intuit_start]\n")); goto fail; @@ -999,8 +1000,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta to_utf8_lower((U8*)m, tmpbuf1, &ulen1); to_utf8_upper((U8*)m, tmpbuf2, &ulen2); - c1 = utf8_to_uvchr(tmpbuf1, 0); - c2 = utf8_to_uvchr(tmpbuf2, 0); + c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, + 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, + 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } else { c1 = *(U8*)m; @@ -1013,7 +1016,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta c1 = *(U8*)m; c2 = PL_fold_locale[c1]; do_exactf: - e = do_utf8 ? s + ln : strend - ln; + e = HOP3c(strend, -(I32)ln, s); if (norun && e < s) e = s; /* Due to minlen logic of intuit() */ @@ -1037,11 +1040,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (c1 == c2) { while (s <= e) { - c = utf8_to_uvchr((U8*)s, &len); + c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); if ( c == c1 && (ln == len || ibcmp_utf8(s, (char **)0, 0, do_utf8, - m, (char **)0, ln, UTF)) + m, (char **)0, ln, (bool)UTF)) && (norun || regtry(prog, s)) ) goto got_it; else { @@ -1053,7 +1058,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta !ibcmp_utf8((char *) foldbuf, (char **)0, foldlen, do_utf8, m, - (char **)0, ln, UTF)) + (char **)0, ln, (bool)UTF)) && (norun || regtry(prog, s)) ) goto got_it; } @@ -1062,7 +1067,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else { while (s <= e) { - c = utf8_to_uvchr((U8*)s, &len); + c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); /* Handle some of the three Greek sigmas cases. * Note that not all the possible combinations @@ -1078,7 +1085,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if ( (c == c1 || c == c2) && (ln == len || ibcmp_utf8(s, (char **)0, 0, do_utf8, - m, (char **)0, ln, UTF)) + m, (char **)0, ln, (bool)UTF)) && (norun || regtry(prog, s)) ) goto got_it; else { @@ -1090,7 +1097,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta !ibcmp_utf8((char *) foldbuf, (char **)0, foldlen, do_utf8, m, - (char **)0, ln, UTF)) + (char **)0, ln, (bool)UTF)) && (norun || regtry(prog, s)) ) goto got_it; } @@ -1129,11 +1136,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (s == PL_bostr) tmp = '\n'; else { - U8 *r = reghop3((U8*)s, -1, (U8*)startpos); + U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = s > (char*)r ? - utf8n_to_uvchr(r, s - (char*)r, 0, 0) : - utf8n_to_uvchr(s, UTF8SKIP(s), 0, 0); + tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); } tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); @@ -1174,11 +1179,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (s == PL_bostr) tmp = '\n'; else { - U8 *r = reghop3((U8*)s, -1, (U8*)startpos); + U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = s > (char*)r ? - utf8n_to_uvchr(r, s - (char*)r, 0, 0) : - utf8n_to_uvchr(s, UTF8SKIP(s), 0, 0); + tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); } tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); @@ -2132,7 +2135,7 @@ S_regtry(pTHX_ regexp *prog, char *startpos) sp = prog->startp; ep = prog->endp; if (prog->nparens) { - for (i = prog->nparens; i > *PL_reglastparen; i--) { + for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { *++sp = -1; *++ep = -1; } @@ -2382,7 +2385,7 @@ S_regmatch(pTHX_ regnode *prog) case EXACT: s = STRING(scan); ln = STR_LEN(scan); - if (do_utf8 != (UTF!=0)) { + if (do_utf8 != UTF) { /* The target and the pattern have differing utf8ness. */ char *l = locinput; char *e = s + ln; @@ -2394,7 +2397,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*(U8*)s) != - utf8_to_uvuni((U8*)l, &ulen)) + utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY)) sayNO; l += ulen; s ++; @@ -2406,7 +2411,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*((U8*)l)) != - utf8_to_uvuni((U8*)s, &ulen)) + utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY)) sayNO; s += ulen; l ++; @@ -2439,7 +2446,7 @@ S_regmatch(pTHX_ regnode *prog) char *l = locinput; char *e = PL_regeol; - if (ibcmp_utf8(s, 0, ln, UTF, + if (ibcmp_utf8(s, 0, ln, (bool)UTF, l, &e, 0, do_utf8)) { /* One more case for the sharp s: * pack("U0U*", 0xDF) =~ /ss/i, @@ -2565,11 +2572,9 @@ S_regmatch(pTHX_ regnode *prog) if (locinput == PL_bostr) ln = '\n'; else { - U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_reg_starttry); + U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr); - ln = locinput > (char*)r ? - utf8n_to_uvchr(r, locinput - (char*)r, 0, 0) : - utf8n_to_uvchr(locinput, UTF8SKIP(locinput), 0, 0); + ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); } if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM_uni(ln); @@ -2723,7 +2728,7 @@ S_regmatch(pTHX_ regnode *prog) n = ARG(scan); /* which paren pair */ ln = PL_regstartp[n]; PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ - if (*PL_reglastparen < n || ln == -1) + if ((I32)*PL_reglastparen < n || ln == -1) sayNO; /* Do not match unless seen CLOSEn. */ if (ln == PL_regendp[n]) break; @@ -2922,13 +2927,13 @@ S_regmatch(pTHX_ regnode *prog) n = ARG(scan); /* which paren pair */ PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr; PL_regendp[n] = locinput - PL_bostr; - if (n > *PL_reglastparen) + if (n > (I32)*PL_reglastparen) *PL_reglastparen = n; *PL_reglastcloseparen = n; break; case GROUPP: n = ARG(scan); /* which paren pair */ - sw = (*PL_reglastparen >= n && PL_regendp[n] != -1); + sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1); break; case IFTHEN: PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ @@ -3030,7 +3035,7 @@ S_regmatch(pTHX_ regnode *prog) PL_regcc = &cc; /* XXXX Probably it is better to teach regpush to support parenfloor > PL_regsize... */ - if (parenfloor > *PL_reglastparen) + if (parenfloor > (I32)*PL_reglastparen) parenfloor = *PL_reglastparen; /* Pessimization... */ cc.parenfloor = parenfloor; cc.cur = -1; @@ -3066,10 +3071,10 @@ S_regmatch(pTHX_ regnode *prog) DEBUG_r( PerlIO_printf(Perl_debug_log, - "%*s %ld out of %ld..%ld cc=%lx\n", + "%*s %ld out of %ld..%ld cc=%"UVxf"\n", REPORT_CODE_OFF+PL_regindent*2, "", (long)n, (long)cc->min, - (long)cc->max, (long)cc) + (long)cc->max, PTR2UV(cc)) ); /* If degenerate scan matches "", assume scan done. */ @@ -3114,7 +3119,7 @@ S_regmatch(pTHX_ regnode *prog) if (PL_reg_leftiter-- == 0) { I32 size = (PL_reg_maxiter + 7)/8; if (PL_reg_poscache) { - if (PL_reg_poscache_size < size) { + if ((I32)PL_reg_poscache_size < size) { Renew(PL_reg_poscache, size, char); PL_reg_poscache_size = size; } @@ -3299,7 +3304,7 @@ S_regmatch(pTHX_ regnode *prog) if (paren) { if (paren > PL_regsize) PL_regsize = paren; - if (paren > *PL_reglastparen) + if (paren > (I32)*PL_reglastparen) *PL_reglastparen = paren; } scan = NEXTOPER(scan) + NODE_STEP_REGNODE; @@ -3333,7 +3338,7 @@ S_regmatch(pTHX_ regnode *prog) ln = PL_regstartp[n]; /* assume yes if we haven't seen CLOSEn */ if ( - *PL_reglastparen < n || + (I32)*PL_reglastparen < n || ln == -1 || ln == PL_regendp[n] ) { @@ -3415,7 +3420,7 @@ S_regmatch(pTHX_ regnode *prog) ln = PL_regstartp[n]; /* assume yes if we haven't seen CLOSEn */ if ( - *PL_reglastparen < n || + (I32)*PL_reglastparen < n || ln == -1 || ln == PL_regendp[n] ) { @@ -3475,7 +3480,7 @@ S_regmatch(pTHX_ regnode *prog) paren = scan->flags; /* Which paren to set */ if (paren > PL_regsize) PL_regsize = paren; - if (paren > *PL_reglastparen) + if (paren > (I32)*PL_reglastparen) *PL_reglastparen = paren; ln = ARG1(scan); /* min to match */ n = ARG2(scan); /* max to match */ @@ -3524,7 +3529,7 @@ S_regmatch(pTHX_ regnode *prog) ln = PL_regstartp[n]; /* assume yes if we haven't seen CLOSEn */ if ( - *PL_reglastparen < n || + (I32)*PL_reglastparen < n || ln == -1 || ln == PL_regendp[n] ) { @@ -3551,11 +3556,17 @@ S_regmatch(pTHX_ regnode *prog) to_utf8_lower((U8*)s, tmpbuf1, &ulen1); to_utf8_upper((U8*)s, tmpbuf2, &ulen2); - c1 = utf8_to_uvuni(tmpbuf1, 0); - c2 = utf8_to_uvuni(tmpbuf2, 0); + c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); + c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); } else { - c2 = c1 = utf8_to_uvchr(s, NULL); + c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); } } } @@ -3574,6 +3585,7 @@ S_regmatch(pTHX_ regnode *prog) if (c1 != -1000) { char *e; /* Should not check after this */ char *old = locinput; + int count = 0; if (n == REG_INFTY) { e = PL_regeol - 1; @@ -3593,7 +3605,6 @@ S_regmatch(pTHX_ regnode *prog) e = PL_regeol - 1; } while (1) { - int count; /* Find place 'next' could work */ if (!do_utf8) { if (c1 == c2) { @@ -3611,18 +3622,28 @@ S_regmatch(pTHX_ regnode *prog) else { STRLEN len; if (c1 == c2) { - for (count = 0; - locinput <= e && - utf8_to_uvchr((U8*)locinput, &len) != c1; - count++) + /* count initialised to + * utf8_distance(old, locinput) */ + while (locinput <= e && + utf8n_to_uvchr((U8*)locinput, + UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY) != (UV)c1) { locinput += len; - + count++; + } } else { - for (count = 0; locinput <= e; count++) { - UV c = utf8_to_uvchr((U8*)locinput, &len); - if (c == c1 || c == c2) + /* count initialised to + * utf8_distance(old, locinput) */ + while (locinput <= e) { + UV c = utf8n_to_uvchr((U8*)locinput, + UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); + if (c == (UV)c1 || c == (UV)c2) break; - locinput += len; + locinput += len; + count++; } } } @@ -3644,6 +3665,7 @@ S_regmatch(pTHX_ regnode *prog) locinput += UTF8SKIP(locinput); else locinput++; + count = 1; } } else @@ -3651,11 +3673,14 @@ S_regmatch(pTHX_ regnode *prog) UV c; if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); /* If it could work, try it. */ - if (c == c1 || c == c2) + if (c == (UV)c1 || c == (UV)c2) { TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND(lastcp); @@ -3698,12 +3723,15 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); } /* If it could work, try it. */ - if (c1 == -1000 || c == c1 || c == c2) + if (c1 == -1000 || c == (UV)c1 || c == (UV)c2) { TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND(lastcp); @@ -3718,12 +3746,15 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); } /* If it could work, try it. */ - if (c1 == -1000 || c == c1 || c == c2) + if (c1 == -1000 || c == (UV)c1 || c == (UV)c2) { TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND(lastcp); @@ -3959,7 +3990,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max) register bool do_utf8 = PL_reg_match_utf8; scan = PL_reginput; - if (max != REG_INFTY && max < loceol - scan) + if (max == REG_INFTY) + max = I32_MAX; + else if (max < loceol - scan) loceol = scan + max; switch (OP(p)) { case REG_ANY: @@ -4253,15 +4286,16 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV if (PL_regdata->what[n] == 's') { SV *rv = (SV*)PL_regdata->data[n]; AV *av = (AV*)SvRV((SV*)rv); + SV **ary = AvARRAY(av); SV **a, **b; /* See the end of regcomp.c:S_reglass() for * documentation of these array elements. */ - si = *av_fetch(av, 0, FALSE); - a = av_fetch(av, 1, FALSE); - b = av_fetch(av, 2, FALSE); - + si = *ary; + a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0; + b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0; + if (a) sw = *a; else if (si && doinit) { @@ -4296,11 +4330,13 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b { char flags = ANYOF_FLAGS(n); bool match = FALSE; - UV c; + UV c = *p; STRLEN len = 0; STRLEN plen; - c = do_utf8 ? utf8_to_uvchr(p, &len) : *p; + if (do_utf8 && !UTF8_IS_INVARIANT(c)) + c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); if (do_utf8 || (flags & ANYOF_UNICODE)) { @@ -4353,7 +4389,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if (flags & ANYOF_FOLD) { - I32 f; + U8 f; if (flags & ANYOF_LOCALE) { PL_reg_flags |= RF_tainted; @@ -4495,7 +4531,7 @@ S_to_utf8_substr(pTHX_ register regexp *prog) SV* sv; if (prog->float_substr && !prog->float_utf8) { prog->float_utf8 = sv = NEWSV(117, 0); - SvSetMagicSV(sv, prog->float_substr); + SvSetSV(sv, prog->float_substr); sv_utf8_upgrade(sv); if (SvTAIL(prog->float_substr)) SvTAIL_on(sv); @@ -4504,7 +4540,7 @@ S_to_utf8_substr(pTHX_ register regexp *prog) } if (prog->anchored_substr && !prog->anchored_utf8) { prog->anchored_utf8 = sv = NEWSV(118, 0); - SvSetMagicSV(sv, prog->anchored_substr); + SvSetSV(sv, prog->anchored_substr); sv_utf8_upgrade(sv); if (SvTAIL(prog->anchored_substr)) SvTAIL_on(sv); @@ -4519,7 +4555,7 @@ S_to_byte_substr(pTHX_ register regexp *prog) SV* sv; if (prog->float_utf8 && !prog->float_substr) { prog->float_substr = sv = NEWSV(117, 0); - SvSetMagicSV(sv, prog->float_utf8); + SvSetSV(sv, prog->float_utf8); if (sv_utf8_downgrade(sv, TRUE)) { if (SvTAIL(prog->float_utf8)) SvTAIL_on(sv); @@ -4532,7 +4568,7 @@ S_to_byte_substr(pTHX_ register regexp *prog) } if (prog->anchored_utf8 && !prog->anchored_substr) { prog->anchored_substr = sv = NEWSV(118, 0); - SvSetMagicSV(sv, prog->anchored_utf8); + SvSetSV(sv, prog->anchored_utf8); if (sv_utf8_downgrade(sv, TRUE)) { if (SvTAIL(prog->anchored_utf8)) SvTAIL_on(sv);