From: Nicholas Clark Date: Sat, 5 Jan 2008 16:47:06 +0000 (+0000) Subject: Make Perl_pregcomp() use SvUTF8() of the pattern, rather than the flag X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b9ad30b40cf004f5ea6fd7a945a950cf873aed7b;p=p5sagit%2Fp5-mst-13.2.git Make Perl_pregcomp() use SvUTF8() of the pattern, rather than the flag bit in pmflags, to decide whether the pattern is UTF-8. p4raw-id: //depot/perl@32851 --- diff --git a/op.c b/op.c index 2ea2ef8..a0c4f35 100644 --- a/op.c +++ b/op.c @@ -3432,14 +3432,24 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, bool isreg) pm = (PMOP*)o; if (expr->op_type == OP_CONST) { - SV * const pat = ((SVOP*)expr)->op_sv; + SV *pat = ((SVOP*)expr)->op_sv; U32 pm_flags = pm->op_pmflags & PMf_COMPILETIME; if (o->op_flags & OPf_SPECIAL) pm_flags |= RXf_SPLIT; - if (DO_UTF8(pat)) - pm_flags |= RXf_UTF8; + if (DO_UTF8(pat)) { + assert (SvUTF8(pat)); + } else if (SvUTF8(pat)) { + /* Not doing UTF-8, despite what the SV says. Is this only if we're + trapped in use 'bytes'? */ + /* Make a copy of the octet sequence, but without the flag on, as + the compiler now honours the SvUTF8 flag on pat. */ + STRLEN len; + const char *const p = SvPV(pat, len); + pat = newSVpvn_flags(p, len, SVs_TEMP); + } + assert(!(pm_flags & RXf_UTF8)); PM_SETRE(pm, CALLREGCOMP(pat, pm_flags)); diff --git a/pp_ctl.c b/pp_ctl.c index 6353df6..d2094f5 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -148,8 +148,18 @@ PP(pp_regcomp) if (PL_op->op_flags & OPf_SPECIAL) PL_reginterp_cnt = I32_MAX; /* Mark as safe. */ - if (DO_UTF8(tmpstr)) - pm_flags |= RXf_UTF8; + if (DO_UTF8(tmpstr)) { + assert (SvUTF8(tmpstr)); + } else if (SvUTF8(tmpstr)) { + /* Not doing UTF-8, despite what the SV says. Is this only if + we're trapped in use 'bytes'? */ + /* Make a copy of the octet sequence, but without the flag on, + as the compiler now honours the SvUTF8 flag on tmpstr. */ + STRLEN len; + const char *const p = SvPV(tmpstr, len); + tmpstr = newSVpvn_flags(p, len, SVs_TEMP); + } + assert(!(pm_flags & RXf_UTF8)); if (eng) PM_SETRE(pm, CALLREGCOMP_ENG(eng, tmpstr, pm_flags)); diff --git a/regcomp.c b/regcomp.c index b71e546..a7b1cf6 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4151,7 +4151,7 @@ Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) #endif REGEXP * -Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) +Perl_re_compile(pTHX_ const SV * const pattern, U32 pm_flags) { dVAR; REGEXP *rx; @@ -4175,7 +4175,10 @@ Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) GET_RE_DEBUG_FLAGS_DECL; DEBUG_r(if (!PL_colorset) reginitcolors()); - RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; + RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); + assert(!(pm_flags & RXf_UTF8)); + if (RExC_utf8) + pm_flags |= RXf_UTF8; DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); diff --git a/regexec.c b/regexec.c index 95bba2e..8144f99 100644 --- a/regexec.c +++ b/regexec.c @@ -3755,7 +3755,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) U32 pm_flags = 0; const I32 osize = PL_regsize; - if (DO_UTF8(ret)) pm_flags |= RXf_UTF8; + if (DO_UTF8(ret)) { + assert (SvUTF8(ret)); + } else if (SvUTF8(ret)) { + /* Not doing UTF-8, despite what the SV says. Is + this only if we're trapped in use 'bytes'? */ + /* Make a copy of the octet sequence, but without + the flag on, as the compiler now honours the + SvUTF8 flag on ret. */ + STRLEN len; + const char *const p = SvPV(ret, len); + ret = newSVpvn_flags(p, len, SVs_TEMP); + } + assert(!(pm_flags & RXf_UTF8)); rx = CALLREGCOMP(ret, pm_flags); if (!(SvFLAGS(ret) & (SVs_TEMP | SVs_PADTMP | SVf_READONLY