Make Perl_pregcomp() use SvUTF8() of the pattern, rather than the flag
Nicholas Clark [Sat, 5 Jan 2008 16:47:06 +0000 (16:47 +0000)]
bit in pmflags, to decide whether the pattern is UTF-8.

p4raw-id: //depot/perl@32851

op.c
pp_ctl.c
regcomp.c
regexec.c

diff --git a/op.c b/op.c
index 2ea2ef8..a0c4f35 100644 (file)
--- a/op.c
+++ b/op.c
@@ -3432,14 +3432,24 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, bool isreg)
     pm = (PMOP*)o;
 
     if (expr->op_type == OP_CONST) {
-       SV * const pat = ((SVOP*)expr)->op_sv;
+       SV *pat = ((SVOP*)expr)->op_sv;
        U32 pm_flags = pm->op_pmflags & PMf_COMPILETIME;
 
        if (o->op_flags & OPf_SPECIAL)
            pm_flags |= RXf_SPLIT;
 
-       if (DO_UTF8(pat))
-           pm_flags |= RXf_UTF8;
+       if (DO_UTF8(pat)) {
+           assert (SvUTF8(pat));
+       } else if (SvUTF8(pat)) {
+           /* Not doing UTF-8, despite what the SV says. Is this only if we're
+              trapped in use 'bytes'?  */
+           /* Make a copy of the octet sequence, but without the flag on, as
+              the compiler now honours the SvUTF8 flag on pat.  */
+           STRLEN len;
+           const char *const p = SvPV(pat, len);
+           pat = newSVpvn_flags(p, len, SVs_TEMP);
+       }
+       assert(!(pm_flags & RXf_UTF8));
 
        PM_SETRE(pm, CALLREGCOMP(pat, pm_flags));
 
index 6353df6..d2094f5 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -148,8 +148,18 @@ PP(pp_regcomp)
            if (PL_op->op_flags & OPf_SPECIAL)
                PL_reginterp_cnt = I32_MAX; /* Mark as safe.  */
 
-           if (DO_UTF8(tmpstr))
-               pm_flags |= RXf_UTF8;
+           if (DO_UTF8(tmpstr)) {
+               assert (SvUTF8(tmpstr));
+           } else if (SvUTF8(tmpstr)) {
+               /* Not doing UTF-8, despite what the SV says. Is this only if
+                  we're trapped in use 'bytes'?  */
+               /* Make a copy of the octet sequence, but without the flag on,
+                  as the compiler now honours the SvUTF8 flag on tmpstr.  */
+               STRLEN len;
+               const char *const p = SvPV(tmpstr, len);
+               tmpstr = newSVpvn_flags(p, len, SVs_TEMP);
+           }
+           assert(!(pm_flags & RXf_UTF8));
 
                if (eng) 
                PM_SETRE(pm, CALLREGCOMP_ENG(eng, tmpstr, pm_flags));
index b71e546..a7b1cf6 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4151,7 +4151,7 @@ Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
 #endif
 
 REGEXP *
-Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
+Perl_re_compile(pTHX_ const SV * const pattern, U32 pm_flags)
 {
     dVAR;
     REGEXP *rx;
@@ -4175,7 +4175,10 @@ Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
     GET_RE_DEBUG_FLAGS_DECL;
     DEBUG_r(if (!PL_colorset) reginitcolors());
 
-    RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
+    RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+    assert(!(pm_flags & RXf_UTF8));
+    if (RExC_utf8)
+       pm_flags |= RXf_UTF8;
 
     DEBUG_COMPILE_r({
         SV *dsv= sv_newmortal();
index 95bba2e..8144f99 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -3755,7 +3755,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                        U32 pm_flags = 0;
                        const I32 osize = PL_regsize;
 
-                       if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
+                       if (DO_UTF8(ret)) {
+                           assert (SvUTF8(ret));
+                       } else if (SvUTF8(ret)) {
+                           /* Not doing UTF-8, despite what the SV says. Is
+                              this only if we're trapped in use 'bytes'?  */
+                           /* Make a copy of the octet sequence, but without
+                              the flag on, as the compiler now honours the
+                              SvUTF8 flag on ret.  */
+                           STRLEN len;
+                           const char *const p = SvPV(ret, len);
+                           ret = newSVpvn_flags(p, len, SVs_TEMP);
+                       }
+                       assert(!(pm_flags & RXf_UTF8));
                        rx = CALLREGCOMP(ret, pm_flags);
                        if (!(SvFLAGS(ret)
                              & (SVs_TEMP | SVs_PADTMP | SVf_READONLY