Fix for "UTF-8 bug with s///" from Hugo.
Jarkko Hietaniemi [Wed, 20 Mar 2002 13:59:58 +0000 (13:59 +0000)]
p4raw-id: //depot/perl@15356

embed.fnc
embed.h
proto.h
regcomp.c
regcomp.h
regexec.c
sv.c
t/op/pat.t

index 84d89d0..8b7727b 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1167,6 +1167,8 @@ s |U8*    |reghop3        |U8 *pos|I32 off|U8 *lim
 s      |U8*    |reghopmaybe    |U8 *pos|I32 off
 s      |U8*    |reghopmaybe3   |U8 *pos|I32 off|U8 *lim
 s      |char*  |find_byclass   |regexp * prog|regnode *c|char *s|char *strend|char *startpos|I32 norun
+s      |void   |to_utf8_substr |regexp * prog
+s      |void   |to_byte_substr |regexp * prog
 #endif
 
 #if defined(PERL_IN_DUMP_C) || defined(PERL_DECL_PROT)
diff --git a/embed.h b/embed.h
index 787a045..10c6a53 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define reghopmaybe            S_reghopmaybe
 #define reghopmaybe3           S_reghopmaybe3
 #define find_byclass           S_find_byclass
+#define to_utf8_substr         S_to_utf8_substr
+#define to_byte_substr         S_to_byte_substr
 #endif
 #if defined(PERL_IN_DUMP_C) || defined(PERL_DECL_PROT)
 #define deb_curcv              S_deb_curcv
 #define reghopmaybe(a,b)       S_reghopmaybe(aTHX_ a,b)
 #define reghopmaybe3(a,b,c)    S_reghopmaybe3(aTHX_ a,b,c)
 #define find_byclass(a,b,c,d,e,f)      S_find_byclass(aTHX_ a,b,c,d,e,f)
+#define to_utf8_substr(a)      S_to_utf8_substr(aTHX_ a)
+#define to_byte_substr(a)      S_to_byte_substr(aTHX_ a)
 #endif
 #if defined(PERL_IN_DUMP_C) || defined(PERL_DECL_PROT)
 #define deb_curcv(a)           S_deb_curcv(aTHX_ a)
diff --git a/proto.h b/proto.h
index dad9b57..ac6f281 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1206,6 +1206,8 @@ STATIC U8*        S_reghop3(pTHX_ U8 *pos, I32 off, U8 *lim);
 STATIC U8*     S_reghopmaybe(pTHX_ U8 *pos, I32 off);
 STATIC U8*     S_reghopmaybe3(pTHX_ U8 *pos, I32 off, U8 *lim);
 STATIC char*   S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun);
+STATIC void    S_to_utf8_substr(pTHX_ regexp * prog);
+STATIC void    S_to_byte_substr(pTHX_ regexp * prog);
 #endif
 
 #if defined(PERL_IN_DUMP_C) || defined(PERL_DECL_PROT)
index 7c34d8f..639f140 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -931,6 +931,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                        ? I32_MAX : data->pos_min + data->pos_delta;
                }
                sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+               if (UTF)
+                   SvUTF8_on(data->last_found);
                data->last_end = data->pos_min + l;
                data->pos_min += l; /* As in the first entry. */
                data->flags &= ~SF_BEFORE_EOL;
@@ -1963,17 +1965,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
                && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
                    goto remove_float;          /* As in (a)+. */
 
-           r->float_substr = data.longest_float;
+           if (SvUTF8(data.longest_float)) {
+               r->float_utf8 = data.longest_float;
+               r->float_substr = Nullsv;
+           } else {
+               r->float_substr = data.longest_float;
+               r->float_utf8 = Nullsv;
+           }
            r->float_min_offset = data.offset_float_min;
            r->float_max_offset = data.offset_float_max;
            t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
                       && (!(data.flags & SF_FL_BEFORE_MEOL)
                           || (RExC_flags16 & PMf_MULTILINE)));
-           fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
+           fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
        }
        else {
          remove_float:
-           r->float_substr = Nullsv;
+           r->float_substr = r->float_utf8 = Nullsv;
            SvREFCNT_dec(data.longest_float);
            longest_float_length = 0;
        }
@@ -1985,22 +1993,29 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
                    || (RExC_flags16 & PMf_MULTILINE)))) {
            int t;
 
-           r->anchored_substr = data.longest_fixed;
+           if (SvUTF8(data.longest_fixed)) {
+               r->anchored_utf8 = data.longest_fixed;
+               r->anchored_substr = Nullsv;
+           } else {
+               r->anchored_substr = data.longest_fixed;
+               r->anchored_utf8 = Nullsv;
+           }
            r->anchored_offset = data.offset_fixed;
            t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
                     || (RExC_flags16 & PMf_MULTILINE)));
-           fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
+           fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
        }
        else {
-           r->anchored_substr = Nullsv;
+           r->anchored_substr = r->anchored_utf8 = Nullsv;
            SvREFCNT_dec(data.longest_fixed);
            longest_fixed_length = 0;
        }
        if (r->regstclass
            && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
            r->regstclass = NULL;
-       if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
+       if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
+           && stclass_flag
            && !(data.start_class->flags & ANYOF_EOS)
            && !cl_is_anything(data.start_class)) {
            I32 n = add_data(pRExC_state, 1, "f");
@@ -2023,20 +2038,22 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
        /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
        if (longest_fixed_length > longest_float_length) {
            r->check_substr = r->anchored_substr;
+           r->check_utf8 = r->anchored_utf8;
            r->check_offset_min = r->check_offset_max = r->anchored_offset;
            if (r->reganch & ROPT_ANCH_SINGLE)
                r->reganch |= ROPT_NOSCAN;
        }
        else {
            r->check_substr = r->float_substr;
+           r->check_utf8 = r->float_utf8;
            r->check_offset_min = data.offset_float_min;
            r->check_offset_max = data.offset_float_max;
        }
        /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
           This should be changed ASAP!  */
-       if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
+       if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
            r->reganch |= RE_USE_INTUIT;
-           if (SvTAIL(r->check_substr))
+           if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
                r->reganch |= RE_INTUIT_TAIL;
        }
     }
@@ -2052,7 +2069,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
        data.start_class = &ch_class;
        data.last_closep = &last_close;
        minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
-       r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
+       r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
+               = r->float_substr = r->float_utf8 = Nullsv;
        if (!(data.start_class->flags & ANYOF_EOS)
            && !cl_is_anything(data.start_class)) {
            I32 n = add_data(pRExC_state, 1, "f");
@@ -4529,6 +4547,15 @@ Perl_regdump(pTHX_ regexp *r)
                      PL_colors[1],
                      SvTAIL(r->anchored_substr) ? "$" : "",
                      (IV)r->anchored_offset);
+    else if (r->anchored_utf8)
+       PerlIO_printf(Perl_debug_log,
+                     "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
+                     PL_colors[0],
+                     (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
+                     SvPVX(r->anchored_utf8),
+                     PL_colors[1],
+                     SvTAIL(r->anchored_utf8) ? "$" : "",
+                     (IV)r->anchored_offset);
     if (r->float_substr)
        PerlIO_printf(Perl_debug_log,
                      "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
@@ -4538,15 +4565,25 @@ Perl_regdump(pTHX_ regexp *r)
                      PL_colors[1],
                      SvTAIL(r->float_substr) ? "$" : "",
                      (IV)r->float_min_offset, (UV)r->float_max_offset);
-    if (r->check_substr)
+    else if (r->float_utf8)
+       PerlIO_printf(Perl_debug_log,
+                     "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+                     PL_colors[0],
+                     (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
+                     SvPVX(r->float_utf8),
+                     PL_colors[1],
+                     SvTAIL(r->float_utf8) ? "$" : "",
+                     (IV)r->float_min_offset, (UV)r->float_max_offset);
+    if (r->check_substr || r->check_utf8)
        PerlIO_printf(Perl_debug_log,
                      r->check_substr == r->float_substr
+                     && r->check_utf8 == r->float_utf8
                      ? "(checking floating" : "(checking anchored");
     if (r->reganch & ROPT_NOSCAN)
        PerlIO_printf(Perl_debug_log, " noscan");
     if (r->reganch & ROPT_CHECK_ALL)
        PerlIO_printf(Perl_debug_log, " isall");
-    if (r->check_substr)
+    if (r->check_substr || r->check_utf8)
        PerlIO_printf(Perl_debug_log, ") ");
 
     if (r->regstclass) {
@@ -4795,18 +4832,21 @@ Perl_re_intuit_string(pTHX_ regexp *prog)
 {                              /* Assume that RE_INTUIT is set */
     DEBUG_r(
        {   STRLEN n_a;
-           char *s = SvPV(prog->check_substr,n_a);
+           char *s = SvPV(prog->check_substr
+                     ? prog->check_substr : prog->check_utf8, n_a);
 
            if (!PL_colorset) reginitcolors();
            PerlIO_printf(Perl_debug_log,
-                     "%sUsing REx substr:%s `%s%.60s%s%s'\n",
-                     PL_colors[4],PL_colors[5],PL_colors[0],
+                     "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
+                     PL_colors[4],
+                     prog->check_substr ? "" : "utf8 ",
+                     PL_colors[5],PL_colors[0],
                      s,
                      PL_colors[1],
                      (strlen(s) > 60 ? "..." : ""));
        } );
 
-    return prog->check_substr;
+    return prog->check_substr ? prog->check_substr : prog->check_utf8;
 }
 
 void
@@ -4841,8 +4881,12 @@ Perl_pregfree(pTHX_ struct regexp *r)
     if (r->substrs) {
        if (r->anchored_substr)
            SvREFCNT_dec(r->anchored_substr);
+       if (r->anchored_utf8)
+           SvREFCNT_dec(r->anchored_utf8);
        if (r->float_substr)
            SvREFCNT_dec(r->float_substr);
+       if (r->float_utf8)
+           SvREFCNT_dec(r->float_utf8);
        Safefree(r->substrs);
     }
     if (r->data) {
index bfd00dc..8c027bf 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -386,7 +386,8 @@ struct reg_data {
 struct reg_substr_datum {
     I32 min_offset;
     I32 max_offset;
-    SV *substr;
+    SV *substr;                /* non-utf8 variant */
+    SV *utf8_substr;   /* utf8 variant */
 };
 
 struct reg_substr_data {
@@ -394,10 +395,13 @@ struct reg_substr_data {
 };
 
 #define anchored_substr substrs->data[0].substr
+#define anchored_utf8 substrs->data[0].utf8_substr
 #define anchored_offset substrs->data[0].min_offset
 #define float_substr substrs->data[1].substr
+#define float_utf8 substrs->data[1].utf8_substr
 #define float_min_offset substrs->data[1].min_offset
 #define float_max_offset substrs->data[1].max_offset
 #define check_substr substrs->data[2].substr
+#define check_utf8 substrs->data[2].utf8_substr
 #define check_offset_min substrs->data[2].min_offset
 #define check_offset_max substrs->data[2].max_offset
index e676568..5a6d72d 100644 (file)
--- a/regexec.c
+++ b/regexec.c
  * Forwards.
  */
 
-#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
+#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 
 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
@@ -392,6 +392,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     register SV *check;
     char *strbeg;
     char *t;
+    int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
     I32 ml_anch;
     register char *other_last = Nullch;        /* other substr checked before this */
     char *check_at = Nullch;           /* check substr found at this pos */
@@ -437,7 +438,20 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     }
     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
     PL_regeol = strend;
-    check = prog->check_substr;
+    if (do_utf8) {
+       if (!prog->check_utf8 && prog->check_substr)
+           to_utf8_substr(prog);
+       check = prog->check_utf8;
+    } else {
+       if (!prog->check_substr && prog->check_utf8)
+           to_byte_substr(prog);
+       check = prog->check_substr;
+    }
+   if (check == &PL_sv_undef) {
+       DEBUG_r(PerlIO_printf(Perl_debug_log,
+               "Non-utf string cannot match utf check string\n"));
+       goto fail;
+    }
     if (prog->reganch & ROPT_ANCH) {   /* Match at beg-of-str or after \n */
        ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
                     || ( (prog->reganch & ROPT_ANCH_BOL)
@@ -543,7 +557,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
 
     DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
                          (s ? "Found" : "Did not find"),
-                         ((check == prog->anchored_substr) ? "anchored" : "floating"),
+                         (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
                          PL_colors[0],
                          (int)(SvCUR(check) - (SvTAIL(check)!=0)),
                          SvPVX(check),
@@ -566,16 +580,17 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        Probably it is right to do no SCREAM here...
      */
 
-    if (prog->float_substr && prog->anchored_substr) {
+    if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
        /* Take into account the "other" substring. */
        /* XXXX May be hopelessly wrong for UTF... */
        if (!other_last)
            other_last = strpos;
-       if (check == prog->float_substr) {
+       if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
          do_other_anchored:
            {
                char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
                char *s1 = s;
+               SV* must;
 
                t = s - prog->check_offset_max;
                if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
@@ -593,20 +608,27 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                    last1 = last;
  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
                /* On end-of-str: see comment below. */
-               s = fbm_instr((unsigned char*)t,
-                             HOP3(HOP3(last1, prog->anchored_offset, strend)
-                                  + SvCUR(prog->anchored_substr),
-                                  -(SvTAIL(prog->anchored_substr)!=0), strbeg),
-                             prog->anchored_substr,
-                             PL_multiline ? FBMrf_MULTILINE : 0);
+               must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
+               if (must == &PL_sv_undef) {
+                   s = (char*)NULL;
+                   DEBUG_r(must = prog->anchored_utf8);        /* for debug */
+               }
+               else
+                   s = fbm_instr(
+                       (unsigned char*)t,
+                       HOP3(HOP3(last1, prog->anchored_offset, strend)
+                               + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
+                       must,
+                       PL_multiline ? FBMrf_MULTILINE : 0
+                   );
                DEBUG_r(PerlIO_printf(Perl_debug_log,
                        "%s anchored substr `%s%.*s%s'%s",
                        (s ? "Found" : "Contradicts"),
                        PL_colors[0],
-                         (int)(SvCUR(prog->anchored_substr)
-                         - (SvTAIL(prog->anchored_substr)!=0)),
-                         SvPVX(prog->anchored_substr),
-                         PL_colors[1], (SvTAIL(prog->anchored_substr) ? "$" : "")));
+                         (int)(SvCUR(must)
+                         - (SvTAIL(must)!=0)),
+                         SvPVX(must),
+                         PL_colors[1], (SvTAIL(must) ? "$" : "")));
                if (!s) {
                    if (last1 >= last2) {
                        DEBUG_r(PerlIO_printf(Perl_debug_log,
@@ -633,54 +655,60 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
            }
        }
        else {          /* Take into account the floating substring. */
-               char *last, *last1;
-               char *s1 = s;
-
-               t = HOP3c(s, -start_shift, strbeg);
-               last1 = last =
-                   HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
-               if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
-                   last = HOP3c(t, prog->float_max_offset, strend);
-               s = HOP3c(t, prog->float_min_offset, strend);
-               if (s < other_last)
-                   s = other_last;
+           char *last, *last1;
+           char *s1 = s;
+           SV* must;
+
+           t = HOP3c(s, -start_shift, strbeg);
+           last1 = last =
+               HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
+           if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
+               last = HOP3c(t, prog->float_max_offset, strend);
+           s = HOP3c(t, prog->float_min_offset, strend);
+           if (s < other_last)
+               s = other_last;
  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
-               /* fbm_instr() takes into account exact value of end-of-str
-                  if the check is SvTAIL(ed).  Since false positives are OK,
-                  and end-of-str is not later than strend we are OK. */
+           must = do_utf8 ? prog->float_utf8 : prog->float_substr;
+           /* fbm_instr() takes into account exact value of end-of-str
+              if the check is SvTAIL(ed).  Since false positives are OK,
+              and end-of-str is not later than strend we are OK. */
+           if (must == &PL_sv_undef) {
+               s = (char*)NULL;
+               DEBUG_r(must = prog->float_utf8);       /* for debug message */
+           }
+           else
                s = fbm_instr((unsigned char*)s,
-                             (unsigned char*)last + SvCUR(prog->float_substr)
-                                 - (SvTAIL(prog->float_substr)!=0),
-                             prog->float_substr, PL_multiline ? FBMrf_MULTILINE : 0);
-               DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
-                       (s ? "Found" : "Contradicts"),
-                       PL_colors[0],
-                         (int)(SvCUR(prog->float_substr)
-                         - (SvTAIL(prog->float_substr)!=0)),
-                         SvPVX(prog->float_substr),
-                         PL_colors[1], (SvTAIL(prog->float_substr) ? "$" : "")));
-               if (!s) {
-                   if (last1 == last) {
-                       DEBUG_r(PerlIO_printf(Perl_debug_log,
-                                               ", giving up...\n"));
-                       goto fail_finish;
-                   }
+                             (unsigned char*)last + SvCUR(must)
+                                 - (SvTAIL(must)!=0),
+                             must, PL_multiline ? FBMrf_MULTILINE : 0);
+           DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
+                   (s ? "Found" : "Contradicts"),
+                   PL_colors[0],
+                     (int)(SvCUR(must) - (SvTAIL(must)!=0)),
+                     SvPVX(must),
+                     PL_colors[1], (SvTAIL(must) ? "$" : "")));
+           if (!s) {
+               if (last1 == last) {
                    DEBUG_r(PerlIO_printf(Perl_debug_log,
-                       ", trying anchored starting at offset %ld...\n",
-                       (long)(s1 + 1 - i_strpos)));
-                   other_last = last;
-                   s = HOP3c(t, 1, strend);
-                   goto restart;
-               }
-               else {
-                   DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
-                         (long)(s - i_strpos)));
-                   other_last = s; /* Fix this later. --Hugo */
-                   s = s1;
-                   if (t == strpos)
-                       goto try_at_start;
-                   goto try_at_offset;
+                                           ", giving up...\n"));
+                   goto fail_finish;
                }
+               DEBUG_r(PerlIO_printf(Perl_debug_log,
+                   ", trying anchored starting at offset %ld...\n",
+                   (long)(s1 + 1 - i_strpos)));
+               other_last = last;
+               s = HOP3c(t, 1, strend);
+               goto restart;
+           }
+           else {
+               DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
+                     (long)(s - i_strpos)));
+               other_last = s; /* Fix this later. --Hugo */
+               s = s1;
+               if (t == strpos)
+                   goto try_at_start;
+               goto try_at_offset;
+           }
        }
     }
 
@@ -703,7 +731,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
            while (t < strend - prog->minlen) {
                if (*t == '\n') {
                    if (t < check_at - prog->check_offset_min) {
-                       if (prog->anchored_substr) {
+                       if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
                            /* Since we moved from the found position,
                               we definitely contradict the found anchored
                               substr.  Due to the above check we do not
@@ -743,7 +771,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        }
        s = t;
       set_useful:
-       ++BmUSEFUL(prog->check_substr); /* hooray/5 */
+       ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr);    /* hooray/5 */
     }
     else {
        /* The found string does not prohibit matching at strpos,
@@ -767,15 +795,23 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        );
       success_at_start:
        if (!(prog->reganch & ROPT_NAUGHTY)     /* XXXX If strpos moved? */
-           && prog->check_substr               /* Could be deleted already */
-           && --BmUSEFUL(prog->check_substr) < 0
-           && prog->check_substr == prog->float_substr)
+           && (do_utf8 ? (
+               prog->check_utf8                /* Could be deleted already */
+               && --BmUSEFUL(prog->check_utf8) < 0
+               && (prog->check_utf8 == prog->float_utf8)
+           ) : (
+               prog->check_substr              /* Could be deleted already */
+               && --BmUSEFUL(prog->check_substr) < 0
+               && (prog->check_substr == prog->float_substr)
+           )))
        {
            /* If flags & SOMETHING - do not do it many times on the same match */
            DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
-           SvREFCNT_dec(prog->check_substr);
-           prog->check_substr = Nullsv;        /* disable */
-           prog->float_substr = Nullsv;        /* clear */
+           SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
+           if (do_utf8 ? prog->check_substr : prog->check_utf8)
+               SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
+           prog->check_substr = prog->check_utf8 = Nullsv;     /* disable */
+           prog->float_substr = prog->float_utf8 = Nullsv;     /* clear */
            check = Nullsv;                     /* abort */
            s = strpos;
            /* XXXX This is a remnant of the old implementation.  It
@@ -802,9 +838,9 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
        int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
                    ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
                    : 1);
-       char *endpos = (prog->anchored_substr || ml_anch)
+       char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
                ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
-               : (prog->float_substr
+               : (prog->float_substr || prog->float_utf8
                   ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
                           cl_l, strend)
                   : strend);
@@ -830,8 +866,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
            if ((prog->reganch & ROPT_ANCH) && !ml_anch)
                goto fail;
            /* Contradict one of substrings */
-           if (prog->anchored_substr) {
-               if (prog->anchored_substr == check) {
+           if (prog->anchored_substr || prog->anchored_utf8) {
+               if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
                    DEBUG_r( what = "anchored" );
                  hop_and_restart:
                    s = HOP3c(t, 1, strend);
@@ -871,7 +907,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
                          PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
                goto try_at_offset;
            }
-           if (!prog->float_substr)    /* Could have been deleted */
+           if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))     /* Could have been deleted */
                goto fail;
            /* Check is floating subtring. */
          retry_floating_check:
@@ -898,8 +934,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     return s;
 
   fail_finish:                         /* Substring not found */
-    if (prog->check_substr)            /* could be removed already */
-       BmUSEFUL(prog->check_substr) += 5; /* hooray */
+    if (prog->check_substr || prog->check_utf8)                /* could be removed already */
+       BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
   fail:
     DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
                          PL_colors[4],PL_colors[5]));
@@ -1626,8 +1662,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
            PL_reg_ganch = strbeg;
     }
 
-    if (do_utf8 == (UTF!=0) &&
-       !(flags & REXEC_CHECKED) && prog->check_substr != Nullsv) {
+    if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
        re_scream_pos_data d;
 
        d.scream_olds = &scream_olds;
@@ -1677,7 +1712,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                dontbother = minlen - 1;
            end = HOP3c(strend, -dontbother, strbeg) - 1;
            /* for multiline we only have to try after newlines */
-           if (prog->check_substr) {
+           if (prog->check_substr || prog->check_utf8) {
                if (s == startpos)
                    goto after_try;
                while (1) {
@@ -1713,13 +1748,16 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     }
 
     /* Messy cases:  unanchored match. */
-    if (prog->anchored_substr && prog->reganch & ROPT_SKIP) {
+    if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
        /* we have /x+whatever/ */
        /* it must be a one character string (XXXX Except UTF?) */
-       char ch = SvPVX(prog->anchored_substr)[0];
+       char ch;
 #ifdef DEBUGGING
        int did_match = 0;
 #endif
+       if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
+           do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
+       ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
 
        if (do_utf8) {
            while (s < strend) {
@@ -1751,23 +1789,37 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                );
     }
     /*SUPPRESS 560*/
-    else if (do_utf8 == (UTF!=0) &&
-            (prog->anchored_substr != Nullsv
-             || (prog->float_substr != Nullsv
-                 && prog->float_max_offset < strend - s))) {
-       SV *must = prog->anchored_substr
-           ? prog->anchored_substr : prog->float_substr;
-       I32 back_max =
-           prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
-       I32 back_min =
-           prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
-       char *last = HOP3c(strend,      /* Cannot start after this */
-                         -(I32)(CHR_SVLEN(must)
-                                - (SvTAIL(must) != 0) + back_min), strbeg);
+    else if (prog->anchored_substr != Nullsv
+             || prog->anchored_utf8 != Nullsv
+             || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
+                 && prog->float_max_offset < strend - s)) {
+       SV *must;
+       I32 back_max;
+       I32 back_min;
+       char *last;
        char *last1;            /* Last position checked before */
 #ifdef DEBUGGING
        int did_match = 0;
 #endif
+       if (prog->anchored_substr || prog->anchored_utf8) {
+           if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
+               do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
+           must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
+           back_max = back_min = prog->anchored_offset;
+       } else {
+           if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
+               do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
+           must = do_utf8 ? prog->float_utf8 : prog->float_substr;
+           back_max = prog->float_max_offset;
+           back_min = prog->float_min_offset;
+       }
+       if (must == &PL_sv_undef)
+           /* could not downgrade utf8 check substring, so must fail */
+           goto phooey;
+
+       last = HOP3c(strend,    /* Cannot start after this */
+                         -(I32)(CHR_SVLEN(must)
+                                - (SvTAIL(must) != 0) + back_min), strbeg);
 
        if (s > PL_bostr)
            last1 = HOPc(s, -1);
@@ -1815,7 +1867,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
        DEBUG_r(if (!did_match)
                     PerlIO_printf(Perl_debug_log, 
                                   "Did not find %s substr `%s%.*s%s'%s...\n",
-                             ((must == prog->anchored_substr)
+                             ((must == prog->anchored_substr || must == prog->anchored_utf8)
                               ? "anchored" : "floating"),
                              PL_colors[0],
                              (int)(SvCUR(must) - (SvTAIL(must)!=0)),
@@ -1855,20 +1907,26 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     }
     else {
        dontbother = 0;
-       if (prog->float_substr != Nullsv) {     /* Trim the end. */
+       if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
+           /* Trim the end. */
            char *last;
+           SV* float_real;
+
+           if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
+               do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
+           float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
 
            if (flags & REXEC_SCREAM) {
-               last = screaminstr(sv, prog->float_substr, s - strbeg,
+               last = screaminstr(sv, float_real, s - strbeg,
                                   end_shift, &scream_pos, 1); /* last one */
                if (!last)
                    last = scream_olds; /* Only one occurrence. */
            }
            else {
                STRLEN len;
-               char *little = SvPV(prog->float_substr, len);
+               char *little = SvPV(float_real, len);
 
-               if (SvTAIL(prog->float_substr)) {
+               if (SvTAIL(float_real)) {
                    if (memEQ(strend - len + 1, little, len - 1))
                        last = strend - len + 1;
                    else if (!PL_multiline)
@@ -4426,3 +4484,59 @@ restore_pos(pTHX_ void *arg)
        PL_curpm = PL_reg_oldcurpm;
     }  
 }
+
+STATIC void
+S_to_utf8_substr(pTHX_ register regexp *prog)
+{
+    SV* sv;
+    if (prog->float_substr && !prog->float_utf8) {
+       prog->float_utf8 = sv = NEWSV(117, 0);
+       SvSetMagicSV(sv, prog->float_substr);
+       sv_utf8_upgrade(sv);
+       if (SvTAIL(prog->float_substr))
+           SvTAIL_on(sv);
+       if (prog->float_substr == prog->check_substr)
+           prog->check_utf8 = sv;
+    }
+    if (prog->anchored_substr && !prog->anchored_utf8) {
+       prog->anchored_utf8 = sv = NEWSV(118, 0);
+       SvSetMagicSV(sv, prog->anchored_substr);
+       sv_utf8_upgrade(sv);
+       if (SvTAIL(prog->anchored_substr))
+           SvTAIL_on(sv);
+       if (prog->anchored_substr == prog->check_substr)
+           prog->check_utf8 = sv;
+    }
+}
+
+STATIC void
+S_to_byte_substr(pTHX_ register regexp *prog)
+{
+    SV* sv;
+    if (prog->float_utf8 && !prog->float_substr) {
+       prog->float_substr = sv = NEWSV(117, 0);
+       SvSetMagicSV(sv, prog->float_utf8);
+       if (sv_utf8_downgrade(sv, TRUE)) {
+           if (SvTAIL(prog->float_utf8))
+               SvTAIL_on(sv);
+       } else {
+           SvREFCNT_dec(sv);
+           prog->float_substr = sv = &PL_sv_undef;
+       }
+       if (prog->float_utf8 == prog->check_utf8)
+           prog->check_substr = sv;
+    }
+    if (prog->anchored_utf8 && !prog->anchored_substr) {
+       prog->anchored_substr = sv = NEWSV(118, 0);
+       SvSetMagicSV(sv, prog->anchored_utf8);
+       if (sv_utf8_downgrade(sv, TRUE)) {
+           if (SvTAIL(prog->anchored_utf8))
+               SvTAIL_on(sv);
+       } else {
+           SvREFCNT_dec(sv);
+           prog->anchored_substr = sv = &PL_sv_undef;
+       }
+       if (prog->anchored_utf8 == prog->check_utf8)
+           prog->check_substr = sv;
+    }
+}
diff --git a/sv.c b/sv.c
index ac40900..21de7ed 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -8592,6 +8592,7 @@ Perl_re_dup(pTHX_ REGEXP *r, CLONE_PARAMS *param)
        s->min_offset = r->substrs->data[i].min_offset;
        s->max_offset = r->substrs->data[i].max_offset;
        s->substr     = sv_dup_inc(r->substrs->data[i].substr, param);
+       s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
     }
 
     ret->regstclass = NULL;
index a00e624..b5dff4b 100755 (executable)
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..864\n";
+print "1..892\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -2730,3 +2730,44 @@ print "# some Unicode properties\n";
     print $u eq "feeber" ? "ok 864\n" : "not ok 864\n";
 }
 
+{
+    print "# UTF-8 bug with s///\n";
+    # check utf8/non-utf8 mixtures
+    # try to force all float/anchored check combinations
+    my $c = "\x{100}";
+    my $test = 865;
+    my $subst;
+    for my $re (
+       "xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", "xx.*(?=$c)", "(?=$c).*xx",
+    ) {
+       print "xxx" =~ /$re/ ? "not ok $test\n" : "ok $test\n";
+       ++$test;
+       print +($subst = "xxx") =~ s/$re// ? "not ok $test\n" : "ok $test\n";
+       ++$test;
+    }
+    for my $re ("xx.*$c*", "$c*.*xx") {
+       print "xxx" =~ /$re/ ? "ok $test\n" : "not ok $test\n";
+       ++$test;
+       ($subst = "xxx") =~ s/$re//;
+       print $subst eq '' ? "ok $test\n" : "not ok $test\t# $subst\n";
+       ++$test;
+    }
+    for my $re ("xxy*", "y*xx") {
+       print "xx$c" =~ /$re/ ? "ok $test\n" : "not ok $test\n";
+       ++$test;
+       ($subst = "xx$c") =~ s/$re//;
+       print $subst eq $c ? "ok $test\n" : "not ok $test\n";
+       ++$test;
+       print "xy$c" =~ /$re/ ? "not ok $test\n" : "ok $test\n";
+       ++$test;
+       print +($subst = "xy$c") =~ /$re/ ? "not ok $test\n" : "ok $test\n";
+       ++$test;
+    }
+    for my $re ("xy$c*z", "x$c*yz") {
+       print "xyz" =~ /$re/ ? "ok $test\n" : "not ok $test\n";
+       ++$test;
+       ($subst = "xyz") =~ s/$re//;
+       print $subst eq '' ? "ok $test\n" : "not ok $test\n";
+       ++$test;
+    }
+}