Fix for "UTF-8 bug with s///" from Hugo.
[p5sagit/p5-mst-13.2.git] / regcomp.c
index e81bc0a..639f140 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -69,7 +69,7 @@
  *
  ****    Alterations to Henry's code are...
  ****
- ****    Copyright (c) 1991-2001, Larry Wall
+ ****    Copyright (c) 1991-2002, Larry Wall
  ****
  ****    You may distribute under the terms of either the GNU General Public
  ****    License or the Artistic License, as specified in the README file.
@@ -385,15 +385,14 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define        vWARN(loc,m)                                                         \
     STMT_START {                                                             \
         IV offset = loc - RExC_precomp;          \
-       Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
+       Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
                 m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
     } STMT_END                                                               \
 
 #define        vWARNdep(loc,m)                                                         \
     STMT_START {                                                             \
         IV offset = loc - RExC_precomp;          \
-        int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED;  \
-       Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
+       Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
                 m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
     } STMT_END                                                               \
 
@@ -401,7 +400,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define        vWARN2(loc, m, a1)                                                   \
     STMT_START {                                                             \
         IV offset = loc - RExC_precomp;          \
-       Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
+       Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
                  a1,                                                         \
                 (int)offset, RExC_precomp, RExC_precomp + offset);        \
     } STMT_END
@@ -409,7 +408,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define        vWARN3(loc, m, a1, a2)                                               \
     STMT_START {                                                             \
       IV offset = loc - RExC_precomp;        \
-       Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,                    \
+       Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                    \
                  a1, a2,                                                     \
                 (int)offset, RExC_precomp, RExC_precomp + offset);        \
     } STMT_END
@@ -417,7 +416,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define        vWARN4(loc, m, a1, a2, a3)                                           \
     STMT_START {                                                             \
       IV offset = loc - RExC_precomp;            \
-       Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
+       Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
                  a1, a2, a3,                                                 \
                 (int)offset, RExC_precomp, RExC_precomp + offset);        \
     } STMT_END
@@ -426,7 +425,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define        vWARN5(loc, m, a1, a2, a3, a4)                                       \
   STMT_START {                                                   \
       IV offset = loc - RExC_precomp;   \
-        Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,      \
+        Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
                  a1, a2, a3, a4,                                 \
                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
     } STMT_END
@@ -736,6 +735,50 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                    n = nnext;
                }
            }
+
+           if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
+/*
+  Two problematic code points in Unicode casefolding of EXACT nodes:
+
+   U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+   U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+
+   which casefold to
+
+   Unicode                     UTF-8
+
+   U+03B9 U+0308 U+0301                0xCE 0xB9 0xCC 0x88 0xCC 0x81
+   U+03C5 U+0308 U+0301                0xCF 0x85 0xCC 0x88 0xCC 0x81
+
+   This means that in case-insensitive matching (or "loose matching",
+   as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
+   length of the above casefolded versions) can match a target string
+   of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
+   This would rather mess up the minimum length computation.
+
+   What we'll do is to look for the tail four bytes, and then peek
+   at the preceding two bytes to see whether we need to decrease
+   the minimum length by four (six minus two).
+
+   Thanks to the design of UTF-8, there cannot be false matches:
+   A sequence of valid UTF-8 bytes cannot be a subsequence of
+   another valid sequence of UTF-8 bytes.
+
+*/
+                char *s0 = STRING(scan), *s, *t;
+                char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
+                char *t0 = "\xcc\x88\xcc\x81";
+                char *t1 = t0 + 3;
+                
+                for (s = s0 + 2;
+                     s < s2 && (t = ninstr(s, s1, t0, t1));
+                     s = t + 4) {
+                     if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
+                         ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+                          min -= 4;
+                }
+           }
+
 #ifdef DEBUGGING
            /* Allow dumping */
            n = scan + NODE_SZ_STR(scan);
@@ -888,6 +931,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                        ? I32_MAX : data->pos_min + data->pos_delta;
                }
                sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+               if (UTF)
+                   SvUTF8_on(data->last_found);
                data->last_end = data->pos_min + l;
                data->pos_min += l; /* As in the first entry. */
                data->flags &= ~SF_BEFORE_EOL;
@@ -1848,7 +1893,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
            first = NEXTOPER(first);
            goto again;
        }
-       else if ((OP(first) == STAR &&
+       else if (!sawopen && (OP(first) == STAR &&
            PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
            !(r->reganch & ROPT_ANCH) )
        {
@@ -1920,17 +1965,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
                && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
                    goto remove_float;          /* As in (a)+. */
 
-           r->float_substr = data.longest_float;
+           if (SvUTF8(data.longest_float)) {
+               r->float_utf8 = data.longest_float;
+               r->float_substr = Nullsv;
+           } else {
+               r->float_substr = data.longest_float;
+               r->float_utf8 = Nullsv;
+           }
            r->float_min_offset = data.offset_float_min;
            r->float_max_offset = data.offset_float_max;
            t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
                       && (!(data.flags & SF_FL_BEFORE_MEOL)
                           || (RExC_flags16 & PMf_MULTILINE)));
-           fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
+           fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
        }
        else {
          remove_float:
-           r->float_substr = Nullsv;
+           r->float_substr = r->float_utf8 = Nullsv;
            SvREFCNT_dec(data.longest_float);
            longest_float_length = 0;
        }
@@ -1942,22 +1993,29 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
                    || (RExC_flags16 & PMf_MULTILINE)))) {
            int t;
 
-           r->anchored_substr = data.longest_fixed;
+           if (SvUTF8(data.longest_fixed)) {
+               r->anchored_utf8 = data.longest_fixed;
+               r->anchored_substr = Nullsv;
+           } else {
+               r->anchored_substr = data.longest_fixed;
+               r->anchored_utf8 = Nullsv;
+           }
            r->anchored_offset = data.offset_fixed;
            t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
                     || (RExC_flags16 & PMf_MULTILINE)));
-           fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
+           fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
        }
        else {
-           r->anchored_substr = Nullsv;
+           r->anchored_substr = r->anchored_utf8 = Nullsv;
            SvREFCNT_dec(data.longest_fixed);
            longest_fixed_length = 0;
        }
        if (r->regstclass
            && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
            r->regstclass = NULL;
-       if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
+       if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
+           && stclass_flag
            && !(data.start_class->flags & ANYOF_EOS)
            && !cl_is_anything(data.start_class)) {
            I32 n = add_data(pRExC_state, 1, "f");
@@ -1980,20 +2038,22 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
        /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
        if (longest_fixed_length > longest_float_length) {
            r->check_substr = r->anchored_substr;
+           r->check_utf8 = r->anchored_utf8;
            r->check_offset_min = r->check_offset_max = r->anchored_offset;
            if (r->reganch & ROPT_ANCH_SINGLE)
                r->reganch |= ROPT_NOSCAN;
        }
        else {
            r->check_substr = r->float_substr;
+           r->check_utf8 = r->float_utf8;
            r->check_offset_min = data.offset_float_min;
            r->check_offset_max = data.offset_float_max;
        }
        /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
           This should be changed ASAP!  */
-       if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
+       if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
            r->reganch |= RE_USE_INTUIT;
-           if (SvTAIL(r->check_substr))
+           if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
                r->reganch |= RE_INTUIT_TAIL;
        }
     }
@@ -2009,7 +2069,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
        data.start_class = &ch_class;
        data.last_closep = &last_close;
        minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
-       r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
+       r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
+               = r->float_substr = r->float_utf8 = Nullsv;
        if (!(data.start_class->flags & ANYOF_EOS)
            && !cl_is_anything(data.start_class)) {
            I32 n = add_data(pRExC_state, 1, "f");
@@ -2124,6 +2185,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
                /* FALL THROUGH*/
            case '?':           /* (??...) */
                logical = 1;
+               if (*RExC_parse != '{')
+                   goto unknown;
                paren = *RExC_parse++;
                /* FALL THROUGH */
            case '{':           /* (?{...}) */
@@ -2361,9 +2424,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
     }
     else if (paren != '?')             /* Not Conditional */
        ret = br;
-    if (flags&HASWIDTH)
-       *flagp |= HASWIDTH;
-    *flagp |= flags&SPSTART;
+    *flagp |= flags & (SPSTART | HASWIDTH);
     lastbr = br;
     while (*RExC_parse == '|') {
        if (!SIZE_ONLY && RExC_extralen) {
@@ -2993,6 +3054,7 @@ tryagain:
            RExC_parse++;
 
        defchar:
+           ender = 0;
            ret = reg_node(pRExC_state, FOLD
                          ? (LOC ? EXACTFL : EXACTF)
                          : EXACT);
@@ -3138,22 +3200,34 @@ tryagain:
                    if (len)
                        p = oldp;
                    else if (UTF) {
+                        STRLEN unilen;
+
                         if (FOLD) {
                              /* Emit all the Unicode characters. */
                              for (foldbuf = tmpbuf;
                                   foldlen;
                                   foldlen -= numlen) {
                                   ender = utf8_to_uvchr(foldbuf, &numlen);
-                                  reguni(pRExC_state, ender, s, &numlen);
-                                  s       += numlen;
-                                  len     += numlen;
-                                  foldbuf += numlen;
+                                  if (numlen > 0) {
+                                       reguni(pRExC_state, ender, s, &unilen);
+                                       s       += unilen;
+                                       len     += unilen;
+                                       /* In EBCDIC the numlen
+                                        * and unilen can differ. */
+                                       foldbuf += numlen;
+                                       if (numlen >= foldlen)
+                                            break;
+                                  }
+                                  else
+                                       break; /* "Can't happen." */
                              }
                         }
                         else {
-                             reguni(pRExC_state, ender, s, &numlen);
-                             s   += numlen;
-                             len += numlen;
+                             reguni(pRExC_state, ender, s, &unilen);
+                             if (unilen > 0) {
+                                  s   += unilen;
+                                  len += unilen;
+                             }
                         }
                    }
                    else {
@@ -3163,22 +3237,34 @@ tryagain:
                    break;
                }
                if (UTF) {
+                    STRLEN unilen;
+
                     if (FOLD) {
                          /* Emit all the Unicode characters. */
                          for (foldbuf = tmpbuf;
                               foldlen;
                               foldlen -= numlen) {
                               ender = utf8_to_uvchr(foldbuf, &numlen);
-                              reguni(pRExC_state, ender, s, &numlen);
-                              s       += numlen;
-                              len     += numlen;
-                              foldbuf += numlen;
+                              if (numlen > 0) {
+                                   reguni(pRExC_state, ender, s, &unilen);
+                                   len     += unilen;
+                                   s       += unilen;
+                                   /* In EBCDIC the numlen
+                                    * and unilen can differ. */
+                                   foldbuf += numlen;
+                                   if (numlen >= foldlen)
+                                        break;
+                              }
+                              else
+                                   break;
                          }
                     }
                     else {
-                         reguni(pRExC_state, ender, s, &numlen);
-                         s   += numlen;
-                         len += numlen;
+                         reguni(pRExC_state, ender, s, &unilen);
+                         if (unilen > 0) {
+                              s   += unilen;
+                              len += unilen;
+                         }
                     }
                     len--;
                }
@@ -3218,7 +3304,7 @@ tryagain:
         if (RExC_utf8)
              SvUTF8_on(sv);
         if (sv_utf8_downgrade(sv, TRUE)) {
-             char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+             char *s       = sv_recode_to_utf8(sv, PL_encoding);
              STRLEN newlen = SvCUR(sv);
         
              if (!SIZE_ONLY) {
@@ -3390,8 +3476,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 STATIC void
 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 {
-    if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
-       POSIXCC(UCHARAT(RExC_parse))) {
+    if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
        char *s = RExC_parse;
        char  c = *s++;
 
@@ -3456,11 +3541,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 
     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
 
-    if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
+    if (!SIZE_ONLY && POSIXCC(nextvalue))
        checkposixcc(pRExC_state);
 
-    if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
-       goto charclassloop;             /* allow 1st char to be ] or - */
+    /* allow 1st char to be ] (allowing it to be - is dealt with later) */
+    if (UCHARAT(RExC_parse) == ']')
+       goto charclassloop;
 
     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
 
@@ -4009,22 +4095,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                          ANYOF_BITMAP_SET(ret, i);
          }
          if (value > 255 || UTF) {
+               UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
+               UV natvalue      = NATIVE_TO_UNI(value);
+
                ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
-               if (prevvalue < value)
+               if (prevnatvalue < natvalue) { /* what about > ? */
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
-                                  (UV)prevvalue, (UV)value);
-               else if (prevvalue == value) {
-                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
-                                  (UV)value);
+                                  prevnatvalue, natvalue);
+               }
+               else if (prevnatvalue == natvalue) {
+                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
                    if (FOLD) {
-                        U8 tmpbuf [UTF8_MAXLEN+1];
                         U8 foldbuf[UTF8_MAXLEN_FOLD+1];
                         STRLEN foldlen;
-                        UV f;
-
-                        uvchr_to_utf8(tmpbuf, value);
-                        to_utf8_fold(tmpbuf, foldbuf, &foldlen);
-                        f = utf8_to_uvchr(foldbuf, 0);
+                        UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
 
                         /* If folding and foldable and a single
                          * character, insert also the folded version
@@ -4039,7 +4123,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                                   * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
                                   * where E folds into "pq" and F folds
                                   * into "rst", all other characters
-                                  * fold to single characters. */
+                                  * fold to single characters.  We save
+                                  * away these multicharacter foldings,
+                                  * to be later saved as part of the
+                                  * additional "s" data. */
                                  SV *sv;
 
                                  if (!unicode_alternate)
@@ -4112,6 +4199,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
        AV *av = newAV();
        SV *rv;
 
+       /* The 0th element stores the character class description
+        * in its textual form: used later (regexec.c:Perl_regclass_swatch())
+        * to initialize the appropriate swash (which gets stored in
+        * the 1st element), and also useful for dumping the regnode.
+        * The 2nd element stores the multicharacter foldings,
+        * used later (regexec.c:s_reginclasslen()). */
        av_store(av, 0, listsv);
        av_store(av, 1, NULL);
        av_store(av, 2, (SV*)unicode_alternate);
@@ -4454,6 +4547,15 @@ Perl_regdump(pTHX_ regexp *r)
                      PL_colors[1],
                      SvTAIL(r->anchored_substr) ? "$" : "",
                      (IV)r->anchored_offset);
+    else if (r->anchored_utf8)
+       PerlIO_printf(Perl_debug_log,
+                     "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
+                     PL_colors[0],
+                     (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
+                     SvPVX(r->anchored_utf8),
+                     PL_colors[1],
+                     SvTAIL(r->anchored_utf8) ? "$" : "",
+                     (IV)r->anchored_offset);
     if (r->float_substr)
        PerlIO_printf(Perl_debug_log,
                      "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
@@ -4463,15 +4565,25 @@ Perl_regdump(pTHX_ regexp *r)
                      PL_colors[1],
                      SvTAIL(r->float_substr) ? "$" : "",
                      (IV)r->float_min_offset, (UV)r->float_max_offset);
-    if (r->check_substr)
+    else if (r->float_utf8)
+       PerlIO_printf(Perl_debug_log,
+                     "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+                     PL_colors[0],
+                     (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
+                     SvPVX(r->float_utf8),
+                     PL_colors[1],
+                     SvTAIL(r->float_utf8) ? "$" : "",
+                     (IV)r->float_min_offset, (UV)r->float_max_offset);
+    if (r->check_substr || r->check_utf8)
        PerlIO_printf(Perl_debug_log,
                      r->check_substr == r->float_substr
+                     && r->check_utf8 == r->float_utf8
                      ? "(checking floating" : "(checking anchored");
     if (r->reganch & ROPT_NOSCAN)
        PerlIO_printf(Perl_debug_log, " noscan");
     if (r->reganch & ROPT_CHECK_ALL)
        PerlIO_printf(Perl_debug_log, " isall");
-    if (r->check_substr)
+    if (r->check_substr || r->check_utf8)
        PerlIO_printf(Perl_debug_log, ") ");
 
     if (r->regstclass) {
@@ -4548,9 +4660,13 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 
     if (k == EXACT) {
         SV *dsv = sv_2mortal(newSVpvn("", 0));
-       bool do_utf8 = DO_UTF8(sv);
+       /* Using is_utf8_string() is a crude hack but it may
+        * be the best for now since we have no flag "this EXACTish
+        * node was UTF-8" --jhi */
+       bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
        char *s    = do_utf8 ?
-         pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
+         pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
+                        UNI_DISPLAY_REGEX) :
          STRING(o);
        int len = do_utf8 ?
          strlen(s) :
@@ -4716,33 +4832,45 @@ Perl_re_intuit_string(pTHX_ regexp *prog)
 {                              /* Assume that RE_INTUIT is set */
     DEBUG_r(
        {   STRLEN n_a;
-           char *s = SvPV(prog->check_substr,n_a);
+           char *s = SvPV(prog->check_substr
+                     ? prog->check_substr : prog->check_utf8, n_a);
 
            if (!PL_colorset) reginitcolors();
            PerlIO_printf(Perl_debug_log,
-                     "%sUsing REx substr:%s `%s%.60s%s%s'\n",
-                     PL_colors[4],PL_colors[5],PL_colors[0],
+                     "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
+                     PL_colors[4],
+                     prog->check_substr ? "" : "utf8 ",
+                     PL_colors[5],PL_colors[0],
                      s,
                      PL_colors[1],
                      (strlen(s) > 60 ? "..." : ""));
        } );
 
-    return prog->check_substr;
+    return prog->check_substr ? prog->check_substr : prog->check_utf8;
 }
 
 void
 Perl_pregfree(pTHX_ struct regexp *r)
 {
-    DEBUG_r(if (!PL_colorset) reginitcolors());
+#ifdef DEBUGGING
+    SV *dsv = PERL_DEBUG_PAD_ZERO(0);
+#endif
 
     if (!r || (--r->refcnt > 0))
        return;
-    DEBUG_r(PerlIO_printf(Perl_debug_log,
-                     "%sFreeing REx:%s `%s%.60s%s%s'\n",
-                     PL_colors[4],PL_colors[5],PL_colors[0],
-                     r->precomp,
-                     PL_colors[1],
-                     (strlen(r->precomp) > 60 ? "..." : "")));
+    DEBUG_r({
+         char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
+                                 UNI_DISPLAY_REGEX);
+        int len = SvCUR(dsv);
+        if (!PL_colorset)
+             reginitcolors();
+        PerlIO_printf(Perl_debug_log,
+                      "%sFreeing REx:%s `%s%*.*s%s%s'\n",
+                      PL_colors[4],PL_colors[5],PL_colors[0],
+                      len, len, s,
+                      PL_colors[1],
+                      len > 60 ? "..." : "");
+    });
 
     if (r->precomp)
        Safefree(r->precomp);
@@ -4753,8 +4881,12 @@ Perl_pregfree(pTHX_ struct regexp *r)
     if (r->substrs) {
        if (r->anchored_substr)
            SvREFCNT_dec(r->anchored_substr);
+       if (r->anchored_utf8)
+           SvREFCNT_dec(r->anchored_utf8);
        if (r->float_substr)
            SvREFCNT_dec(r->float_substr);
+       if (r->float_utf8)
+           SvREFCNT_dec(r->float_utf8);
        Safefree(r->substrs);
     }
     if (r->data) {