[perl #41530] s/non-utf8/is-utf8/ fails.
Karl Williamson [Tue, 11 May 2010 16:57:41 +0000 (10:57 -0600)]
When the replacement is in utf8, there was failure to upgrade the result
when the source and the pattern weren't in utf8.  This simply checks
that when there is a match that will lead to the replacement being done.
It then does the upgrade.  If this led to changes in the source, we redo
the match because pointers to saved buffers could have changed.  There
may be other cases where we don't need to redo the match, but I don't
know the code well-enough to easily figure it out.

pp_hot.c
t/re/subst.t

index ea949b8..ab36593 100644 (file)
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -2126,6 +2126,7 @@ PP(pp_subst)
        DIE(aTHX_ "%s", PL_no_modify);
     PUTBACK;
 
+  setup_match:
     s = SvPV_mutable(TARG, len);
     if (!SvPOKp(TARG) || SvTYPE(TARG) == SVt_PVGV)
        force_on_match = 1;
@@ -2181,6 +2182,22 @@ PP(pp_subst)
                         r_flags | REXEC_CHECKED);
     /* known replacement string? */
     if (dstr) {
+
+       /* Upgrade the source if the replacement is utf8 but the source is not,
+        * but only if it matched; see
+        * http://www.nntp.perl.org/group/perl.perl5.porters/2010/04/msg158809.html
+        */
+       if (matched && DO_UTF8(dstr) && ! DO_UTF8(TARG)) {
+           const STRLEN new_len = sv_utf8_upgrade(TARG);
+
+           /* If the lengths are the same, the pattern contains only
+            * invariants, can keep going; otherwise, various internal markers
+            * could be off, so redo */
+           if (new_len != len) {
+               goto setup_match;
+           }
+       }
+
        /* replacement needing upgrading? */
        if (DO_UTF8(TARG) && !doutf8) {
             nsv = sv_newmortal();
index 042f67a..82c4a6f 100644 (file)
@@ -7,7 +7,7 @@ BEGIN {
 }
 
 require './test.pl';
-plan( tests => 143 );
+plan( tests => 149 );
 
 $x = 'foo';
 $_ = "x";
@@ -614,3 +614,23 @@ fresh_perl_is( '$_="abcef"; s/bc|(.)\G(.)/$1 ? "[$1-$2]" : "XX"/ge; print' => 'a
   'bug: $var =~ s/$qr//e calling get-magic on $_ as well as $var',
  );
 }
+
+{ # Bug #41530; replacing non-utf8 with a utf8 causes problems
+    my $string = "a\x{a0}a";
+    my $sub_string = $string;
+    ok(! utf8::is_utf8($sub_string), "Verify that string isn't initially utf8");
+    $sub_string =~ s/a/\x{100}/g;
+    ok(utf8::is_utf8($sub_string),
+                        'Verify replace of non-utf8 with utf8 upgrades to utf8');
+    is($sub_string, "\x{100}\x{A0}\x{100}",
+                            'Verify #41530 fixed: replace of non-utf8 with utf8');
+
+    my $non_sub_string = $string;
+    ok(! utf8::is_utf8($non_sub_string),
+                                    "Verify that string isn't initially utf8");
+    $non_sub_string =~ s/b/\x{100}/g;
+    ok(! utf8::is_utf8($non_sub_string),
+            "Verify that failed substitute doesn't change string's utf8ness");
+    is($non_sub_string, $string,
+                        "Verify that failed substitute doesn't change string");
+}