Make chomp heed the utf8 flags on the target string and $/

diff --git a/doop.c b/doop.c

index ea64ff8..6724aca 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -1008,6 +1008,7 @@ Perl_do_chomp(pTHX_ register SV *sv)
     STRLEN len;
     STRLEN n_a;
     char *s;
+    char *temp_buffer = NULL;
 
     if (RsSNARF(PL_rs))
        return 0;
@@ -1059,6 +1060,27 @@ Perl_do_chomp(pTHX_ register SV *sv)
        else {
            STRLEN rslen;
            char *rsptr = SvPV(PL_rs, rslen);
+           if (SvUTF8(PL_rs) != SvUTF8(sv)) {
+               /* Assumption is that rs is shorter than the scalar.  */
+               if (SvUTF8(PL_rs)) {
+                   /* RS is utf8, scalar is 8 bit.  */
+                   bool is_utf8 = TRUE;
+                   temp_buffer = (char*)bytes_from_utf8((U8*)rsptr,
+                                                        &rslen, &is_utf8);
+                   if (is_utf8) {
+                       /* Cannot downgrade, therefore cannot possibly match
+                        */
+                       assert (temp_buffer == rsptr);
+                       temp_buffer = NULL;
+                       goto nope;
+                   }
+                   rsptr = temp_buffer;
+               } else {
+                   /* RS is 8 bit, scalar is utf8.  */
+                   temp_buffer = (char*)bytes_to_utf8((U8*)rsptr, &rslen);
+                   rsptr = temp_buffer;
+               }
+           }
            if (rslen == 1) {
                if (*s != *rsptr)
                    goto nope;
@@ -1081,6 +1103,7 @@ Perl_do_chomp(pTHX_ register SV *sv)
        SvSETMAGIC(sv);
     }
   nope:
+    Safefree(temp_buffer);
     return count;
 }
 
diff --git a/t/op/chop.t b/t/op/chop.t

index 87700de..68025b7 100755 (executable)
--- a/t/op/chop.t
+++ b/t/op/chop.t
@@ -6,7 +6,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 51;
+plan tests => 91;
 
 $_ = 'abc';
 $c = do foo();
@@ -183,3 +183,29 @@ ok($@ =~ /Can\'t modify.*chop.*in.*assignment/);
 eval 'chomp($x, $y) = (1, 2);';
 ok($@ =~ /Can\'t modify.*chom?p.*in.*assignment/);
 
+my @chars = ("N", "\xd3", substr ("\xd4\x{100}", 0, 1), chr 1296);
+foreach my $start (@chars) {
+  foreach my $end (@chars) {
+    local $/ = $end;
+    my $message = "start=" . ord ($start) . " end=" . ord $end;
+    my $string = $start . $end;
+    chomp $string;
+    is ($string, $start, $message);
+
+    my $end_utf8 = $end;
+    utf8::encode ($end_utf8);
+    next if $end_utf8 eq $end;
+
+    # $end ne $end_utf8, so these should not chomp.
+    $string = $start . $end_utf8;
+    my $chomped = $string;
+    chomp $chomped;
+    is ($chomped, $string, "$message (end as bytes)");
+
+    $/ = $end_utf8;
+    $string = $start . $end;
+    $chomped = $string;
+    chomp $chomped;
+    is ($chomped, $string, "$message (\$/ as bytes)");
+  }
+}
doop.c		patch \| blob \| blame \| history
t/op/chop.t		patch \| blob \| blame \| history