From: Nicholas Clark <nick@ccl4.org>
Date: Thu, 15 Jan 2004 00:03:04 +0000 (+0000)
Subject: Make chomp heed the utf8 flags on the target string and $/
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c4c87a065d5684a07ac86a151149508724e14d4e;p=p5sagit%2Fp5-mst-13.2.git

Make chomp heed the utf8 flags on the target string and $/
[Fixes #24888]
More work still needed to make chomp heed the encoding pragma.

p4raw-id: //depot/perl@22155
---

diff --git a/doop.c b/doop.c
index ea64ff8..6724aca 100644
--- a/doop.c
+++ b/doop.c
@@ -1008,6 +1008,7 @@ Perl_do_chomp(pTHX_ register SV *sv)
     STRLEN len;
     STRLEN n_a;
     char *s;
+    char *temp_buffer = NULL;
 
     if (RsSNARF(PL_rs))
 	return 0;
@@ -1059,6 +1060,27 @@ Perl_do_chomp(pTHX_ register SV *sv)
 	else {
 	    STRLEN rslen;
 	    char *rsptr = SvPV(PL_rs, rslen);
+	    if (SvUTF8(PL_rs) != SvUTF8(sv)) {
+		/* Assumption is that rs is shorter than the scalar.  */
+		if (SvUTF8(PL_rs)) {
+		    /* RS is utf8, scalar is 8 bit.  */
+		    bool is_utf8 = TRUE;
+		    temp_buffer = (char*)bytes_from_utf8((U8*)rsptr,
+							 &rslen, &is_utf8);
+		    if (is_utf8) {
+			/* Cannot downgrade, therefore cannot possibly match
+			 */
+			assert (temp_buffer == rsptr);
+			temp_buffer = NULL;
+			goto nope;
+		    }
+		    rsptr = temp_buffer;
+		} else {
+		    /* RS is 8 bit, scalar is utf8.  */
+		    temp_buffer = (char*)bytes_to_utf8((U8*)rsptr, &rslen);
+		    rsptr = temp_buffer;
+		}
+	    }
 	    if (rslen == 1) {
 		if (*s != *rsptr)
 		    goto nope;
@@ -1081,6 +1103,7 @@ Perl_do_chomp(pTHX_ register SV *sv)
 	SvSETMAGIC(sv);
     }
   nope:
+    Safefree(temp_buffer);
     return count;
 }
 
diff --git a/t/op/chop.t b/t/op/chop.t
index 87700de..68025b7 100755
--- a/t/op/chop.t
+++ b/t/op/chop.t
@@ -6,7 +6,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 51;
+plan tests => 91;
 
 $_ = 'abc';
 $c = do foo();
@@ -183,3 +183,29 @@ ok($@ =~ /Can\'t modify.*chop.*in.*assignment/);
 eval 'chomp($x, $y) = (1, 2);';
 ok($@ =~ /Can\'t modify.*chom?p.*in.*assignment/);
 
+my @chars = ("N", "\xd3", substr ("\xd4\x{100}", 0, 1), chr 1296);
+foreach my $start (@chars) {
+  foreach my $end (@chars) {
+    local $/ = $end;
+    my $message = "start=" . ord ($start) . " end=" . ord $end;
+    my $string = $start . $end;
+    chomp $string;
+    is ($string, $start, $message);
+
+    my $end_utf8 = $end;
+    utf8::encode ($end_utf8);
+    next if $end_utf8 eq $end;
+
+    # $end ne $end_utf8, so these should not chomp.
+    $string = $start . $end_utf8;
+    my $chomped = $string;
+    chomp $chomped;
+    is ($chomped, $string, "$message (end as bytes)");
+
+    $/ = $end_utf8;
+    $string = $start . $end;
+    $chomped = $string;
+    chomp $chomped;
+    is ($chomped, $string, "$message (\$/ as bytes)");
+  }
+}