From: Nicholas Clark Date: Thu, 15 Jan 2004 00:03:04 +0000 (+0000) Subject: Make chomp heed the utf8 flags on the target string and $/ X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c4c87a065d5684a07ac86a151149508724e14d4e;p=p5sagit%2Fp5-mst-13.2.git Make chomp heed the utf8 flags on the target string and $/ [Fixes #24888] More work still needed to make chomp heed the encoding pragma. p4raw-id: //depot/perl@22155 --- diff --git a/doop.c b/doop.c index ea64ff8..6724aca 100644 --- a/doop.c +++ b/doop.c @@ -1008,6 +1008,7 @@ Perl_do_chomp(pTHX_ register SV *sv) STRLEN len; STRLEN n_a; char *s; + char *temp_buffer = NULL; if (RsSNARF(PL_rs)) return 0; @@ -1059,6 +1060,27 @@ Perl_do_chomp(pTHX_ register SV *sv) else { STRLEN rslen; char *rsptr = SvPV(PL_rs, rslen); + if (SvUTF8(PL_rs) != SvUTF8(sv)) { + /* Assumption is that rs is shorter than the scalar. */ + if (SvUTF8(PL_rs)) { + /* RS is utf8, scalar is 8 bit. */ + bool is_utf8 = TRUE; + temp_buffer = (char*)bytes_from_utf8((U8*)rsptr, + &rslen, &is_utf8); + if (is_utf8) { + /* Cannot downgrade, therefore cannot possibly match + */ + assert (temp_buffer == rsptr); + temp_buffer = NULL; + goto nope; + } + rsptr = temp_buffer; + } else { + /* RS is 8 bit, scalar is utf8. */ + temp_buffer = (char*)bytes_to_utf8((U8*)rsptr, &rslen); + rsptr = temp_buffer; + } + } if (rslen == 1) { if (*s != *rsptr) goto nope; @@ -1081,6 +1103,7 @@ Perl_do_chomp(pTHX_ register SV *sv) SvSETMAGIC(sv); } nope: + Safefree(temp_buffer); return count; } diff --git a/t/op/chop.t b/t/op/chop.t index 87700de..68025b7 100755 --- a/t/op/chop.t +++ b/t/op/chop.t @@ -6,7 +6,7 @@ BEGIN { require './test.pl'; } -plan tests => 51; +plan tests => 91; $_ = 'abc'; $c = do foo(); @@ -183,3 +183,29 @@ ok($@ =~ /Can\'t modify.*chop.*in.*assignment/); eval 'chomp($x, $y) = (1, 2);'; ok($@ =~ /Can\'t modify.*chom?p.*in.*assignment/); +my @chars = ("N", "\xd3", substr ("\xd4\x{100}", 0, 1), chr 1296); +foreach my $start (@chars) { + foreach my $end (@chars) { + local $/ = $end; + my $message = "start=" . ord ($start) . " end=" . ord $end; + my $string = $start . $end; + chomp $string; + is ($string, $start, $message); + + my $end_utf8 = $end; + utf8::encode ($end_utf8); + next if $end_utf8 eq $end; + + # $end ne $end_utf8, so these should not chomp. + $string = $start . $end_utf8; + my $chomped = $string; + chomp $chomped; + is ($chomped, $string, "$message (end as bytes)"); + + $/ = $end_utf8; + $string = $start . $end; + $chomped = $string; + chomp $chomped; + is ($chomped, $string, "$message (\$/ as bytes)"); + } +}