From: SADAHIRO Tomoyuki Date: Fri, 16 Jan 2004 04:13:00 +0000 (+0900) Subject: Re: [perl #24888] chomp ignores utf8 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a6aa349da2cd706a05b205fa788c278b74c24bdc;p=p5sagit%2Fp5-mst-13.2.git Re: [perl #24888] chomp ignores utf8 Message-Id: <20040116040355.A849.BQW10602@nifty.com> Date: Fri, 16 Jan 2004 04:13:00 +0900 p4raw-id: //depot/perl@22196 --- diff --git a/MANIFEST b/MANIFEST index edcb2de..b8a5ee7 100644 --- a/MANIFEST +++ b/MANIFEST @@ -2930,6 +2930,7 @@ t/TEST The regression tester t/TestInit.pm Preamble library for core tests t/test.pl Simple testing library t/uni/case.pl See if Unicode casing works +t/uni/chomp.t See if Unicode chomp works t/uni/fold.t See if Unicode folding works t/uni/lower.t See if Unicode casing works t/uni/sprintf.t See if Unicode sprintf works diff --git a/doop.c b/doop.c index 47d64cb..545a70e 100644 --- a/doop.c +++ b/doop.c @@ -1009,6 +1009,7 @@ Perl_do_chomp(pTHX_ register SV *sv) STRLEN n_a; char *s; char *temp_buffer = NULL; + SV* svrecode = Nullsv; if (RsSNARF(PL_rs)) return 0; @@ -1044,6 +1045,18 @@ Perl_do_chomp(pTHX_ register SV *sv) if (SvREADONLY(sv)) Perl_croak(aTHX_ PL_no_modify); } + + if (PL_encoding) { + if (!SvUTF8(sv)) { + /* XXX, here sv is utf8-ized as a side-effect! + If encoding.pm is used properly, almost string-generating + operations, including literal strings, chr(), input data, etc. + should have been utf8-ized already, right? + */ + sv_recode_to_utf8(sv, PL_encoding); + } + } + s = SvPV(sv, len); if (s && len) { s += --len; @@ -1058,8 +1071,13 @@ Perl_do_chomp(pTHX_ register SV *sv) } } else { - STRLEN rslen; + STRLEN rslen, rs_charlen; char *rsptr = SvPV(PL_rs, rslen); + + rs_charlen = SvUTF8(PL_rs) + ? sv_len_utf8(PL_rs) + : rslen; + if (SvUTF8(PL_rs) != SvUTF8(sv)) { /* Assumption is that rs is shorter than the scalar. */ if (SvUTF8(PL_rs)) { @@ -1075,7 +1093,16 @@ Perl_do_chomp(pTHX_ register SV *sv) goto nope; } rsptr = temp_buffer; - } else { + } + else if (PL_encoding) { + /* RS is 8 bit, encoding.pm is used. + * Do not recode PL_rs as a side-effect. */ + svrecode = newSVpvn(rsptr, rslen); + sv_recode_to_utf8(svrecode, PL_encoding); + rsptr = SvPV(svrecode, rslen); + rs_charlen = sv_len_utf8(svrecode); + } + else { /* RS is 8 bit, scalar is utf8. */ temp_buffer = (char*)bytes_to_utf8((U8*)rsptr, &rslen); rsptr = temp_buffer; @@ -1093,7 +1120,7 @@ Perl_do_chomp(pTHX_ register SV *sv) s -= rslen - 1; if (memNE(s, rsptr, rslen)) goto nope; - count += rslen; + count += rs_charlen; } } s = SvPV_force(sv, n_a); @@ -1103,6 +1130,10 @@ Perl_do_chomp(pTHX_ register SV *sv) SvSETMAGIC(sv); } nope: + + if (svrecode) + SvREFCNT_dec(svrecode); + Safefree(temp_buffer); return count; } diff --git a/t/op/chop.t b/t/op/chop.t index 68025b7..29f5ddd 100755 --- a/t/op/chop.t +++ b/t/op/chop.t @@ -6,7 +6,7 @@ BEGIN { require './test.pl'; } -plan tests => 91; +plan tests => 93; $_ = 'abc'; $c = do foo(); @@ -209,3 +209,16 @@ foreach my $start (@chars) { is ($chomped, $string, "$message (\$/ as bytes)"); } } + +{ + # returns length in characters, but not in bytes. + $/ = "\x{100}"; + $a = "A$/"; + $b = chomp $a; + is ($b, 1); + + $/ = "\x{100}\x{101}"; + $a = "A$/"; + $b = chomp $a; + is ($b, 2); +} diff --git a/t/uni/chomp.t b/t/uni/chomp.t new file mode 100644 index 0000000..1cb3d15 --- /dev/null +++ b/t/uni/chomp.t @@ -0,0 +1,64 @@ +#!./perl -w + +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + unless (PerlIO::Layer->find('perlio')){ + print "1..0 # Skip: PerlIO required\n"; + exit 0; + } + eval 'use Encode'; + if ($@ =~ /dynamic loading not available/) { + print "1..0 # Skip: no dynamic loading, no Encode\n"; + exit 0; + } +} + +use strict; +use Test::More tests => (4 * 4 * 4) * (3); # (@char ** 3) * (keys %mbchars) + +# %mbchars = (encoding => { bytes => utf8, ... }, ...); +# * pack('C*') is expected to return bytes even if ${^ENCODING} is true. +our %mbchars = ( + 'big-5' => { + pack('C*', 0x40) => pack('U*', 0x40), # COMMERCIAL AT + pack('C*', 0xA4, 0x40) => "\x{4E00}", # CJK-4E00 + }, + 'euc-jp' => { + pack('C*', 0xB0, 0xA1) => "\x{4E9C}", # CJK-4E9C + pack('C*', 0x8F, 0xB0, 0xA1) => "\x{4E02}", # CJK-4E02 + }, + 'shift-jis' => { + pack('C*', 0xA9) => "\x{FF69}", # halfwidth katakana small U + pack('C*', 0x82, 0xA9) => "\x{304B}", # hiragana KA + }, +); + +for my $enc (sort keys %mbchars) { + local ${^ENCODING} = find_encoding($enc); + my @char = (sort(keys %{ $mbchars{$enc} }), + sort(values %{ $mbchars{$enc} })); + + for my $rs (@char) { + local $/ = $rs; + for my $start (@char) { + for my $end (@char) { + my $string = $start.$end; + my $expect = $end eq $rs ? $start : $string; + chomp $string; + is($string, $expect); + } + } + } +}