From: Jarkko Hietaniemi Date: Tue, 13 Nov 2001 22:50:27 +0000 (+0000) Subject: Fix for the :utf8 read() bug noticed by Matt Sergeant: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d09651054f608b613dc2d41d0267898bf3d676f8;p=p5sagit%2Fp5-mst-13.2.git Fix for the :utf8 read() bug noticed by Matt Sergeant: "large enough" Unicode characters returned more than one as their "Unicode size". p4raw-id: //depot/perl@12981 --- diff --git a/pp_sys.c b/pp_sys.c index ea35136..ed70307 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -1517,6 +1517,9 @@ PP(pp_sysread) int fp_utf8; Size_t got = 0; Size_t wanted; + bool charstart = NULL; + STRLEN skip; + STRLEN charskip; gv = (GV*)*++MARK; if ((PL_op->op_type == OP_READ || PL_op->op_type == OP_SYSREAD) @@ -1563,6 +1566,9 @@ PP(pp_sysread) DIE(aTHX_ "Negative length"); wanted = length; + charstart = TRUE; + charskip = 0; + #ifdef HAS_SOCKET if (PL_op->op_type == OP_RECV) { char namebuf[MAXPATHLEN]; @@ -1683,23 +1689,30 @@ PP(pp_sysread) /* Look at utf8 we got back and count the characters */ char *bend = buffer + count; while (buffer < bend) { - STRLEN skip = UTF8SKIP(buffer); - if (buffer+skip > bend) { + if (charstart) { + skip = UTF8SKIP(buffer); + charskip = 0; + } + if (buffer - charskip + skip > bend) { /* partial character - try for rest of it */ length = skip - (bend-buffer); offset = bend - SvPVX(bufsv); + charstart = FALSE; + charskip += count; goto more_bytes; } else { got++; buffer += skip; + charstart = TRUE; + charskip = 0; } } /* If we have not 'got' the number of _characters_ we 'wanted' get some more provided amount read (count) was what was requested (length) */ if (got < wanted && count == length) { - length = (wanted-got); + length = wanted - got; offset = bend - SvPVX(bufsv); goto more_bytes; } diff --git a/t/io/utf8.t b/t/io/utf8.t index 1a7d27f..2d74cff 100755 --- a/t/io/utf8.t +++ b/t/io/utf8.t @@ -12,9 +12,7 @@ BEGIN { no utf8; # needed for use utf8 not griping about the raw octets $| = 1; -my $total_tests = 25; -if (ord('A') == 193) { $total_tests = 24; } # EBCDIC platforms do not warn on UTF-8 -print "1..$total_tests\n"; +print "1..26\n"; open(F,"+>:utf8",'a'); print F chr(0x100).'£'; @@ -175,12 +173,38 @@ print "not " unless $x eq $chr; print "ok 24\n"; # Now we have a deformed file. -open F, "<:utf8", "a" or die $!; -$x = ; chomp $x; -{ local $SIG{__WARN__} = sub { print "ok 25\n"; }; -eval { sprintf "%vd\n", $x; } + +if (ord('A') == 193) { + print "ok 25 # Skip: EBCDIC\n"; # EBCDIC doesn't complain +} else { + open F, "<:utf8", "a" or die $!; + $x = ; chomp $x; + local $SIG{__WARN__} = sub { print "ok 25\n" }; + eval { sprintf "%vd\n", $x }; } close F; unlink('a'); +open F, ">a"; +@a = map { chr(1 << ($_ << 2)) } 0..5; # 0x1, 0x10, .., 0x100000 +print F @a; +close F; +open F, "<:utf8", "a"; +$a = 0; +for (@a) { + unless (read(F, $b, 1) == 1 && + length($b) == 1 && + ord($b) == ord($_) && + tell(F) == ($a += bytes::length($b))) { + print '# ord($_) == ', ord($_), "\n"; + print '# ord($b) == ', ord($b), "\n"; + print '# length($b) == ', length($b), "\n"; + print '# tell(F) == ', tell(F), "\n"; + print "not "; + last; + } +} +print "ok 26\n"; + +END { 1 while unlink "a" }