From: Nicholas Clark Date: Sat, 29 Apr 2006 23:33:36 +0000 (+0000) Subject: syswrite couldn't correctly handle surprises from UTF-8 overloading. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c9cb0f4189a96c284198e261ba3e6a7df9d31f55;p=p5sagit%2Fp5-mst-13.2.git syswrite couldn't correctly handle surprises from UTF-8 overloading. As part of fixing this, syswrite now tries to take advantage of the UTF-8 cache logic for lengths and offsets on regular scalars. p4raw-id: //depot/perl@28019 --- diff --git a/pp_sys.c b/pp_sys.c index 03d3b5f..fdc9937 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -1777,11 +1777,13 @@ PP(pp_send) IO *io; SV *bufsv; const char *buffer; - Size_t length = 0; SSize_t retval; STRLEN blen; + STRLEN orig_blen_bytes; MAGIC *mg; const int op_type = PL_op->op_type; + bool doing_utf8; + U8 *tmpbuf = NULL; GV *const gv = (GV*)*++MARK; if (PL_op->op_type == OP_SYSWRITE @@ -1813,19 +1815,6 @@ PP(pp_send) bufsv = *++MARK; - if (op_type == OP_SYSWRITE) { - if (MARK >= SP) { - length = (Size_t) sv_len(bufsv); - } else { -#if Size_t_size > IVSIZE - length = (Size_t)SvNVx(*++MARK); -#else - length = (Size_t)SvIVx(*++MARK); -#endif - if ((SSize_t)length < 0) - DIE(aTHX_ "Negative length"); - } - } SETERRNO(0,0); io = GvIO(gv); if (!io || !IoIFP(io)) { @@ -1836,43 +1825,105 @@ PP(pp_send) goto say_undef; } + /* Do this first to trigger any overloading. */ + buffer = SvPV_const(bufsv, blen); + orig_blen_bytes = blen; + doing_utf8 = DO_UTF8(bufsv); + if (PerlIO_isutf8(IoIFP(io))) { if (!SvUTF8(bufsv)) { - bufsv = sv_2mortal(newSVsv(bufsv)); - buffer = sv_2pvutf8(bufsv, &blen); - } else - buffer = SvPV_const(bufsv, blen); + /* We don't modify the original scalar. */ + tmpbuf = bytes_to_utf8((const U8*) buffer, &blen); + buffer = (char *) tmpbuf; + doing_utf8 = TRUE; + } } - else { - if (DO_UTF8(bufsv)) { - /* Not modifying source SV, so making a temporary copy. */ - bufsv = sv_2mortal(newSVsv(bufsv)); - sv_utf8_downgrade(bufsv, FALSE); - } - buffer = SvPV_const(bufsv, blen); + else if (doing_utf8) { + STRLEN tmplen = blen; + U8 *result = bytes_from_utf8((const U8*) buffer, &tmplen, &doing_utf8); + if (!doing_utf8) { + tmpbuf = result; + buffer = (char *) tmpbuf; + blen = tmplen; + } + else { + assert((char *)result == buffer); + Perl_croak(aTHX_ "Wide character in %s", OP_DESC(PL_op)); + } } if (op_type == OP_SYSWRITE) { + Size_t length = 0; /* This length is in characters. */ + STRLEN blen_chars; IV offset; - if (DO_UTF8(bufsv)) { - /* length and offset are in chars */ - blen = sv_len_utf8(bufsv); + + if (doing_utf8) { + if (tmpbuf) { + /* The SV is bytes, and we've had to upgrade it. */ + blen_chars = orig_blen_bytes; + } else { + /* The SV really is UTF-8. */ + if (SvGMAGICAL(bufsv) || SvAMAGIC(bufsv)) { + /* Don't call sv_len_utf8 again because it will call magic + or overloading a second time, and we might get back a + different result. */ + blen_chars = utf8_length(buffer, buffer + blen); + } else { + /* It's safe, and it may well be cached. */ + blen_chars = sv_len_utf8(bufsv); + } + } + } else { + blen_chars = blen; + } + + if (MARK >= SP) { + length = blen_chars; + } else { +#if Size_t_size > IVSIZE + length = (Size_t)SvNVx(*++MARK); +#else + length = (Size_t)SvIVx(*++MARK); +#endif + if ((SSize_t)length < 0) + DIE(aTHX_ "Negative length"); } + if (MARK < SP) { offset = SvIVx(*++MARK); if (offset < 0) { - if (-offset > (IV)blen) + if (-offset > (IV)blen_chars) DIE(aTHX_ "Offset outside string"); - offset += blen; - } else if (offset >= (IV)blen && blen > 0) + offset += blen_chars; + } else if (offset >= (IV)blen_chars && blen_chars > 0) DIE(aTHX_ "Offset outside string"); } else offset = 0; - if (length > blen - offset) - length = blen - offset; - if (DO_UTF8(bufsv)) { - buffer = (const char*)utf8_hop((const U8 *)buffer, offset); - length = utf8_hop((U8 *)buffer, length) - (U8 *)buffer; + if (length > blen_chars - offset) + length = blen_chars - offset; + if (doing_utf8) { + /* Here we convert length from characters to bytes. */ + if (tmpbuf || SvGMAGICAL(bufsv) || SvAMAGIC(bufsv)) { + /* Either we had to convert the SV, or the SV is magical, or + the SV has overloading, in which case we can't or mustn't + or mustn't call it again. */ + + buffer = (const char*)utf8_hop((const U8 *)buffer, offset); + length = utf8_hop((U8 *)buffer, length) - (U8 *)buffer; + } else { + /* It's a real UTF-8 SV, and it's not going to change under + us. Take advantage of any cache. */ + I32 start = offset; + I32 len_I32 = length; + + /* Convert the start and end character positions to bytes. + Remember that the second argument to sv_pos_u2b is relative + to the first. */ + sv_pos_u2b(bufsv, &start, &len_I32); + + buffer += start; + length = len_I32; + } } else { buffer = buffer+offset; @@ -1908,10 +1959,13 @@ PP(pp_send) else DIE(aTHX_ PL_no_sock_func, "send"); #endif + if (tmpbuf) + Safefree(tmpbuf); + if (retval < 0) goto say_undef; SP = ORIGMARK; - if (DO_UTF8(bufsv)) + if (doing_utf8) retval = utf8_length((U8*)buffer, (U8*)buffer + retval); #if Size_t_size > IVSIZE PUSHn(retval); diff --git a/t/lib/warnings/9uninit b/t/lib/warnings/9uninit index 07fffa8..575161d 100644 --- a/t/lib/warnings/9uninit +++ b/t/lib/warnings/9uninit @@ -1162,7 +1162,7 @@ our ($g1); eval { my $x; seek $x,$m1, $g1 }; eval { my $x; sysseek $x,$m1, $g1 }; -eval { syswrite $m1, $g1 }; +eval { syswrite $m1, $g1 }; # logic changed - now won't try $g1 if $m1 is bad # eval { syswrite STDERR, $m1 }; # XXX under utf8, can give # eval { syswrite STDERR, $m1, $g1 }; # XXX different warnings # eval { syswrite STDERR, $m1, $g1, $m2 }; @@ -1176,7 +1176,6 @@ Use of uninitialized value $x in ref-to-glob cast at - line 6. Use of uninitialized value $g1 in sysseek at - line 6. Use of uninitialized value $m1 in sysseek at - line 6. Use of uninitialized value $m1 in ref-to-glob cast at - line 7. -Use of uninitialized value $g1 in syswrite at - line 7. Use of uninitialized value $m2 in socket at - line 11. Use of uninitialized value $g1 in socket at - line 11. Use of uninitialized value $m1 in socket at - line 11. diff --git a/t/uni/overload.t b/t/uni/overload.t index 478544c..5812425 100644 --- a/t/uni/overload.t +++ b/t/uni/overload.t @@ -7,7 +7,7 @@ BEGIN { } } -use Test::More tests => 68; +use Test::More tests => 116; package UTF8Toggle; use strict; @@ -151,17 +151,46 @@ SKIP: { my $tmpfile = 'overload.tmp'; -foreach my $operator (qw (print)) { +foreach my $operator ('print', 'syswrite', 'syswrite len', 'syswrite off', + 'syswrite len off') { foreach my $layer ('', ':utf8') { open my $fh, "+>$layer", $tmpfile or die $!; - my $u = UTF8Toggle->new("\311\n"); - print $fh $u; - print $fh $u; - print $fh $u; - my $l = UTF8Toggle->new("\351\n", 1); - print $fh $l; - print $fh $l; - print $fh $l; + my $pad = $operator =~ /\boff\b/ ? "\243" : ""; + my $trail = $operator =~ /\blen\b/ ? "!" : ""; + my $u = UTF8Toggle->new("$pad\311\n$trail"); + my $l = UTF8Toggle->new("$pad\351\n$trail", 1); + if ($operator eq 'print') { + print $fh $u; + print $fh $u; + print $fh $u; + print $fh $l; + print $fh $l; + print $fh $l; + } elsif ($operator eq 'syswrite') { + syswrite $fh, $u; + syswrite $fh, $u; + syswrite $fh, $u; + syswrite $fh, $l; + syswrite $fh, $l; + syswrite $fh, $l; + } elsif ($operator eq 'syswrite len') { + syswrite $fh, $u, 2; + syswrite $fh, $u, 2; + syswrite $fh, $u, 2; + syswrite $fh, $l, 2; + syswrite $fh, $l, 2; + syswrite $fh, $l, 2; + } elsif ($operator eq 'syswrite off' + || $operator eq 'syswrite len off') { + syswrite $fh, $u, 2, 1; + syswrite $fh, $u, 2, 1; + syswrite $fh, $u, 2, 1; + syswrite $fh, $l, 2, 1; + syswrite $fh, $l, 2, 1; + syswrite $fh, $l, 2, 1; + } else { + die $operator; + } seek $fh, 0, 0 or die $!; my $line;