From: Nick Ing-Simmons Date: Sat, 9 Dec 2000 19:47:30 +0000 (+0000) Subject: Make print, syswrite, send, readline, getc honour utf8-ness of PerlIO. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=7d59b7e40bca518078f3e97c802950b76d52efa2;p=p5sagit%2Fp5-mst-13.2.git Make print, syswrite, send, readline, getc honour utf8-ness of PerlIO. (sysread, recv and write i.e. formats still to do...) Allow :utf8 or :bytes in PerlIO_apply_layers() so that open($fh,">:utf8","name") etc. work. - "applying" those just sets/clears the UTF8 bit of the top layer, so no extra overhead is involved. Tweak t/comp/require.t to add a 'use bytes' to permit its dubious writing of BOM to a non-utf8 stream. Add initial io/utf8.t Fix SvPVutf8() - sv_2pv() was not expecting to be called with something that was already SvPOK() - (we just fossiked with SvUTF8 bit). Fix that and also just use the SvPV macro in sv_2pvutf8() to avoid the issue/overhead. p4raw-id: //depot/perlio@8054 --- diff --git a/MANIFEST b/MANIFEST index 2f33427..eaa7425 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1340,6 +1340,7 @@ t/io/pipe.t See if secure pipes work t/io/print.t See if print commands work t/io/read.t See if read works t/io/tell.t See if file seeking works +t/io/utf8.t See if file seeking works t/lib/abbrev.t See if Text::Abbrev works t/lib/ansicolor.t See if Term::ANSIColor works t/lib/anydbm.t See if AnyDBM_File works diff --git a/doio.c b/doio.c index 901ca71..d8168e1 100644 --- a/doio.c +++ b/doio.c @@ -1148,12 +1148,14 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp) } /* FALL THROUGH */ default: -#if 0 - /* XXX Fix this when the I/O disciplines arrive. XXX */ - if (DO_UTF8(sv)) - sv_utf8_downgrade(sv, FALSE); -#endif - tmps = SvPV(sv, len); + if (PerlIO_isutf8(fp)) { + tmps = SvPVutf8(sv, len); + } + else { + if (DO_UTF8(sv)) + sv_utf8_downgrade(sv, FALSE); + tmps = SvPV(sv, len); + } break; } /* To detect whether the process is about to overstep its diff --git a/perlio.c b/perlio.c index 874dece..278dde1 100644 --- a/perlio.c +++ b/perlio.c @@ -572,6 +572,14 @@ PerlIO_apply_layers(pTHX_ PerlIO *f, const char *mode, const char *names) } } } + else if ((e - s) == 4 && strncmp(s,"utf8",4) == 0) + { + PerlIOBase(f)->flags |= PERLIO_F_UTF8; + } + else if ((e - s) == 5 && strncmp(s,"bytes",5) == 0) + { + PerlIOBase(f)->flags &= ~PERLIO_F_UTF8; + } else { SV *layer = PerlIO_find_layer(s,e-s); @@ -606,7 +614,7 @@ PerlIO_binmode(pTHX_ PerlIO *f, int iotype, int mode, const char *names) { PerlIO_debug("PerlIO_binmode f=%p %s %c %x %s\n", f,PerlIOBase(f)->tab->name,iotype,mode, (names) ? names : "(Null)"); - if (!names || (O_TEXT != O_BINARY && mode & O_BINARY)) + if (!names || (O_TEXT != O_BINARY && (mode & O_BINARY))) { PerlIO *top = f; PerlIOl *l; diff --git a/pp_sys.c b/pp_sys.c index 314b885..e4640be 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -1129,6 +1129,16 @@ PP(pp_getc) TAINT; sv_setpv(TARG, " "); *SvPVX(TARG) = PerlIO_getc(IoIFP(GvIOp(gv))); /* should never be EOF */ + if (PerlIO_isutf8(IoIFP(GvIOp(gv)))) { + /* Find out how many bytes the char needs */ + Size_t len = UTF8SKIP(SvPVX(TARG)); + if (len > 1) { + SvGROW(TARG,len+1); + len = PerlIO_read(IoIFP(GvIOp(gv)),SvPVX(TARG)+1,len-1); + SvCUR_set(TARG,1+len); + } + SvUTF8_on(TARG); + } PUSHTARG; RETURN; } @@ -1490,10 +1500,7 @@ PP(pp_sysread) bufsv = *++MARK; if (! SvOK(bufsv)) sv_setpvn(bufsv, "", 0); - buffer = SvPV_force(bufsv, blen); length = SvIVx(*++MARK); - if (length < 0) - DIE(aTHX_ "Negative length"); SETERRNO(0,0); if (MARK < SP) offset = SvIVx(*++MARK); @@ -1502,6 +1509,15 @@ PP(pp_sysread) io = GvIO(gv); if (!io || !IoIFP(io)) goto say_undef; + if (PerlIO_isutf8(IoIFP(io))) { + buffer = SvPVutf8_force(bufsv, blen); + } + else { + buffer = SvPV_force(bufsv, blen); + } + if (length < 0) + DIE(aTHX_ "Negative length"); + #ifdef HAS_SOCKET if (PL_op->op_type == OP_RECV) { char namebuf[MAXPATHLEN]; @@ -1514,10 +1530,6 @@ PP(pp_sysread) if (bufsize >= 256) bufsize = 255; #endif -#ifdef OS2 /* At least Warp3+IAK: only the first byte of bufsize set */ - if (bufsize >= 256) - bufsize = 255; -#endif buffer = SvGROW(bufsv, length+1); /* 'offset' means 'flags' here */ length = PerlSock_recvfrom(PerlIO_fileno(IoIFP(io)), buffer, length, offset, @@ -1540,6 +1552,10 @@ PP(pp_sysread) if (PL_op->op_type == OP_RECV) DIE(aTHX_ PL_no_sock_func, "recv"); #endif + if (SvUTF8(bufsv) && offset) { + /* FIXME ! */ + Perl_croak(aTHX_ "Non zero offset not supported yet for utf8"); + } if (offset < 0) { if (-offset > blen) DIE(aTHX_ "Offset outside string"); @@ -1642,7 +1658,6 @@ PP(pp_send) char *buffer; Size_t length; SSize_t retval; - IV offset; STRLEN blen; MAGIC *mg; @@ -1664,7 +1679,6 @@ PP(pp_send) if (!gv) goto say_undef; bufsv = *++MARK; - buffer = SvPV(bufsv, blen); #if Size_t_size > IVSIZE length = (Size_t)SvNVx(*++MARK); #else @@ -1678,8 +1692,24 @@ PP(pp_send) retval = -1; if (ckWARN(WARN_CLOSED)) report_evil_fh(gv, io, PL_op->op_type); + goto say_undef; + } + + if (PerlIO_isutf8(IoIFP(io))) { + buffer = SvPVutf8(bufsv, blen); } - else if (PL_op->op_type == OP_SYSWRITE) { + else { + if (DO_UTF8(bufsv)) + sv_utf8_downgrade(bufsv, FALSE); + buffer = SvPV(bufsv, blen); + } + + if (PL_op->op_type == OP_SYSWRITE) { + IV offset; + if (DO_UTF8(bufsv)) { + /* length and offset are in chars */ + blen = sv_len_utf8(bufsv); + } if (MARK < SP) { offset = SvIVx(*++MARK); if (offset < 0) { @@ -1692,17 +1722,24 @@ PP(pp_send) offset = 0; if (length > blen - offset) length = blen - offset; + if (DO_UTF8(bufsv)) { + buffer = utf8_hop((U8 *)buffer, offset); + length = utf8_hop((U8 *)buffer, length) - (U8 *)buffer; + } + else { + buffer = buffer+offset; + } #ifdef PERL_SOCK_SYSWRITE_IS_SEND if (IoTYPE(io) == IoTYPE_SOCKET) { retval = PerlSock_send(PerlIO_fileno(IoIFP(io)), - buffer+offset, length, 0); + buffer, length, 0); } else #endif { /* See the note at doio.c:do_print about filesize limits. --jhi */ retval = PerlLIO_write(PerlIO_fileno(IoIFP(io)), - buffer+offset, length); + buffer, length); } } #ifdef HAS_SOCKET @@ -1710,12 +1747,13 @@ PP(pp_send) char *sockbuf; STRLEN mlen; sockbuf = SvPVx(*++MARK, mlen); + /* length is really flags */ retval = PerlSock_sendto(PerlIO_fileno(IoIFP(io)), buffer, blen, length, (struct sockaddr *)sockbuf, mlen); } else + /* length is really flags */ retval = PerlSock_send(PerlIO_fileno(IoIFP(io)), buffer, blen, length); - #else else DIE(aTHX_ PL_no_sock_func, "send"); diff --git a/sv.c b/sv.c index 87da8f7..3d25a2e 100644 --- a/sv.c +++ b/sv.c @@ -2192,7 +2192,11 @@ Perl_sv_2pv(pTHX_ register SV *sv, STRLEN *lp) return ""; } } - if (SvNOKp(sv)) { /* See note in sv_2uv() */ + if (SvPOK(sv)) { + *lp = SvCUR(sv); + return SvPVX(sv); + } + else if (SvNOKp(sv)) { /* See note in sv_2uv() */ /* XXXX 64-bit? IV may have better precision... */ /* I tried changing this to be 64-bit-aware and * the t/op/numconvert.t became very, very, angry. @@ -2328,7 +2332,7 @@ char * Perl_sv_2pvutf8(pTHX_ register SV *sv, STRLEN *lp) { sv_utf8_upgrade(sv); - return sv_2pv(sv,lp); + return SvPV(sv,*lp); } /* This function is only called on magical items */ @@ -4330,14 +4334,31 @@ Perl_sv_gets(pTHX_ register SV *sv, register PerlIO *fp, I32 append) #endif SvCUR_set(sv, bytesread); buffer[bytesread] = '\0'; + if (PerlIO_isutf8(fp)) + SvUTF8_on(sv); + else + SvUTF8_off(sv); return(SvCUR(sv) ? SvPVX(sv) : Nullch); } else if (RsPARA(PL_rs)) { rsptr = "\n\n"; rslen = 2; } - else - rsptr = SvPV(PL_rs, rslen); + else { + /* Get $/ i.e. PL_rs into same encoding as stream wants */ + if (PerlIO_isutf8(fp)) { + rsptr = SvPVutf8(PL_rs, rslen); + } + else { + if (SvUTF8(PL_rs)) { + if (!sv_utf8_downgrade(PL_rs, TRUE)) { + Perl_croak(aTHX_ "Wide character in $/"); + } + } + rsptr = SvPV(PL_rs, rslen); + } + } + rslast = rslen ? rsptr[rslen - 1] : '\0'; if (RsPARA(PL_rs)) { /* have to do this both before and after */ @@ -4556,6 +4577,11 @@ screamer2: } } + if (PerlIO_isutf8(fp)) + SvUTF8_on(sv); + else + SvUTF8_off(sv); + return (SvCUR(sv) - append) ? SvPVX(sv) : Nullch; } diff --git a/t/comp/require.t b/t/comp/require.t index eaea3ad..e634532 100755 --- a/t/comp/require.t +++ b/t/comp/require.t @@ -21,6 +21,7 @@ sub write_file { my $f = shift; open(REQ,">$f") or die "Can't write '$f': $!"; binmode REQ; + use bytes; print REQ @_; close REQ; } @@ -132,7 +133,7 @@ $i++; do_require(qq(${utf8}print "ok $i\n"; 1;\n)); sub bytes_to_utf16 { my $utf16 = pack("$_[0]*", unpack("C*", $_[1])); - return @_ == 3 && $_[2] ? pack("$_[0]", 0xFEFF) . $utf16 : $utf16; + return @_ == 3 && $_[2] ? pack("$_[0]", 0xFEFF) . $utf16 : $utf16; } $i++; do_require(bytes_to_utf16('n', qq(print "ok $i\\n"; 1;\n), 1)); # BE diff --git a/t/io/utf8.t b/t/io/utf8.t new file mode 100755 index 0000000..1e47c33 --- /dev/null +++ b/t/io/utf8.t @@ -0,0 +1,51 @@ +#!./perl + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + require Config; import Config; + unless ($Config{'useperlio'}) { + print "1..0 # Skip: not perlio\n"; + exit 0; + } +} + +$| = 1; +print "1..11\n"; + +open(F,"+>:utf8",'a'); +print F chr(0x100).'£'; +print '#'.tell(F)."\n"; +print "not " unless tell(F) == 4; +print "ok 1\n"; +print F "\n"; +print '#'.tell(F)."\n"; +print "not " unless tell(F) >= 5; +print "ok 2\n"; +seek(F,0,0); +print "not " unless getc(F) eq chr(0x100); +print "ok 3\n"; +print "not " unless getc(F) eq "£"; +print "ok 4\n"; +print "not " unless getc(F) eq "\n"; +print "ok 5\n"; +seek(F,0,0); +binmode(F,":bytes"); +print "not " unless getc(F) eq chr(0xc4); +print "ok 6\n"; +print "not " unless getc(F) eq chr(0x80); +print "ok 7\n"; +print "not " unless getc(F) eq chr(0xc2); +print "ok 8\n"; +print "not " unless getc(F) eq chr(0xa3); +print "ok 9\n"; +print "not " unless getc(F) eq "\n"; +print "ok 10\n"; +seek(F,0,0); +binmode(F,":utf8"); +print "not " unless scalar() eq "\x{100}£\n"; +print "ok 11\n"; +close(F); + +# unlink('a'); +