From: Nick Ing-Simmons <nik@tiuk.ti.com>
Date: Sun, 3 Feb 2002 17:32:03 +0000 (+0000)
Subject: Stable intermediate point in Encode cleanup.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=0b3236bb1fb664bc9c9ccd069cac189e80c3ef35;p=p5sagit%2Fp5-mst-13.2.git

Stable intermediate point in Encode cleanup.
:encode(euc-jp) works on Dan's table.euc
Much buffer copying and other silliness remains.

p4raw-id: //depot/perlio@14536
---

diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 7af36ad..e804583 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -1,6 +1,6 @@
 package Encode;
 use strict;
-our $VERSION = do {my @r=(q$Revision: 0.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r};
+our $VERSION = '0.30';
 
 require DynaLoader;
 require Exporter;
@@ -125,7 +125,7 @@ define_alias( qr/^iso8859(\d+)$/i => '"iso-8859-$1"' );
 # More HP stuff.
 define_alias( qr/^(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
 
-# The Official name of ASCII. 
+# The Official name of ASCII.
 define_alias( qr/^ANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
 
 # This is a font issue, not an encoding issue.
@@ -444,7 +444,7 @@ As of Perl 5.8.0, at least the following encodings are recognized
 
 The Unicode:
 
-  UTF-8   
+  UTF-8
   UTF-16
   UCS-2
 
@@ -461,7 +461,7 @@ The ISO 8859 and KOI:
 
   Latin1  => 8859-1  Latin6  => 8859-10
   Latin2  => 8859-2  Latin7  => 8859-13
-  Latin3  => 8859-3  Latin8  => 8859-14 
+  Latin3  => 8859-3  Latin8  => 8859-14
   Latin4  => 8859-4  Latin9  => 8859-15
   Latin5  => 8859-9  Latin10 => 8859-16
 
@@ -470,14 +470,14 @@ The ISO 8859 and KOI:
   Greek    => 8859-7
   Hebrew   => 8859-8
   Thai     => 8859-11
-  TIS620   => 8859-11 
+  TIS620   => 8859-11
 
 The CJKV: Chinese, Japanese, Korean, Vietnamese:
 
-  ISO 2022     ISO 2022 JP-1  JIS 0201  GB 1988   Big5       EUC-CN       
-  ISO 2022 CN  ISO 2022 JP-2  JIS 0208  GB 2312   HZ         EUC-JP     
+  ISO 2022     ISO 2022 JP-1  JIS 0201  GB 1988   Big5       EUC-CN
+  ISO 2022 CN  ISO 2022 JP-2  JIS 0208  GB 2312   HZ         EUC-JP
   ISO 2022 JP  ISO 2022 KR    JIS 0210  GB 12345  CNS 11643  EUC-JP-0212
-  Shift-JIS                                                  EUC-KR     
+  Shift-JIS                                                  EUC-KR
   VISCII
 
 The PC codepages:
@@ -502,13 +502,13 @@ The PC codepages:
 
 The Mac codepages:
 
-  MacCentralEuropean   MacJapanese        
-  MacCroatian          MacRoman           
-  MacCyrillic          MacRumanian        
-  MacDingbats          MacSami            
-  MacGreek             MacThai            
-  MacIcelandic         MacTurkish         
-                       MacUkraine         
+  MacCentralEuropean   MacJapanese
+  MacCroatian          MacRoman
+  MacCyrillic          MacRumanian
+  MacDingbats          MacSami
+  MacGreek             MacThai
+  MacIcelandic         MacTurkish
+                       MacUkraine
 
 Miscellaneous:
 
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index 825f9cd..c48a5a0 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -42,8 +42,9 @@ UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
 #include "perliol.h"
 typedef struct {
     PerlIOBuf base;		/* PerlIOBuf stuff */
-    SV *bufsv;
-    SV *enc;
+    SV *bufsv;			/* buffer seen by layers above */
+    SV *dataSV;			/* data we have read from layer below */
+    SV *enc;			/* the encoding object */
 } PerlIOEncode;
 
 SV *
@@ -115,6 +116,10 @@ PerlIOEncode_popped(pTHX_ PerlIO * f)
 	SvREFCNT_dec(e->bufsv);
 	e->bufsv = Nullsv;
     }
+    if (e->dataSV) {
+	SvREFCNT_dec(e->dataSV);
+	e->bufsv = Nullsv;
+    }
     return 0;
 }
 
@@ -160,45 +165,129 @@ PerlIOEncode_fill(pTHX_ PerlIO * f)
 {
     PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
     dSP;
-    IV code;
-    code = PerlIOBuf_fill(aTHX_ f);
-    if (code == 0) {
+    IV code = 0;
+    PerlIO *n;
+    SSize_t avail;
+    if (PerlIO_flush(f) != 0)
+	return -1;
+    n  = PerlIONext(f);
+    if (!PerlIO_fast_gets(n)) {
+	/* Things get too messy if we don't have a buffer layer
+	   push a :perlio to do the job */
+	char mode[8];
+	n  = PerlIO_push(aTHX_ n, &PerlIO_perlio, PerlIO_modestr(f,mode), Nullsv);
+	if (!n) {
+	    Perl_die(aTHX_ "panic: cannot push :perlio for %p",f);
+	}
+    }
+    ENTER;
+    SAVETMPS;
+  retry:
+    avail = PerlIO_get_cnt(n);
+    if (avail <= 0) {
+	avail = PerlIO_fill(n);
+	if (avail == 0) {
+	    avail = PerlIO_get_cnt(n);
+	}
+	else {
+	    if (!PerlIO_error(n) && PerlIO_eof(n))
+		avail = 0;
+	}
+    }
+    if (avail > 0) {
+	STDCHAR *ptr = PerlIO_get_ptr(n);
+	SSize_t use  = avail;
 	SV *uni;
-	STRLEN len;
 	char *s;
-	/* Set SV that is the buffer to be buf..ptr */
-	SvCUR_set(e->bufsv, e->base.end - e->base.buf);
-	SvUTF8_off(e->bufsv);
-	ENTER;
-	SAVETMPS;
+	STRLEN len = 0;
+	e->base.ptr = e->base.end = (STDCHAR *) Nullch;
+	(void) PerlIOEncode_get_base(aTHX_ f);
+	if (!e->dataSV)
+	    e->dataSV = newSV(0);
+	if (SvTYPE(e->dataSV) < SVt_PV) {
+	    sv_upgrade(e->dataSV,SVt_PV);
+	}
+	if (SvCUR(e->dataSV)) {
+	    /* something left over from last time - create a normal
+	       SV with new data appended
+	     */
+	    if (use + SvCUR(e->dataSV) > e->base.bufsiz) {
+	       use = e->base.bufsiz - SvCUR(e->dataSV);
+	    }
+	    sv_catpvn(e->dataSV,ptr,use);
+	}
+	else {
+	    /* Create a "dummy" SV to represent the available data from layer below */
+	    if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) {
+		Safefree(SvPVX(e->dataSV));
+	    }
+	    if (use > e->base.bufsiz) {
+	       use = e->base.bufsiz;
+	    }
+	    SvPVX(e->dataSV) = (char *) ptr;
+	    SvLEN(e->dataSV) = 0;  /* Hands off sv.c - it isn't yours */
+	    SvCUR_set(e->dataSV,use);
+	    SvPOK_on(e->dataSV);
+	}
+	SvUTF8_off(e->dataSV);
 	PUSHMARK(sp);
 	XPUSHs(e->enc);
-	XPUSHs(e->bufsv);
+	XPUSHs(e->dataSV);
 	XPUSHs(&PL_sv_yes);
 	PUTBACK;
-	if (perl_call_method("decode", G_SCALAR) != 1)
-	    code = -1;
+	if (perl_call_method("decode", G_SCALAR) != 1) {
+	    Perl_die(aTHX_ "panic: decode did not return a value");
+	}
 	SPAGAIN;
 	uni = POPs;
 	PUTBACK;
-	/* Now get translated string (forced to UTF-8) and copy back to buffer
-	   don't use sv_setsv as that may "steal" PV from returned temp
-	   and so free() our known-large-enough buffer.
-	   sv_setpvn() should do but let us do it long hand.
-	 */
-	s = SvPVutf8(uni, len);
-	if (s != SvPVX(e->bufsv)) {
-	    e->base.buf = (STDCHAR *) SvGROW(e->bufsv, len);
-	    Move(s, e->base.buf, len, char);
-	    SvCUR_set(e->bufsv, len);
+	/* Now get translated string (forced to UTF-8) and use as buffer */
+	if (SvPOK(uni)) {
+	    s = SvPVutf8(uni, len);
+	    if (len && !is_utf8_string(s,len)) {
+		Perl_warn(aTHX_ "panic: decode did not return UTF-8 '%.*s'",(int) len,s);
+	    }
+	}
+	if (len > 0) {
+	    /* Got _something */
+	    /* if decode gave us back dataSV then data may vanish when
+	       we do ptrcnt adjust - so take our copy now.
+	       (The copy is a pain - need a put-it-here option for decode.)
+	     */
+	    sv_setpvn(e->bufsv,s,len);
+	    e->base.ptr = e->base.buf = SvPVX(e->bufsv);
+	    e->base.end = e->base.ptr + SvCUR(e->bufsv);
+	    PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
+	    SvUTF8_on(e->bufsv);
+
+	    /* Adjust ptr/cnt not taking anything which
+	       did not translate - not clear this is a win */
+	    /* compute amount we took */
+	    use -= SvCUR(e->dataSV);
+	    PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
+	    /* and as we did not take it it isn't pending */
+	    SvCUR_set(e->dataSV,0);
+	} else {
+	    /* Got nothing - assume partial character so we need some more */
+	    /* Make sure e->dataSV is a normal SV before re-filling as
+	       buffer alias will change under us
+	     */
+	    s = SvPV(e->dataSV,len);
+	    sv_setpvn(e->dataSV,s,len);
+	    PerlIO_set_ptrcnt(n, ptr+use, (avail-use));
+	    goto retry;
 	}
-	SvUTF8_on(e->bufsv);
-	e->base.end = e->base.buf + len;
-	e->base.ptr = e->base.buf;
 	FREETMPS;
 	LEAVE;
+	return code;
+    }
+    else {
+	if (avail == 0)
+	    PerlIOBase(f)->flags |= PERLIO_F_EOF;
+	else
+	    PerlIOBase(f)->flags |= PERLIO_F_ERROR;
+	return -1;
     }
-    return code;
 }
 
 IV
@@ -206,50 +295,84 @@ PerlIOEncode_flush(pTHX_ PerlIO * f)
 {
     PerlIOEncode *e = PerlIOSelf(f, PerlIOEncode);
     IV code = 0;
-    if (e->bufsv
-	&& (PerlIOBase(f)->flags & (PERLIO_F_RDBUF | PERLIO_F_WRBUF))
-	&& (e->base.ptr > e->base.buf)
-	) {
+    if (e->bufsv && (e->base.ptr > e->base.buf)) {
 	dSP;
 	SV *str;
 	char *s;
 	STRLEN len;
-	SSize_t left = 0;
-	if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
-	    /* This is really just a flag to see if we took all the data, if
-	       we did PerlIOBase_flush avoids a seek to lower layer.
-	       Need to revisit if we start getting clever with unreads or seeks-in-buffer
-	     */
-	    left = e->base.end - e->base.ptr;
+	SSize_t count = 0;
+	if (PerlIOBase(f)->flags & PERLIO_F_WRBUF) {
+	    /* Write case encode the buffer and write() to layer below */
+	    ENTER;
+	    SAVETMPS;
+	    PUSHMARK(sp);
+	    XPUSHs(e->enc);
+	    SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
+	    SvUTF8_on(e->bufsv);
+	    Perl_warn(aTHX_ "flush %_",e->bufsv);
+	    XPUSHs(e->bufsv);
+	    XPUSHs(&PL_sv_yes);
+	    PUTBACK;
+	    if (perl_call_method("encode", G_SCALAR) != 1)
+		code = -1;
+	    SPAGAIN;
+	    str = POPs;
+	    PUTBACK;
+	    s = SvPV(str, len);
+	    count = PerlIO_write(PerlIONext(f),s,len);
+	    if (count != len) {
+		code = -1;
+	    }
+	    FREETMPS;
+	    LEAVE;
+	    if (PerlIO_flush(PerlIONext(f)) != 0) {
+		code = -1;
+	    }
 	}
-	ENTER;
-	SAVETMPS;
-	PUSHMARK(sp);
-	XPUSHs(e->enc);
-	SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
-	SvUTF8_on(e->bufsv);
-	XPUSHs(e->bufsv);
-	XPUSHs(&PL_sv_yes);
-	PUTBACK;
-	if (perl_call_method("encode", G_SCALAR) != 1)
-	    code = -1;
-	SPAGAIN;
-	str = POPs;
-	PUTBACK;
-	s = SvPV(str, len);
-	if (s != SvPVX(e->bufsv)) {
-	    e->base.buf = (STDCHAR *) SvGROW(e->bufsv, len);
-	    Move(s, e->base.buf, len, char);
-	    SvCUR_set(e->bufsv, len);
+	else if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) {
+	    /* read case */
+	    /* if we have any untranslated stuff then unread that first */
+	    if (e->dataSV && SvCUR(e->dataSV)) {
+		s = SvPV(e->dataSV, len);
+		count = PerlIO_unread(PerlIONext(f),s,len);
+		if (count != len) {
+		    code = -1;
+		}
+	    }
+	    /* See if there is anything left in the buffer */
+	    if (e->base.ptr < e->base.end) {
+		/* Bother - have unread data.
+		   re-encode and unread() to layer below
+		 */
+		ENTER;
+		SAVETMPS;
+		str = sv_newmortal();
+		sv_upgrade(str, SVt_PV);
+		SvPVX(str) = e->base.ptr;
+		SvLEN(str) = 0;
+		SvCUR_set(str, e->base.end - e->base.ptr);
+		SvUTF8_on(str);
+		PUSHMARK(sp);
+		XPUSHs(e->enc);
+		XPUSHs(str);
+		XPUSHs(&PL_sv_yes);
+		PUTBACK;
+		if (perl_call_method("encode", G_SCALAR) != 1)
+		    code = -1;
+		SPAGAIN;
+		str = POPs;
+		PUTBACK;
+		s = SvPV(str, len);
+		count = PerlIO_unread(PerlIONext(f),s,len);
+		if (count != len) {
+		    code = -1;
+		}
+		FREETMPS;
+		LEAVE;
+	    }
 	}
-	SvUTF8_off(e->bufsv);
-	e->base.ptr = e->base.buf + len;
-	/* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
-	e->base.end = e->base.ptr + left;
-	FREETMPS;
-	LEAVE;
-	if (PerlIOBuf_flush(aTHX_ f) != 0)
-	    code = -1;
+	e->base.ptr = e->base.end = e->base.buf;
+	PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF | PERLIO_F_WRBUF);
     }
     return code;
 }
@@ -274,30 +397,11 @@ Off_t
 PerlIOEncode_tell(pTHX_ PerlIO * f)
 {
     PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
-    /* Unfortunately the only way to get a postion is to back-translate,
-       the UTF8-bytes we have buf..ptr and adjust accordingly.
-       But we will try and save any unread data in case stream
-       is un-seekable.
+    /* Unfortunately the only way to get a postion is to (re-)translate,
+       the UTF8 we have in bufefr and then ask layer below
      */
-    if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end) {
-	Size_t count = b->end - b->ptr;
-	PerlIO_push(aTHX_ f, &PerlIO_pending, "r", Nullsv);
-	/* Save what we have left to read */
-	PerlIOSelf(f, PerlIOBuf)->bufsiz = count;
-	PerlIO_unread(f, b->ptr, count);
-	/* There isn't any unread data - we just saved it - so avoid the lower seek */
-	b->end = b->ptr;
-	/* Flush ourselves - now one layer down,
-	   this does the back translate and adjusts position
-	 */
-	PerlIO_flush(PerlIONext(f));
-	/* Set position of the saved data */
-	PerlIOSelf(f, PerlIOBuf)->posn = b->posn;
-    }
-    else {
-	PerlIO_flush(f);
-    }
-    return b->posn;
+    PerlIO_flush(f);
+    return PerlIO_tell(PerlIONext(f));
 }
 
 PerlIO *
@@ -373,29 +477,42 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
 {
     STRLEN slen;
     U8 *s = (U8 *) SvPV(src, slen);
-    SV *dst = sv_2mortal(newSV(2 * slen + 1));
+    STRLEN tlen = slen;
+    SV *dst = sv_2mortal(newSV(slen+1));
     if (slen) {
-	U8 *d = (U8 *) SvGROW(dst, 2 * slen + 1);
-	STRLEN dlen = SvLEN(dst);
+	U8 *d = (U8 *) SvPVX(dst);
+	STRLEN dlen = SvLEN(dst)-1;
 	int code;
 	while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
 	    SvCUR_set(dst, dlen);
 	    SvPOK_on(dst);
 
-	    if (code == ENCODE_FALLBACK)
+#if 0
+	    Perl_warn(aTHX_ "code=%d @ s=%d/%d d=%d",code,slen,tlen,dlen);
+#endif
+	
+	    if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
 		break;
 
 	    switch (code) {
 	    case ENCODE_NOSPACE:
 		{
-		    STRLEN need = dlen + UTF8_MAXLEN * 128;	/* 128 is too big or small? */
+		    STRLEN done = tlen-slen;
+		    STRLEN need ;
+		    if (done) {
+			need = (tlen*dlen)/done+1;
+		    }
+		    else {
+			need = dlen + UTF8_MAXLEN;
+		    }
+		
 		    d = (U8 *) SvGROW(dst, need);
 		    if (dlen >= SvLEN(dst)) {
 			Perl_croak(aTHX_
 				   "Destination couldn't be grown (the need may be miscalculated).");
 		    }
 		    dlen = SvLEN(dst);
-		    slen = SvCUR(src);
+		    slen = tlen;
 		    break;
 		}
 
@@ -427,14 +544,6 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
 		}
 		break;
 
-	    case ENCODE_PARTIAL:
-		if (!check && ckWARN_d(WARN_UTF8)) {
-		    Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
-				(dir ==
-				 enc->f_utf8) ? "UTF-8" : enc->name[0]);
-		}
-		return &PL_sv_undef;
-
 	    default:
 		Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
 			   code, (dir == enc->f_utf8) ? "to" : "from",
@@ -449,12 +558,14 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
 		Move(s + slen, s, SvCUR(src) - slen, U8);
 	    }
 	    SvCUR_set(src, SvCUR(src) - slen);
+	    *SvEND(src) = '\0';
 	}
     }
     else {
-	SvCUR_set(dst, slen);
+	SvCUR_set(dst, 0);
 	SvPOK_on(dst);
     }
+    *SvEND(dst) = '\0';
     return dst;
 }
 
diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm
index 11bc01d..1499955 100644
--- a/ext/Encode/lib/Encode/Encoding.pm
+++ b/ext/Encode/lib/Encode/Encoding.pm
@@ -1,8 +1,7 @@
 package Encode::Encoding;
 # Base class for classes which implement encodings
 use strict;
-our $VERSION = 
-    do {my @r=(q$Revision: 0.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r};
+our $VERSION = '0.02';
 
 sub Define
 {