From: Nick Ing-Simmons <nik@tiuk.ti.com>
Date: Mon, 1 Jan 2001 21:58:50 +0000 (+0000)
Subject: More rigor in UTF-8-ness of Encode's toUnicode
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a12c0f5690a38b5b84d767b0f8b7cc133a68affc;p=p5sagit%2Fp5-mst-13.2.git

More rigor in UTF-8-ness of Encode's toUnicode
XS export some of the utf8 internal functions.
Test some of the functions.
Fix pp_concat() bug shown by said tests.

p4raw-id: //depot/perlio@8293
---

diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 42c9e84..1f4ffb1 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -22,6 +22,10 @@ require Exporter;
        off_utf8
        utf_to_utf
        encodings
+       utf8_decode
+       utf8_encode
+       utf8_upgrade
+       utf8_downgrade
       );
 
 bootstrap Encode ();
@@ -409,13 +413,19 @@ sub getEncoding
 package Encode::Unicode;
 
 # Dummy package that provides the encode interface but leaves data
-# as UTF-8 encoded. It is here so that from_to()
+# as UTF-8 encoded. It is here so that from_to() works.
 
 sub name { 'Unicode' }
 
-sub toUnicode   { $_[1] }
+sub toUnicode
+{
+ my ($obj,$str,$chk) = @_;
+ Encode::utf8_upgrade($str);
+ $_[1] = '' if $chk;
+ return $str;
+}
 
-sub fromUnicode { $_[1] }
+*fromUnicode = \&toUnicode;
 
 package Encode::Table;
 
@@ -550,6 +560,7 @@ sub toUnicode
    $uni .= chr($code);
   }
  $_[1] = $str if $chk;
+ Encode::utf8_upgrade($uni);
  return $uni;
 }
 
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index b61d89b..7ea22d4 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -415,6 +415,38 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
  return dst;
 }
 
+MODULE = Encode		PACKAGE = Encode	PREFIX = sv_
+
+void
+valid_utf8(sv)
+SV *	sv
+CODE:
+ {
+  STRLEN len;
+  char *s = SvPV(sv,len);
+  if (!SvUTF8(sv) || is_utf8_string(s,len))
+   XSRETURN_YES;
+  else
+   XSRETURN_NO;
+ }
+
+void
+sv_utf8_encode(sv)
+SV *	sv
+
+bool
+sv_utf8_decode(sv)
+SV *	sv
+
+void
+sv_utf8_upgrade(sv)
+SV *	sv
+
+bool
+sv_utf8_downgrade(sv,failok=0)
+SV *	sv
+bool	failok
+
 MODULE = Encode		PACKAGE = Encode::XS	PREFIX = Encode_
 
 PROTOTYPES: ENABLE
diff --git a/pp_hot.c b/pp_hot.c
index f9c5960..795ba32 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -155,6 +155,10 @@ PP(pp_concat)
     left_utf8  = DO_UTF8(left);
     right_utf8 = DO_UTF8(right);
 
+    if (!left_utf8 && !right_utf8 && SvUTF8(TARG)) {
+	SvUTF8_off(TARG);
+    }
+
     if (left_utf8 != right_utf8 && !IN_BYTE) {
         if (TARG == right && !right_utf8) {
             sv_utf8_upgrade(TARG); /* Now straight binary copy */
@@ -289,7 +293,7 @@ PP(pp_eq)
 	if (SvIOK(TOPm1s)) {
 	    bool auvok = SvUOK(TOPm1s);
 	    bool buvok = SvUOK(TOPs);
-	    
+	
 	    if (!auvok && !buvok) { /* ## IV == IV ## */
 		IV aiv = SvIVX(TOPm1s);
 		IV biv = SvIVX(TOPs);
@@ -416,7 +420,7 @@ PP(pp_add)
 	if (SvIOK(TOPm1s)) {
 	    bool auvok = SvUOK(TOPm1s);
 	    bool buvok = SvUOK(TOPs);
-	    
+	
 	    if (!auvok && !buvok) { /* ## IV + IV ## */
 		IV aiv = SvIVX(TOPm1s);
 		IV biv = SvIVX(TOPs);
@@ -459,7 +463,7 @@ PP(pp_add)
 		    aiv = SvIVX(TOPs);
 		    buv = SvUVX(TOPm1s);
 		}
-	    
+	
 		if (aiv >= 0) {
 		    UV result = (UV)aiv + buv;
 		    if (result >= buv) {
@@ -1627,7 +1631,7 @@ PP(pp_helem)
 		    STRLEN keylen;
 		    char *key = SvPV(keysv, keylen);
 		    save_delete(hv, key, keylen);
-		} else 
+		} else
 		    save_helem(hv, keysv, svp);
             }
 	}
@@ -1857,7 +1861,7 @@ PP(pp_subst)
     if (PL_tainted)
 	rxtainted |= 2;
     TAINT_NOT;
-    
+
   force_it:
     if (!pm || !s)
 	DIE(aTHX_ "panic: pp_subst");
diff --git a/t/lib/encode.t b/t/lib/encode.t
index 34527d7..890be55 100644
--- a/t/lib/encode.t
+++ b/t/lib/encode.t
@@ -16,7 +16,7 @@ my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z');
 my @source = qw(ascii iso8859-1 cp1250);
 my @destiny = qw(cp1047 cp37 posix-bc);
 my @ebcdic_sets = qw(cp1047 cp37 posix-bc);
-plan test => 13+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256;
+plan test => 21+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256;
 my $str = join('',map(chr($_),0x20..0x7E));
 my $cpy = $str;
 ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong");
@@ -47,7 +47,7 @@ foreach my $enc (qw(symbol dingbats ascii),@encodings)
 
 # On ASCII based machines see if we can map several codepoints from
 # three distinct ASCII sets to three distinct EBCDIC coded character sets.
-# On EBCDIC machines see if we can map from three EBCDIC sets to three 
+# On EBCDIC machines see if we can map from three EBCDIC sets to three
 # distinct ASCII sets.
 
 my @expectation = (240..249, 193..201,209..217,226..233, 129..137,145..153,162..169);
@@ -91,3 +91,12 @@ foreach my $enc_eb (@ebcdic_sets)
    }
  }
 
+for $i (256,128,129,256)
+ {
+  my $c = chr($i);
+  my $s = "$c\n".sprintf("%02X",$i);
+  ok(Encode::valid_utf8($s),1,"concat to utf8 botched");
+  Encode::utf8_upgrade($s);
+  ok(Encode::valid_utf8($s),1,"concat to utf8 botched");
+ }
+