From: Nick Ing-Simmons <nik@tiuk.ti.com>
Date: Sun, 18 Mar 2001 14:18:12 +0000 (+0000)
Subject: UTF-X encoding invariance for Encode:
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1b026014ba0f5424fabe070eda050db5e7df518a;p=p5sagit%2Fp5-mst-13.2.git

UTF-X encoding invariance for Encode:
 - move Encode::utf8_encode to utf8::encode (likewise decode,upgrade,downgrade,valid)
 - move the XS code for those to universal.c (so in miniperl)
 - add utf8::unicode_to_native and its inverse to allow EBCDIC to work in true unicode.
 - change ext/Encode/compile to use above.
 - Fix t/lib/encode.t for above
 - Teach t/lib/b.t to expect -uutf8
 - In utf8.c look for SWASHNEW rather than just utf8:: package to see if
   utf8.pm is needed.

p4raw-id: //depot/perlio@9198
---

diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index b5ba929..fd85520 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -188,14 +188,14 @@ sub from_to
 sub encode_utf8
 {
  my ($str) = @_;
- utf8_encode($str);
+ utf8::encode($str);
  return $str;
 }
 
 sub decode_utf8
 {
  my ($str) = @_;
- return undef unless utf8_decode($str);
+ return undef unless utf8::decode($str);
  return $str;
 }
 
@@ -226,14 +226,14 @@ package Encode::Unicode;
 use base 'Encode::Encoding';
 
 # Dummy package that provides the encode interface but leaves data
-# as UTF-8 encoded. It is here so that from_to() works.
+# as UTF-X encoded. It is here so that from_to() works.
 
 __PACKAGE__->Define('Unicode');
 
 sub decode
 {
  my ($obj,$str,$chk) = @_;
- Encode::utf8_upgrade($str);
+ utf8::upgrade($str);
  $_[1] = '' if $chk;
  return $str;
 }
@@ -717,17 +717,6 @@ As such they are efficient, but may change.
 
 =over 4
 
-=item *
-
-        $num_octets = utf8_upgrade($string);
-
-Converts internal representation of string to the UTF-8 form.
-Returns the number of octets necessary to represent the string as UTF-8.
-
-=item * utf8_downgrade($string[, CHECK])
-
-Converts internal representation of string to be un-encoded bytes.
-
 =item * is_utf8(STRING [, CHECK])
 
 [INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index cdb1965..4d62501 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -433,38 +433,6 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
  return dst;
 }
 
-MODULE = Encode		PACKAGE = Encode	PREFIX = sv_
-
-void
-valid_utf8(sv)
-SV *	sv
-CODE:
- {
-  STRLEN len;
-  char *s = SvPV(sv,len);
-  if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
-   XSRETURN_YES;
-  else
-   XSRETURN_NO;
- }
-
-void
-sv_utf8_encode(sv)
-SV *	sv
-
-bool
-sv_utf8_decode(sv)
-SV *	sv
-
-STRLEN
-sv_utf8_upgrade(sv)
-SV *	sv
-
-bool
-sv_utf8_downgrade(sv,failok=0)
-SV *	sv
-bool	failok
-
 MODULE = Encode		PACKAGE = Encode::XS	PREFIX = Method_
 
 PROTOTYPES: ENABLE
diff --git a/ext/Encode/compile b/ext/Encode/compile
index 8201043..d0611f7 100755
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -8,23 +8,16 @@ use Getopt::Std;
 my @orig_ARGV = @ARGV;
 my $perforce  = '$Id$';
 
-
 sub encode_U
 {
  # UTF-8 encode long hand - only covers part of perl's range
  my $uv = shift;
- if ($uv < 0x80)
-  {
-   return chr($uv)
-  }
- if ($uv < 0x800)
-  {
-   return chr(($uv >> 6)        | 0xC0).
-          chr(($uv & 0x3F)      | 0x80);
-  }
- return chr(($uv >> 12)         | 0xE0).
-        chr((($uv >> 6) & 0x3F) | 0x80).
-        chr(($uv & 0x3F)        | 0x80);
+ # chr() works in native space so convert value from table
+ # into that space before using chr().
+ my $ch = chr(utf8::unicode_to_native($uv));
+ # Now get core perl to encode that the way it likes.
+ utf8::encode($ch);
+ return $ch;
 }
 
 sub encode_S
diff --git a/lib/utf8.pm b/lib/utf8.pm
index f9055b5..7c9a7df 100644
--- a/lib/utf8.pm
+++ b/lib/utf8.pm
@@ -52,7 +52,7 @@ source text.  Until UTF-8 becomes the default format for source
 text, this pragma should be used to recognize UTF-8 in the source.
 When UTF-8 becomes the standard source format, this pragma will
 effectively become a no-op.  This pragma already is a no-op on
-EBCDIC platforms (where it is alright to code perl in EBCDIC 
+EBCDIC platforms (where it is alright to code perl in EBCDIC
 rather than UTF-8).
 
 Enabling the C<utf8> pragma has the following effects:
@@ -81,6 +81,32 @@ of byte semantics.
 
 =back
 
+=head2 Utility functions
+
+The following functions are defined in the C<utf8::> package by the perl core.
+
+=over 4
+
+=item * $num_octets = utf8::upgrade($string);
+
+Converts internal representation of string to the perls internal UTF-X form.
+Returns the number of octets necessary to represent the string as UTF-X.
+
+=item * utf8::downgrade($string[, CHECK])
+
+Converts internal representation of string to be un-encoded bytes.
+
+=item * utf8::encode($string)
+
+Converts (in-place) I<$string> from logical characters to octet sequence
+representing it in perl's UTF-X encoding.
+
+=item * $flag = utf8::decode($string)
+
+Attempts to converts I<$string> in-place from perl's UTF-X encoding into logical characters.
+
+=back
+
 =head1 SEE ALSO
 
 L<perlunicode>, L<bytes>
diff --git a/t/lib/b.t b/t/lib/b.t
index 397fdba..019a1e8 100755
--- a/t/lib/b.t
+++ b/t/lib/b.t
@@ -2,11 +2,11 @@
 
 BEGIN {
     chdir 't' if -d 't';
-    if ($^O eq 'MacOS') { 
-	@INC = qw(: ::lib ::macos:lib); 
-    } else { 
-	@INC = '.'; 
-	push @INC, '../lib'; 
+    if ($^O eq 'MacOS') {
+	@INC = qw(: ::lib ::macos:lib);
+    } else {
+	@INC = '.';
+	push @INC, '../lib';
     }
 }
 
@@ -141,7 +141,7 @@ $a =~ s/-u(Cwd|File|File::Copy|OS2),//g if $^O eq 'os2';
 $a =~ s/-uCwd,// if $^O eq 'cygwin';
 if ($Config{static_ext} eq ' ') {
   $b = '-uCarp,-uCarp::Heavy,-uDB,-uExporter,-uExporter::Heavy,-uattributes,'
-     . '-umain,-ustrict,-uwarnings';
+     . '-umain,-ustrict,-uutf8,-uwarnings';
   if (ord('A') == 193) { # EBCDIC sort order is qw(a A) not qw(A a)
       $b = join ',', sort split /,/, $b;
   }
@@ -157,7 +157,7 @@ if ($is_thread) {
     $a = `$^X $path "-MO=Showlex" -e "my %one" $redir`;
     if (ord('A') != 193) { # ASCIIish
         print "# [$a]\nnot " unless $a =~ /sv_undef.*PVNV.*%one.*sv_undef.*HV/s;
-    } 
+    }
     else { # EBCDICish C<1: PVNV (0x1a7ede34) "%\226\225\205">
         print "# [$a]\nnot " unless $a =~ /sv_undef.*PVNV.*%\\[0-9].*sv_undef.*HV/s;
     }
@@ -171,6 +171,6 @@ my $foo = $deparse->coderef2text(sub { { 234; }});
 print "not " unless $foo =~ /{.*{.*234;.*}.*}/sm;
 ok;
 $foo = $deparse->coderef2text(sub { { 234; } continue { 123; } });
-print "not " unless $foo =~ /{.*{.*234;.*}.*continue.*{.*123.*}/sm; 
+print "not " unless $foo =~ /{.*{.*234;.*}.*continue.*{.*123.*}/sm;
 ok;
 }
diff --git a/t/lib/encode.t b/t/lib/encode.t
index af1f34b..d4a13ee 100644
--- a/t/lib/encode.t
+++ b/t/lib/encode.t
@@ -104,9 +104,9 @@ for my $i (256,128,129,256)
  {
   my $c = chr($i);
   my $s = "$c\n".sprintf("%02X",$i);
-  ok(Encode::valid_utf8($s),1,"concat of $i botched");
-  Encode::utf8_upgrade($s);
-  ok(Encode::valid_utf8($s),1,"concat of $i botched");
+  ok(utf8::valid($s),1,"concat of $i botched");
+  utf8::upgrade($s);
+  ok(utf8::valid($s),1,"concat of $i botched");
  }
 
 # Spot check a few points in/out of utf8
diff --git a/universal.c b/universal.c
index 12d31e5..3e14a68 100644
--- a/universal.c
+++ b/universal.c
@@ -130,9 +130,18 @@ Perl_sv_derived_from(pTHX_ SV *sv, const char *name)
         : FALSE ;
 }
 
+#include "XSUB.h"
+
 void XS_UNIVERSAL_isa(pTHXo_ CV *cv);
 void XS_UNIVERSAL_can(pTHXo_ CV *cv);
 void XS_UNIVERSAL_VERSION(pTHXo_ CV *cv);
+XS(XS_utf8_valid);
+XS(XS_utf8_encode);
+XS(XS_utf8_decode);
+XS(XS_utf8_upgrade);
+XS(XS_utf8_downgrade);
+XS(XS_utf8_unicode_to_native);
+XS(XS_utf8_native_to_unicode);
 
 void
 Perl_boot_core_UNIVERSAL(pTHX)
@@ -142,9 +151,15 @@ Perl_boot_core_UNIVERSAL(pTHX)
     newXS("UNIVERSAL::isa",             XS_UNIVERSAL_isa,         file);
     newXS("UNIVERSAL::can",             XS_UNIVERSAL_can,         file);
     newXS("UNIVERSAL::VERSION", 	XS_UNIVERSAL_VERSION, 	  file);
+    newXS("utf8::valid", XS_utf8_valid, file);
+    newXS("utf8::encode", XS_utf8_encode, file);
+    newXS("utf8::decode", XS_utf8_decode, file);
+    newXS("utf8::upgrade", XS_utf8_upgrade, file);
+    newXS("utf8::downgrade", XS_utf8_downgrade, file);
+    newXS("utf8::native_to_unicode", XS_utf8_native_to_unicode, file);
+    newXS("utf8::unicode_to_native", XS_utf8_unicode_to_native, file);
 }
 
-#include "XSUB.h"
 
 XS(XS_UNIVERSAL_isa)
 {
@@ -299,3 +314,107 @@ finish:
     XSRETURN(1);
 }
 
+XS(XS_utf8_valid)
+{
+    dXSARGS;
+    if (items != 1)
+	Perl_croak(aTHX_ "Usage: utf8::valid(sv)");
+    {
+	SV *	sv = ST(0);
+ {
+  STRLEN len;
+  char *s = SvPV(sv,len);
+  if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
+   XSRETURN_YES;
+  else
+   XSRETURN_NO;
+ }
+    }
+    XSRETURN_EMPTY;
+}
+
+XS(XS_utf8_encode)
+{
+    dXSARGS;
+    if (items != 1)
+	Perl_croak(aTHX_ "Usage: utf8::encode(sv)");
+    {
+	SV *	sv = ST(0);
+
+	sv_utf8_encode(sv);
+    }
+    XSRETURN_EMPTY;
+}
+
+XS(XS_utf8_decode)
+{
+    dXSARGS;
+    if (items != 1)
+	Perl_croak(aTHX_ "Usage: utf8::decode(sv)");
+    {
+	SV *	sv = ST(0);
+	bool	RETVAL;
+
+	RETVAL = sv_utf8_decode(sv);
+	ST(0) = boolSV(RETVAL);
+	sv_2mortal(ST(0));
+    }
+    XSRETURN(1);
+}
+
+XS(XS_utf8_upgrade)
+{
+    dXSARGS;
+    if (items != 1)
+	Perl_croak(aTHX_ "Usage: utf8::upgrade(sv)");
+    {
+	SV *	sv = ST(0);
+	STRLEN	RETVAL;
+	dXSTARG;
+
+	RETVAL = sv_utf8_upgrade(sv);
+	XSprePUSH; PUSHi((IV)RETVAL);
+    }
+    XSRETURN(1);
+}
+
+XS(XS_utf8_downgrade)
+{
+    dXSARGS;
+    if (items < 1 || items > 2)
+	Perl_croak(aTHX_ "Usage: utf8::downgrade(sv, failok=0)");
+    {
+	SV *	sv = ST(0);
+	bool	failok;
+	bool	RETVAL;
+
+	if (items < 2)
+	    failok = 0;
+	else {
+	    failok = (int)SvIV(ST(1));
+	}
+
+	RETVAL = sv_utf8_downgrade(sv, failok);
+	ST(0) = boolSV(RETVAL);
+	sv_2mortal(ST(0));
+    }
+    XSRETURN(1);
+}
+
+XS(XS_utf8_native_to_unicode)
+{
+ dXSARGS;
+ UV uv = SvUV(ST(0));
+ ST(0) = sv_2mortal(newSViv(NATIVE_TO_UNI(uv)));
+ XSRETURN(1);
+}
+
+XS(XS_utf8_unicode_to_native)
+{
+ dXSARGS;
+ UV uv = SvUV(ST(0));
+ ST(0) = sv_2mortal(newSViv(UNI_TO_NATIVE(uv)));
+ XSRETURN(1);
+}
+
+
diff --git a/utf8.c b/utf8.c
index 7ca3cc7..81fb44d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1282,8 +1282,9 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
     SV* retval;
     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
     dSP;
+    HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
 
-    if (!gv_stashpv(pkg, 0)) {	/* demand load utf8 */
+    if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
 	ENTER;
 	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
 	LEAVE;