From: Nick Ing-Simmons <nik@tiuk.ti.com>
Date: Sun, 14 Apr 2002 14:47:18 +0000 (+0000)
Subject: XS versions of encode/decode for Encode::Unicode
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d5c9ee33b57c6c516247cc68878006fd91166a49;p=p5sagit%2Fp5-mst-13.2.git

XS versions of encode/decode for Encode::Unicode

p4raw-id: //depot/perlio@15912
---

diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index 470f14e..229359e 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -6,6 +6,79 @@
 #include "encode.h"
 #include "def_t.h"
 
+#define FBCHAR			0xFFFd
+#define BOM_BE			0xFeFF
+#define BOM16LE			0xFFFe
+#define BOM32LE			0xFFFe0000
+
+#define valid_ucs2(x)		((0 <= (x) && (x) < 0xD800) || (0xDFFF < (x) && (x) <= 0xFFFF))
+
+#define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
+#define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
+#define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
+
+static UV
+enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
+{
+    U8 *s = *sp;
+    UV v = 0;
+    if (s+size > e) {
+	croak("Partial character %c",(char) endian);
+    }
+    switch(endian) {
+	case 'N':
+	    v = *s++;
+	    v = (v << 8) | *s++;
+	case 'n':
+	    v = (v << 8) | *s++;
+	    v = (v << 8) | *s++;
+	    break;
+	case 'V':
+	case 'v':
+	    v |= *s++;
+	    v |= (*s++ << 8);
+	    if (endian == 'v')
+		break;
+	    v |= (*s++ << 16);
+	    v |= (*s++ << 24);
+	    break;
+	default:
+	    croak("Unknown endian %c",(char) endian);
+	    break;
+    }
+    *sp = s;
+    return v;
+}
+
+void
+enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
+{
+    U8 *d = SvGROW(result,SvCUR(result)+size);
+    switch(endian) {
+	case 'v':
+	case 'V':
+	    d += SvCUR(result);
+	    SvCUR_set(result,SvCUR(result)+size);
+	    while (size--) {
+		*d++ = value & 0xFF;
+		value >>= 8;
+	    }
+	    break;
+	case 'n':
+	case 'N':
+	    SvCUR_set(result,SvCUR(result)+size);
+	    d += SvCUR(result);
+	    while (size--) {
+		*--d = value & 0xFF;
+		value >>= 8;
+	    }
+	    break;
+	default:
+	    croak("Unknown endian %c",(char) endian);
+	    break;
+    }
+}
+
 #define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
 			       t/encoding.t dumps core because of
 			       Perl_warner and PerlIO don't work well */
@@ -674,6 +747,164 @@ CODE:
   XSRETURN(1);
  }
 
+MODULE = Encode		PACKAGE = Encode::Unicode
+
+void
+decode_xs(obj, str, chk = &PL_sv_undef)
+SV *	obj
+SV *	str
+SV *	chk
+CODE:
+{
+    int size    = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
+    U8 endian   = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
+    int ucs2    = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
+    SV *result = newSVpvn("",0);
+    STRLEN ulen;
+    U8 *s = SvPVbyte(str,ulen);
+    U8 *e = SvEND(str);
+    ST(0) = sv_2mortal(result);
+    SvUTF8_on(result);
+
+    if (!endian && s+size <= e) {
+	UV bom;
+	endian = (size == 4) ? 'N' : 'n';
+	bom = enc_unpack(aTHX_ &s,e,size,endian);
+        if (bom != BOM_BE) {
+	    if (bom == BOM16LE) {
+		endian = 'v';
+	    }
+	    else if (bom == BOM32LE) {
+		endian = 'V';
+	    }
+	    else {
+		croak("%s:Unregognised BOM %"UVxf,
+                      SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom);
+	    }
+	}
+#if 0
+	/* Update endian for this sequence */
+	hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#endif
+    }
+    while (s < e && s+size <= e) {
+	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
+	U8 *d;
+	if (size != 4 && !valid_ucs2(ord)) {
+	    if (ucs2) {
+		if (SvTRUE(chk)) {
+		    croak("%s:no surrogates allowed %"UVxf,
+			  SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+		}
+		if (s+size <= e) {
+		     enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */
+		}
+		ord = FBCHAR;
+	    }
+	    else {
+		UV lo;
+		if (!isHiSurrogate(ord)) {
+		    croak("%s:Malformed HI surrogate %"UVxf,
+			  SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+		}
+		if (s+size > e) {
+		    /* Partial character */
+		    s -= size;   /* back up to 1st half */
+		    break;       /* And exit loop */
+		}
+		lo = enc_unpack(aTHX_ &s,e,size,endian);
+		if (!isLoSurrogate(lo)){
+		    croak("%s:Malformed LO surrogate %"UVxf,
+			  SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+		}
+		ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
+	    }
+	}
+	d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
+	d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
+	SvCUR_set(result,d - (U8 *)SvPVX(result));
+    }
+    if (SvTRUE(chk)) {
+	if (s < e) {
+	     Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
+			  SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
+	     Move(s,SvPVX(str),e-s,U8);
+	     SvCUR_set(str,(e-s));
+	}
+	else {
+	    SvCUR_set(str,0);
+	}
+	*SvEND(str) = '\0';
+    }
+    XSRETURN(1);
+}
+
+void
+encode_xs(obj, utf8, chk = &PL_sv_undef)
+SV *	obj
+SV *	utf8
+SV *	chk
+CODE:
+{
+    int size   = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
+    U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
+    int ucs2   = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
+    SV *result = newSVpvn("",0);
+    STRLEN ulen;
+    U8 *s = SvPVutf8(utf8,ulen);
+    U8 *e = SvEND(utf8);
+    ST(0) = sv_2mortal(result);
+    if (!endian) {
+	endian = (size == 4) ? 'N' : 'n';
+	enc_pack(aTHX_ result,size,endian,BOM_BE);
+#if 0
+	/* Update endian for this sequence */
+	hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#endif
+    }
+    while (s < e && s+UTF8SKIP(s) <= e) {
+	STRLEN len;
+	UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
+        s += len;
+	if (size != 4 && !valid_ucs2(ord)) {
+	    if (!issurrogate(ord)){
+		if (ucs2) {
+		    if (SvTRUE(chk)) {
+			croak("%s:code point \"\\x{"UVxf"}\" too high",
+			  SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
+		    }
+		    enc_pack(aTHX_ result,size,endian,FBCHAR);
+		}else{
+		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
+		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
+		    enc_pack(aTHX_ result,size,endian,hi);
+		    enc_pack(aTHX_ result,size,endian,lo);
+		}
+	    }
+	    else {
+		/* not supposed to happen */
+		enc_pack(aTHX_ result,size,endian,FBCHAR);
+	    }
+	}
+	else {
+	    enc_pack(aTHX_ result,size,endian,ord);
+	}
+    }
+    if (SvTRUE(chk)) {
+	if (s < e) {
+	     Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
+			  SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
+	     Move(s,SvPVX(utf8),e-s,U8);
+	     SvCUR_set(utf8,(e-s));
+	}
+	else {
+	    SvCUR_set(utf8,0);
+	}
+	*SvEND(utf8) = '\0';
+    }
+    XSRETURN(1);
+}
+
 MODULE = Encode         PACKAGE = Encode
 
 PROTOTYPES: ENABLE
diff --git a/ext/Encode/lib/Encode/Unicode.pm b/ext/Encode/lib/Encode/Unicode.pm
index 2a05ef0..1168e35 100644
--- a/ext/Encode/lib/Encode/Unicode.pm
+++ b/ext/Encode/lib/Encode/Unicode.pm
@@ -15,8 +15,8 @@ sub BOM16LE(){ 0xFFFe }
 sub BOM32LE(){ 0xFFFe0000 }
 
 sub valid_ucs2($){
-    return 
-	(0 <= $_[0] && $_[0] < 0xD800) 
+    return
+	(0 <= $_[0] && $_[0] < 0xD800)
 	    || 	( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
 }
 
@@ -69,12 +69,22 @@ for my $name (qw(UTF-16 UTF-16BE UTF-16LE
 }
 
 sub name { shift->{'Name'} }
-sub new_sequence { $_[0] };
+
+sub new_sequence
+{
+ my $self = shift;
+ # Return the original if endian known
+ return $self if ($self->{endian});
+ # Return a clone
+ return bless {%$self},ref($self);
+}
 
 #
-# two implementation of (en|de)code exist.  *_modern use
+# Three implementation of (en|de)code exist.  *_modern use
 # an array and *_classic stick with substr.  *_classic is much
-# slower but more memory conservative.  *_modern is default.
+# slower but more memory conservative.
+# *_xs is C code in Encode.xs
+# *_xs is the default.
 
 sub set_transcoder{
     no warnings qw(redefine);
@@ -85,13 +95,16 @@ sub set_transcoder{
     }elsif($type eq "classic"){
 	*decode = \&decode_classic;
 	*encode = \&encode_classic;
+    }elsif($type eq "xs"){
+	*decode = \&decode_xs;
+	*encode = \&encode_xs;
     }else{
-	require Carp; 
-	Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)";
+	require Carp;
+	Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
     }
 }
 
-set_transcoder("modern");
+set_transcoder("xs");
 
 #
 # *_modern are much faster but guzzle more memory
@@ -113,7 +126,7 @@ sub decode_modern
 	my $ord = shift @ord;
 	unless ($size == 4 or valid_ucs2($ord &= $mask)){
 	    if ($ucs2){
-		$chk and 
+		$chk and
 		    poisoned2death($obj, "no surrogates allowed", $ord);
 		shift @ord; # skip the next one as well
 		$ord = FBCHAR;
@@ -149,12 +162,12 @@ sub encode_modern
 	unless ($size == 4 or valid_ucs2($ord)) {
 	    unless(issurrogate($ord)){
 		if ($ucs2){
-		    $chk and 
+		    $chk and
 			poisoned2death($obj, "code point too high", $ord);
 
 		    push @str, FBCHAR;
 		}else{
-		 
+		
 		    push @str, ensurrogate($ord);
 		}
 	    }else{  # not supposed to happen
@@ -186,7 +199,7 @@ sub decode_classic
 	 my $ord = unpack($endian, substr($str, 0, $size, ''));
 	unless ($size == 4 or valid_ucs2($ord &= $mask)){
 	    if ($ucs2){
-		$chk and 
+		$chk and
 		    poisoned2death($obj, "no surrogates allowed", $ord);
 		substr($str,0,$size,''); # skip the next one as well
 		$ord = FBCHAR;
@@ -222,7 +235,7 @@ sub encode_classic
 	unless ($size == 4 or valid_ucs2($ord)) {
 	    unless(issurrogate($ord)){
 		if ($ucs2){
-		    $chk and 
+		    $chk and
 			poisoned2death($obj, "code point too high", $ord);
 		    $str .= pack($endian, FBCHAR);
 		}else{
@@ -242,7 +255,7 @@ sub BOMB {
     my ($size, $bom) = @_;
     my $N = $size == 2 ? 'n' : 'N';
     my $ord = unpack($N, $bom);
-    return ($ord eq BOM_BE) ? $N : 
+    return ($ord eq BOM_BE) ? $N :
 	($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
 }
 
@@ -265,7 +278,7 @@ Encode::Unicode -- Various Unicode Transform Format
 
 =head1 SYNOPSIS
 
-    use Encode qw/encode decode/; 
+    use Encode qw/encode decode/;
     $ucs2 = encode("UCS-2BE", $utf8);
     $utf8 = decode("UCS-2BE", $ucs2);
 
@@ -347,7 +360,7 @@ called Byte Order Mark (BOM) is prepended to the head of string.
   -------------------------
 
 =back
- 
+
 This modules handles BOM as follows.
 
 =over 4
@@ -361,7 +374,7 @@ simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
 
 When BE or LE is omitted during decode(), it checks if BOM is in the
 beginning of the string and if found endianness is set to what BOM
-says.  If not found, dies. 
+says.  If not found, dies.
 
 =item *
 
@@ -414,7 +427,7 @@ And to desurrogate;
  $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
 
 Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
-perl does not prohibit the use of characters within this range.  To perl, 
+perl does not prohibit the use of characters within this range.  To perl,
 every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
 
   (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
@@ -431,7 +444,7 @@ RFC 2781 L<http://rfc.net/rfc2781.html>,
 L<http://www.unicode.org/unicode/faq/utf_bom.html>
 
 Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
-by Larry Wall, Tom Christiansen, Jon Orwant; 
+by Larry Wall, Tom Christiansen, Jon Orwant;
 O'Reilly & Associates; ISBN 0-596-00027-8
 
 =cut