From: Nick Ing-Simmons Date: Sun, 14 Apr 2002 14:47:18 +0000 (+0000) Subject: XS versions of encode/decode for Encode::Unicode X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d5c9ee33b57c6c516247cc68878006fd91166a49;p=p5sagit%2Fp5-mst-13.2.git XS versions of encode/decode for Encode::Unicode p4raw-id: //depot/perlio@15912 --- diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 470f14e..229359e 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -6,6 +6,79 @@ #include "encode.h" #include "def_t.h" +#define FBCHAR 0xFFFd +#define BOM_BE 0xFeFF +#define BOM16LE 0xFFFe +#define BOM32LE 0xFFFe0000 + +#define valid_ucs2(x) ((0 <= (x) && (x) < 0xD800) || (0xDFFF < (x) && (x) <= 0xFFFF)) + +#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) +#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) +#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) + +static UV +enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) +{ + U8 *s = *sp; + UV v = 0; + if (s+size > e) { + croak("Partial character %c",(char) endian); + } + switch(endian) { + case 'N': + v = *s++; + v = (v << 8) | *s++; + case 'n': + v = (v << 8) | *s++; + v = (v << 8) | *s++; + break; + case 'V': + case 'v': + v |= *s++; + v |= (*s++ << 8); + if (endian == 'v') + break; + v |= (*s++ << 16); + v |= (*s++ << 24); + break; + default: + croak("Unknown endian %c",(char) endian); + break; + } + *sp = s; + return v; +} + +void +enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) +{ + U8 *d = SvGROW(result,SvCUR(result)+size); + switch(endian) { + case 'v': + case 'V': + d += SvCUR(result); + SvCUR_set(result,SvCUR(result)+size); + while (size--) { + *d++ = value & 0xFF; + value >>= 8; + } + break; + case 'n': + case 'N': + SvCUR_set(result,SvCUR(result)+size); + d += SvCUR(result); + while (size--) { + *--d = value & 0xFF; + value >>= 8; + } + break; + default: + croak("Unknown endian %c",(char) endian); + break; + } +} + #define ENCODE_XS_PROFILE 0 /* set 1 or more to profile. t/encoding.t dumps core because of Perl_warner and PerlIO don't work well */ @@ -674,6 +747,164 @@ CODE: XSRETURN(1); } +MODULE = Encode PACKAGE = Encode::Unicode + +void +decode_xs(obj, str, chk = &PL_sv_undef) +SV * obj +SV * str +SV * chk +CODE: +{ + int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); + U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); + int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); + SV *result = newSVpvn("",0); + STRLEN ulen; + U8 *s = SvPVbyte(str,ulen); + U8 *e = SvEND(str); + ST(0) = sv_2mortal(result); + SvUTF8_on(result); + + if (!endian && s+size <= e) { + UV bom; + endian = (size == 4) ? 'N' : 'n'; + bom = enc_unpack(aTHX_ &s,e,size,endian); + if (bom != BOM_BE) { + if (bom == BOM16LE) { + endian = 'v'; + } + else if (bom == BOM32LE) { + endian = 'V'; + } + else { + croak("%s:Unregognised BOM %"UVxf, + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom); + } + } +#if 0 + /* Update endian for this sequence */ + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); +#endif + } + while (s < e && s+size <= e) { + UV ord = enc_unpack(aTHX_ &s,e,size,endian); + U8 *d; + if (size != 4 && !valid_ucs2(ord)) { + if (ucs2) { + if (SvTRUE(chk)) { + croak("%s:no surrogates allowed %"UVxf, + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); + } + if (s+size <= e) { + enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */ + } + ord = FBCHAR; + } + else { + UV lo; + if (!isHiSurrogate(ord)) { + croak("%s:Malformed HI surrogate %"UVxf, + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); + } + if (s+size > e) { + /* Partial character */ + s -= size; /* back up to 1st half */ + break; /* And exit loop */ + } + lo = enc_unpack(aTHX_ &s,e,size,endian); + if (!isLoSurrogate(lo)){ + croak("%s:Malformed LO surrogate %"UVxf, + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); + } + ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); + } + } + d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); + d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); + SvCUR_set(result,d - (U8 *)SvPVX(result)); + } + if (SvTRUE(chk)) { + if (s < e) { + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); + Move(s,SvPVX(str),e-s,U8); + SvCUR_set(str,(e-s)); + } + else { + SvCUR_set(str,0); + } + *SvEND(str) = '\0'; + } + XSRETURN(1); +} + +void +encode_xs(obj, utf8, chk = &PL_sv_undef) +SV * obj +SV * utf8 +SV * chk +CODE: +{ + int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); + U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); + int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); + SV *result = newSVpvn("",0); + STRLEN ulen; + U8 *s = SvPVutf8(utf8,ulen); + U8 *e = SvEND(utf8); + ST(0) = sv_2mortal(result); + if (!endian) { + endian = (size == 4) ? 'N' : 'n'; + enc_pack(aTHX_ result,size,endian,BOM_BE); +#if 0 + /* Update endian for this sequence */ + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); +#endif + } + while (s < e && s+UTF8SKIP(s) <= e) { + STRLEN len; + UV ord = utf8n_to_uvuni(s, e-s, &len, 0); + s += len; + if (size != 4 && !valid_ucs2(ord)) { + if (!issurrogate(ord)){ + if (ucs2) { + if (SvTRUE(chk)) { + croak("%s:code point \"\\x{"UVxf"}\" too high", + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); + } + enc_pack(aTHX_ result,size,endian,FBCHAR); + }else{ + UV hi = ((ord - 0x10000) >> 10) + 0xD800; + UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; + enc_pack(aTHX_ result,size,endian,hi); + enc_pack(aTHX_ result,size,endian,lo); + } + } + else { + /* not supposed to happen */ + enc_pack(aTHX_ result,size,endian,FBCHAR); + } + } + else { + enc_pack(aTHX_ result,size,endian,ord); + } + } + if (SvTRUE(chk)) { + if (s < e) { + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); + Move(s,SvPVX(utf8),e-s,U8); + SvCUR_set(utf8,(e-s)); + } + else { + SvCUR_set(utf8,0); + } + *SvEND(utf8) = '\0'; + } + XSRETURN(1); +} + MODULE = Encode PACKAGE = Encode PROTOTYPES: ENABLE diff --git a/ext/Encode/lib/Encode/Unicode.pm b/ext/Encode/lib/Encode/Unicode.pm index 2a05ef0..1168e35 100644 --- a/ext/Encode/lib/Encode/Unicode.pm +++ b/ext/Encode/lib/Encode/Unicode.pm @@ -15,8 +15,8 @@ sub BOM16LE(){ 0xFFFe } sub BOM32LE(){ 0xFFFe0000 } sub valid_ucs2($){ - return - (0 <= $_[0] && $_[0] < 0xD800) + return + (0 <= $_[0] && $_[0] < 0xD800) || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF); } @@ -69,12 +69,22 @@ for my $name (qw(UTF-16 UTF-16BE UTF-16LE } sub name { shift->{'Name'} } -sub new_sequence { $_[0] }; + +sub new_sequence +{ + my $self = shift; + # Return the original if endian known + return $self if ($self->{endian}); + # Return a clone + return bless {%$self},ref($self); +} # -# two implementation of (en|de)code exist. *_modern use +# Three implementation of (en|de)code exist. *_modern use # an array and *_classic stick with substr. *_classic is much -# slower but more memory conservative. *_modern is default. +# slower but more memory conservative. +# *_xs is C code in Encode.xs +# *_xs is the default. sub set_transcoder{ no warnings qw(redefine); @@ -85,13 +95,16 @@ sub set_transcoder{ }elsif($type eq "classic"){ *decode = \&decode_classic; *encode = \&encode_classic; + }elsif($type eq "xs"){ + *decode = \&decode_xs; + *encode = \&encode_xs; }else{ - require Carp; - Carp::croak __PACKAGE__, "::set_transcoder(modern|classic)"; + require Carp; + Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)"; } } -set_transcoder("modern"); +set_transcoder("xs"); # # *_modern are much faster but guzzle more memory @@ -113,7 +126,7 @@ sub decode_modern my $ord = shift @ord; unless ($size == 4 or valid_ucs2($ord &= $mask)){ if ($ucs2){ - $chk and + $chk and poisoned2death($obj, "no surrogates allowed", $ord); shift @ord; # skip the next one as well $ord = FBCHAR; @@ -149,12 +162,12 @@ sub encode_modern unless ($size == 4 or valid_ucs2($ord)) { unless(issurrogate($ord)){ if ($ucs2){ - $chk and + $chk and poisoned2death($obj, "code point too high", $ord); push @str, FBCHAR; }else{ - + push @str, ensurrogate($ord); } }else{ # not supposed to happen @@ -186,7 +199,7 @@ sub decode_classic my $ord = unpack($endian, substr($str, 0, $size, '')); unless ($size == 4 or valid_ucs2($ord &= $mask)){ if ($ucs2){ - $chk and + $chk and poisoned2death($obj, "no surrogates allowed", $ord); substr($str,0,$size,''); # skip the next one as well $ord = FBCHAR; @@ -222,7 +235,7 @@ sub encode_classic unless ($size == 4 or valid_ucs2($ord)) { unless(issurrogate($ord)){ if ($ucs2){ - $chk and + $chk and poisoned2death($obj, "code point too high", $ord); $str .= pack($endian, FBCHAR); }else{ @@ -242,7 +255,7 @@ sub BOMB { my ($size, $bom) = @_; my $N = $size == 2 ? 'n' : 'N'; my $ord = unpack($N, $bom); - return ($ord eq BOM_BE) ? $N : + return ($ord eq BOM_BE) ? $N : ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef; } @@ -265,7 +278,7 @@ Encode::Unicode -- Various Unicode Transform Format =head1 SYNOPSIS - use Encode qw/encode decode/; + use Encode qw/encode decode/; $ucs2 = encode("UCS-2BE", $utf8); $utf8 = decode("UCS-2BE", $ucs2); @@ -347,7 +360,7 @@ called Byte Order Mark (BOM) is prepended to the head of string. ------------------------- =back - + This modules handles BOM as follows. =over 4 @@ -361,7 +374,7 @@ simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE). When BE or LE is omitted during decode(), it checks if BOM is in the beginning of the string and if found endianness is set to what BOM -says. If not found, dies. +says. If not found, dies. =item * @@ -414,7 +427,7 @@ And to desurrogate; $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but -perl does not prohibit the use of characters within this range. To perl, +perl does not prohibit the use of characters within this range. To perl, every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I. (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit @@ -431,7 +444,7 @@ RFC 2781 L, L Ch. 15, pp. 403 of C -by Larry Wall, Tom Christiansen, Jon Orwant; +by Larry Wall, Tom Christiansen, Jon Orwant; O'Reilly & Associates; ISBN 0-596-00027-8 =cut