use strict;
use warnings;
+no warnings 'redefine';
our $VERSION = do { my @r = (q$Revision: 1.39 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
require Encode;
+our %BOM_Unknown = map {$_ => 1} qw(UTF-16 UTF-32);
+
for my $name (qw(UTF-16 UTF-16BE UTF-16LE
UTF-32 UTF-32BE UTF-32LE
UCS-2BE UCS-2LE))
endian => $endian,
ucs2 => $ucs2,
} => __PACKAGE__;
-
}
use base qw(Encode::Encoding);
-#
-# three implementations of (en|de)code exist. The XS version is the
-# fastest. *_modern uses an array and *_classic sticks with substr.
-# *_classic is much slower but more memory conservative.
-# *_xs is the default.
-
-sub set_transcoder{
- no warnings qw(redefine);
- my $type = shift;
- if ($type eq "xs"){
- *decode = \&decode_xs;
- *encode = \&encode_xs;
- }elsif($type eq "modern"){
- *decode = \&decode_modern;
- *encode = \&encode_modern;
- }elsif($type eq "classic"){
- *decode = \&decode_classic;
- *encode = \&encode_classic;
- }else{
- require Carp;
- Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
- }
-}
-
-set_transcoder("xs");
-
-#
-# Aux. subs & constants
-#
-
-sub FBCHAR(){ 0xFFFd }
-sub BOM_BE(){ 0xFeFF }
-sub BOM16LE(){ 0xFFFe }
-sub BOM32LE(){ 0xFFFe0000 }
-
-sub valid_ucs2($){
- return
- (0 <= $_[0] && $_[0] < 0xD800)
- || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
-}
-
-sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
-sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
-sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
-
-sub ensurrogate($){
- use integer; # we have divisions
- my $uni = shift;
- my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
- my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
- return ($hi, $lo);
-}
-
-sub desurrogate($$){
- my ($hi, $lo) = @_;
- return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
-}
-
-sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
-
-#
-# *_modern are much faster but guzzle more memory
-#
-
-sub decode_modern($$;$)
-{
- my ($obj, $str, $chk ) = @_;
- my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
-
- # warn "$size, $endian, $ucs2";
- $endian ||= BOMB($size, substr($str, 0, $size, ''))
- or poisoned2death($obj, "Where's the BOM?");
- my $mask = Mask->{$size};
- my $utf8 = '';
- my @ord = unpack("$endian*", $str);
- undef $str; # to conserve memory
- while (@ord){
- my $ord = shift @ord;
- unless ($size == 4 or valid_ucs2($ord &= $mask)){
- if ($ucs2){
- $chk and
- poisoned2death($obj, "no surrogates allowed", $ord);
- shift @ord; # skip the next one as well
- $ord = FBCHAR;
- }else{
- unless (isHiSurrogate($ord)){
- poisoned2death($obj, "Malformed HI surrogate", $ord);
- }
- my $lo = shift @ord;
- unless (isLoSurrogate($lo &= $mask)){
- poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
- }
- $ord = desurrogate($ord, $lo);
- }
- }
- $utf8 .= chr($ord);
- }
- utf8::upgrade($utf8);
- return $utf8;
-}
-
-sub encode_modern($$;$)
-{
- my ($obj, $utf8, $chk) = @_;
- my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
- my @str = ();
- unless ($endian){
- $endian = ($size == 4) ? 'N' : 'n';
- push @str, BOM_BE;
- }
- my @ord = unpack("U*", $utf8);
- undef $utf8; # to conserve memory
- for my $ord (@ord){
- unless ($size == 4 or valid_ucs2($ord)) {
- unless(issurrogate($ord)){
- if ($ucs2){
- $chk and
- poisoned2death($obj, "code point too high", $ord);
-
- push @str, FBCHAR;
- }else{
-
- push @str, ensurrogate($ord);
- }
- }else{ # not supposed to happen
- push @str, FBCHAR;
- }
- }else{
- push @str, $ord;
- }
- }
- return pack("$endian*", @str);
-}
-
-#
-# *_classic are slower but more memory conservative
-#
-
-sub decode_classic($$;$)
-{
- my ($obj, $str, $chk ) = @_;
- my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
-
- # warn "$size, $endian, $ucs2";
- $endian ||= BOMB($size, substr($str, 0, $size, ''))
- or poisoned2death($obj, "Where's the BOM?");
- my $mask = Mask->{$size};
- my $utf8 = '';
- my @ord = unpack("$endian*", $str);
- while (length($str)){
- my $ord = unpack($endian, substr($str, 0, $size, ''));
- unless ($size == 4 or valid_ucs2($ord &= $mask)){
- if ($ucs2){
- $chk and
- poisoned2death($obj, "no surrogates allowed", $ord);
- substr($str,0,$size,''); # skip the next one as well
- $ord = FBCHAR;
- }else{
- unless (isHiSurrogate($ord)){
- poisoned2death($obj, "Malformed HI surrogate", $ord);
- }
- my $lo = unpack($endian ,substr($str,0,$size,''));
- unless (isLoSurrogate($lo &= $mask)){
- poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
- }
- $ord = desurrogate($ord, $lo);
- }
- }
- $utf8 .= chr($ord);
- }
- utf8::upgrade($utf8);
- return $utf8;
+sub renew {
+ my $self = shift;
+ $BOM_Unknown{$self->name} or return $self;
+ my $clone = bless { %$self } => ref($self);
+ $clone->{clone} = 1; # so the caller knows it is renewed.
+ return $clone;
}
-sub encode_classic($$;$)
-{
- my ($obj, $utf8, $chk) = @_;
- my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
- # warn join ", ", $size, $ucs2, $endian, $mask;
- my $str = '';
- unless ($endian){
- $endian = ($size == 4) ? 'N' : 'n';
- $str .= pack($endian, BOM_BE);
- }
- while (length($utf8)){
- my $ord = ord(substr($utf8,0,1,''));
- unless ($size == 4 or valid_ucs2($ord)) {
- unless(issurrogate($ord)){
- if ($ucs2){
- $chk and
- poisoned2death($obj, "code point too high", $ord);
- $str .= pack($endian, FBCHAR);
- }else{
- $str .= pack($endian.2, ensurrogate($ord));
- }
- }else{ # not supposed to happen
- $str .= pack($endian, FBCHAR);
- }
- }else{
- $str .= pack($endian, $ord);
- }
- }
- return $str;
-}
+# There used to be a perl implemntation of (en|de)code but with
+# XS version is ripe, perl version is zapped for optimal speed
-sub BOMB {
- my ($size, $bom) = @_;
- my $N = $size == 2 ? 'n' : 'N';
- my $ord = unpack($N, $bom);
- return ($ord eq BOM_BE) ? $N :
- ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
-}
-
-sub poisoned2death{
- my $obj = shift;
- my $msg = shift;
- my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
- require Carp;
- Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
-}
+*decode = \&decode_xs;
+*encode = \&encode_xs;
1;
__END__
/*
- $Id: Unicode.xs,v 1.7 2003/02/20 14:42:34 dankogai Exp $
+ $Id: Unicode.xs,v 1.7 2003/02/20 14:42:34 dankogai Exp dankogai $
*/
#define PERL_NO_GET_CONTEXT
PROTOTYPES: DISABLE
+#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
+ *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
+
void
decode_xs(obj, str, check = 0)
SV * obj
IV check
CODE:
{
- int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
- U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
- int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
- SV *result = newSVpvn("",0);
+ U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
+ int size = SvIV(attr("size", 4));
+ int ucs2 = SvTRUE(attr("ucs2", 4));
+ int clone = SvTRUE(attr("clone", 5));
+ SV *result = newSVpvn("",0);
STRLEN ulen;
U8 *s = (U8 *)SvPVbyte(str,ulen);
U8 *e = (U8 *)SvEND(str);
bom);
}
}
-#if 0
- /* Update endian for this sequence */
- hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#if 1
+ /* Update endian for next sequence */
+ if (clone) {
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+ }
#endif
}
while (s < e && s+size <= e) {
IV check
CODE:
{
- int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
- U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
- int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
- SV *result = newSVpvn("",0);
+ U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
+ int size = SvIV(attr("size", 4));
+ int ucs2 = SvTRUE(attr("ucs2", 4));
+ int clone = SvTRUE(attr("clone", 5));
+ SV *result = newSVpvn("",0);
STRLEN ulen;
U8 *s = (U8 *)SvPVutf8(utf8,ulen);
U8 *e = (U8 *)SvEND(utf8);
if (!endian) {
endian = (size == 4) ? 'N' : 'n';
enc_pack(aTHX_ result,size,endian,BOM_BE);
-#if 0
- /* Update endian for this sequence */
- hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#if 1
+ /* Update endian for next sequence */
+ if (clone){
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+ }
#endif
}
while (s < e && s+UTF8SKIP(s) <= e) {