From: Dan Kogai Date: Fri, 23 May 2003 20:17:16 +0000 (+0900) Subject: Stateful PerlIO implemented [Was: [perl #22261] Was: Unrecognised BOM...] X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a0d8a30e4d6cdcb83a19818856ccc52190cdd95f;p=p5sagit%2Fp5-mst-13.2.git Stateful PerlIO implemented [Was: [perl #22261] Was: Unrecognised BOM...] Message-Id: <1C123D88-8D10-11D7-B277-000393AE4244@dan.co.jp> p4raw-id: //depot/perl@19593 --- diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 37b350f..4959b5f 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -15,7 +15,7 @@ use base qw/Exporter/; our @EXPORT = qw( decode decode_utf8 encode encode_utf8 - encodings find_encoding + encodings find_encoding clone_encoding ); our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC @@ -95,7 +95,7 @@ sub getEncoding { my ($class, $name, $skip_external) = @_; - ref($name) && $name->can('new_sequence') and return $name; + ref($name) && $name->can('renew') and return $name; exists $Encoding{$name} and return $Encoding{$name}; my $lc = lc $name; exists $Encoding{$lc} and return $Encoding{$lc}; @@ -116,18 +116,26 @@ sub getEncoding return; } -sub find_encoding +sub find_encoding($;$) { my ($name, $skip_external) = @_; return __PACKAGE__->getEncoding($name,$skip_external); } -sub resolve_alias { +sub resolve_alias($){ my $obj = find_encoding(shift); defined $obj and return $obj->name; return; } +sub clone_encoding($){ + my $obj = find_encoding(shift); + ref $obj or return; + eval { require Storable }; + $@ and return; + return Storable::dclone($obj); +} + sub encode($$;$) { my ($name, $string, $check) = @_; diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 36d5f3d..7970058 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.55 2003/02/28 01:40:27 dankogai Exp $ + $Id: Encode.xs,v 1.55 2003/02/28 01:40:27 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -258,6 +258,16 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ +PROTOTYPES: DISABLE + +void +Method_renew(obj) +SV * obj +CODE: +{ + XSRETURN(1); +} + void Method_decode_xs(obj,src,check = 0) SV * obj @@ -389,6 +399,14 @@ MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ PROTOTYPES: ENABLE void +Method_renew(obj) +SV * obj +CODE: +{ + XSRETURN(1); +} + +void Method_name(obj) SV * obj CODE: diff --git a/ext/Encode/Unicode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm index 1829218..9648fd3 100644 --- a/ext/Encode/Unicode/Unicode.pm +++ b/ext/Encode/Unicode/Unicode.pm @@ -2,6 +2,7 @@ package Encode::Unicode; use strict; use warnings; +no warnings 'redefine'; our $VERSION = do { my @r = (q$Revision: 1.39 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; @@ -14,6 +15,8 @@ XSLoader::load(__PACKAGE__,$VERSION); require Encode; +our %BOM_Unknown = map {$_ => 1} qw(UTF-16 UTF-32); + for my $name (qw(UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE UCS-2BE UCS-2LE)) @@ -35,231 +38,23 @@ for my $name (qw(UTF-16 UTF-16BE UTF-16LE endian => $endian, ucs2 => $ucs2, } => __PACKAGE__; - } use base qw(Encode::Encoding); -# -# three implementations of (en|de)code exist. The XS version is the -# fastest. *_modern uses an array and *_classic sticks with substr. -# *_classic is much slower but more memory conservative. -# *_xs is the default. - -sub set_transcoder{ - no warnings qw(redefine); - my $type = shift; - if ($type eq "xs"){ - *decode = \&decode_xs; - *encode = \&encode_xs; - }elsif($type eq "modern"){ - *decode = \&decode_modern; - *encode = \&encode_modern; - }elsif($type eq "classic"){ - *decode = \&decode_classic; - *encode = \&encode_classic; - }else{ - require Carp; - Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)"; - } -} - -set_transcoder("xs"); - -# -# Aux. subs & constants -# - -sub FBCHAR(){ 0xFFFd } -sub BOM_BE(){ 0xFeFF } -sub BOM16LE(){ 0xFFFe } -sub BOM32LE(){ 0xFFFe0000 } - -sub valid_ucs2($){ - return - (0 <= $_[0] && $_[0] < 0xD800) - || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF); -} - -sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } -sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 } -sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF } - -sub ensurrogate($){ - use integer; # we have divisions - my $uni = shift; - my $hi = ($uni - 0x10000) / 0x400 + 0xD800; - my $lo = ($uni - 0x10000) % 0x400 + 0xDC00; - return ($hi, $lo); -} - -sub desurrogate($$){ - my ($hi, $lo) = @_; - return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00); -} - -sub Mask { {2 => 0xffff, 4 => 0xffffffff} } - -# -# *_modern are much faster but guzzle more memory -# - -sub decode_modern($$;$) -{ - my ($obj, $str, $chk ) = @_; - my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; - - # warn "$size, $endian, $ucs2"; - $endian ||= BOMB($size, substr($str, 0, $size, '')) - or poisoned2death($obj, "Where's the BOM?"); - my $mask = Mask->{$size}; - my $utf8 = ''; - my @ord = unpack("$endian*", $str); - undef $str; # to conserve memory - while (@ord){ - my $ord = shift @ord; - unless ($size == 4 or valid_ucs2($ord &= $mask)){ - if ($ucs2){ - $chk and - poisoned2death($obj, "no surrogates allowed", $ord); - shift @ord; # skip the next one as well - $ord = FBCHAR; - }else{ - unless (isHiSurrogate($ord)){ - poisoned2death($obj, "Malformed HI surrogate", $ord); - } - my $lo = shift @ord; - unless (isLoSurrogate($lo &= $mask)){ - poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); - } - $ord = desurrogate($ord, $lo); - } - } - $utf8 .= chr($ord); - } - utf8::upgrade($utf8); - return $utf8; -} - -sub encode_modern($$;$) -{ - my ($obj, $utf8, $chk) = @_; - my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; - my @str = (); - unless ($endian){ - $endian = ($size == 4) ? 'N' : 'n'; - push @str, BOM_BE; - } - my @ord = unpack("U*", $utf8); - undef $utf8; # to conserve memory - for my $ord (@ord){ - unless ($size == 4 or valid_ucs2($ord)) { - unless(issurrogate($ord)){ - if ($ucs2){ - $chk and - poisoned2death($obj, "code point too high", $ord); - - push @str, FBCHAR; - }else{ - - push @str, ensurrogate($ord); - } - }else{ # not supposed to happen - push @str, FBCHAR; - } - }else{ - push @str, $ord; - } - } - return pack("$endian*", @str); -} - -# -# *_classic are slower but more memory conservative -# - -sub decode_classic($$;$) -{ - my ($obj, $str, $chk ) = @_; - my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; - - # warn "$size, $endian, $ucs2"; - $endian ||= BOMB($size, substr($str, 0, $size, '')) - or poisoned2death($obj, "Where's the BOM?"); - my $mask = Mask->{$size}; - my $utf8 = ''; - my @ord = unpack("$endian*", $str); - while (length($str)){ - my $ord = unpack($endian, substr($str, 0, $size, '')); - unless ($size == 4 or valid_ucs2($ord &= $mask)){ - if ($ucs2){ - $chk and - poisoned2death($obj, "no surrogates allowed", $ord); - substr($str,0,$size,''); # skip the next one as well - $ord = FBCHAR; - }else{ - unless (isHiSurrogate($ord)){ - poisoned2death($obj, "Malformed HI surrogate", $ord); - } - my $lo = unpack($endian ,substr($str,0,$size,'')); - unless (isLoSurrogate($lo &= $mask)){ - poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); - } - $ord = desurrogate($ord, $lo); - } - } - $utf8 .= chr($ord); - } - utf8::upgrade($utf8); - return $utf8; +sub renew { + my $self = shift; + $BOM_Unknown{$self->name} or return $self; + my $clone = bless { %$self } => ref($self); + $clone->{clone} = 1; # so the caller knows it is renewed. + return $clone; } -sub encode_classic($$;$) -{ - my ($obj, $utf8, $chk) = @_; - my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; - # warn join ", ", $size, $ucs2, $endian, $mask; - my $str = ''; - unless ($endian){ - $endian = ($size == 4) ? 'N' : 'n'; - $str .= pack($endian, BOM_BE); - } - while (length($utf8)){ - my $ord = ord(substr($utf8,0,1,'')); - unless ($size == 4 or valid_ucs2($ord)) { - unless(issurrogate($ord)){ - if ($ucs2){ - $chk and - poisoned2death($obj, "code point too high", $ord); - $str .= pack($endian, FBCHAR); - }else{ - $str .= pack($endian.2, ensurrogate($ord)); - } - }else{ # not supposed to happen - $str .= pack($endian, FBCHAR); - } - }else{ - $str .= pack($endian, $ord); - } - } - return $str; -} +# There used to be a perl implemntation of (en|de)code but with +# XS version is ripe, perl version is zapped for optimal speed -sub BOMB { - my ($size, $bom) = @_; - my $N = $size == 2 ? 'n' : 'N'; - my $ord = unpack($N, $bom); - return ($ord eq BOM_BE) ? $N : - ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef; -} - -sub poisoned2death{ - my $obj = shift; - my $msg = shift; - my $pair = join(", ", map {sprintf "\\x%x", $_} @_); - require Carp; - Carp::croak $obj->name, ":", $msg, "<$pair>.", caller; -} +*decode = \&decode_xs; +*encode = \&encode_xs; 1; __END__ diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs index 2163fb5..8b02402 100644 --- a/ext/Encode/Unicode/Unicode.xs +++ b/ext/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 1.7 2003/02/20 14:42:34 dankogai Exp $ + $Id: Unicode.xs,v 1.7 2003/02/20 14:42:34 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -84,6 +84,9 @@ MODULE = Encode::Unicode PACKAGE = Encode::Unicode PROTOTYPES: DISABLE +#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ + *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) + void decode_xs(obj, str, check = 0) SV * obj @@ -91,10 +94,11 @@ SV * str IV check CODE: { - int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); - U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); - int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); - SV *result = newSVpvn("",0); + U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); + int size = SvIV(attr("size", 4)); + int ucs2 = SvTRUE(attr("ucs2", 4)); + int clone = SvTRUE(attr("clone", 5)); + SV *result = newSVpvn("",0); STRLEN ulen; U8 *s = (U8 *)SvPVbyte(str,ulen); U8 *e = (U8 *)SvEND(str); @@ -118,9 +122,11 @@ CODE: bom); } } -#if 0 - /* Update endian for this sequence */ - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); +#if 1 + /* Update endian for next sequence */ + if (clone) { + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + } #endif } while (s < e && s+size <= e) { @@ -188,10 +194,11 @@ SV * utf8 IV check CODE: { - int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); - U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); - int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); - SV *result = newSVpvn("",0); + U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); + int size = SvIV(attr("size", 4)); + int ucs2 = SvTRUE(attr("ucs2", 4)); + int clone = SvTRUE(attr("clone", 5)); + SV *result = newSVpvn("",0); STRLEN ulen; U8 *s = (U8 *)SvPVutf8(utf8,ulen); U8 *e = (U8 *)SvEND(utf8); @@ -199,9 +206,11 @@ CODE: if (!endian) { endian = (size == 4) ? 'N' : 'n'; enc_pack(aTHX_ result,size,endian,BOM_BE); -#if 0 - /* Update endian for this sequence */ - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); +#if 1 + /* Update endian for next sequence */ + if (clone){ + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + } #endif } while (s < e && s+UTF8SKIP(s) <= e) { diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm index 0bb4350..3978e9d 100644 --- a/ext/Encode/lib/Encode/Encoding.pm +++ b/ext/Encode/lib/Encode/Encoding.pm @@ -14,8 +14,10 @@ sub Define Encode::define_encoding($obj, $canonical, @_); } -sub name { return shift->{'Name'} } -sub new_sequence { return $_[0] } +sub name { return shift->{'Name'} } + +sub renew { return $_[0] } +*new_sequence = \&renew; sub needs_lines { 0 }; @@ -24,7 +26,8 @@ sub perlio_ok { return $@ ? 0 : 1; } -# Temporary legacy methods +# (Temporary|legacy) methods + sub toUnicode { shift->decode(@_) } sub fromUnicode { shift->encode(@_) } @@ -160,15 +163,28 @@ Predefined As: MUST return the string representing the canonical name of the encoding. -=item -Enew_sequence +=item -Erenew Predefined As: - sub new_sequence { return $_[0] } + sub renew { return $_[0] } + +This method reconstructs the encoding object if necessary. If you need +to store the state during encoding, this is where you clone your object. +Here is an example: + + sub renew { + my $self = shift; + my $clone = bless { %$self } => ref($self); + $clone->{clone} = 1; # so the caller can see it + return $clone; + } + +Since most encodings are stateless the default behavior is just return +itself as shown above. -This is a placeholder for encodings with state. It should return an -object which implements this interface. All current implementations -return the original object. +PerlIO ALWAYS calls this method to make sure it has its own private +encoding object. =item -Eperlio_ok() diff --git a/ext/PerlIO/encoding/encoding.xs b/ext/PerlIO/encoding/encoding.xs index 5f7b0df..cd69262 100644 --- a/ext/PerlIO/encoding/encoding.xs +++ b/ext/PerlIO/encoding/encoding.xs @@ -113,12 +113,13 @@ PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg, PerlIO_funcs * code = -1; } else { -#ifdef USE_NEW_SEQUENCE + + /* $enc->renew */ PUSHMARK(sp); XPUSHs(result); PUTBACK; - if (call_method("new_sequence",G_SCALAR|G_EVAL) != 1 || SvTRUE(ERRSV)) { - Perl_warner(aTHX_ packWARN(WARN_IO), "\"%" SVf "\" does not support new_sequence", + if (call_method("renew",G_SCALAR|G_EVAL) != 1 || SvTRUE(ERRSV)) { + Perl_warner(aTHX_ packWARN(WARN_IO), "\"%" SVf "\" does not support renew method", arg); } else { @@ -126,7 +127,6 @@ PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg, PerlIO_funcs * result = POPs; PUTBACK; } -#endif e->enc = newSVsv(result); PUSHMARK(sp); XPUSHs(e->enc);