From: Nick Ing-Simmons Date: Sun, 20 Oct 2002 12:36:16 +0000 (+0000) Subject: Encode::utf8 encode/decode methods as XS that obeys X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=19d607df75c65b2c7fb5706080ac2f302c4a2328;p=p5sagit%2Fp5-mst-13.2.git Encode::utf8 encode/decode methods as XS that obeys same protocol as Encode::XS - allows :encoding(UTF-8) to cope with partial chars at end of buffer. p4raw-id: //depot/perlio@18032 --- diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 2500e14..81f2cbc 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -243,21 +243,7 @@ sub predefine_encodings{ # was in Encode::utf8 package Encode::utf8; push @Encode::utf8::ISA, 'Encode::Encoding'; - *decode = sub{ - my ($obj,$octets,$chk) = @_; - my $str = Encode::decode_utf8($octets); - if (defined $str) { - $_[1] = '' if $chk; - return $str; - } - return undef; - }; - *encode = sub { - my ($obj,$string,$chk) = @_; - my $octets = Encode::encode_utf8($string); - $_[1] = '' if $chk; - return $octets; - }; + # encode and decode methods now in Encode.xs $Encode::Encoding{utf8} = bless {Name => "utf8"} => "Encode::utf8"; } diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index e631106..d40c869 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -238,6 +238,123 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, return dst; } +MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ + +void +Method_decode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen); + SvPOK_only(dst); + SvCUR_set(dst,0); + while (s < e) { + if (UTF8_IS_INVARIANT(*s) || UTF8_IS_START(*s)) { + U8 skip = UTF8SKIP(s); + if ((s + skip) > e) { + /* Partial character - done */ + break; + } + else if (is_utf8_char(s)) { + /* Whole char is good */ + sv_catpvn(dst,(char *)s,skip); + s += skip; + continue; + } + else { + /* starts ok but isn't "good" */ + } + } + else { + /* Invalid start byte */ + } + /* If we get here there is something wrong with alleged UTF-8 */ + if (check & ENCODE_DIE_ON_ERR){ + Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", (UV)*s); + XSRETURN(0); + } + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner(aTHX_ packWARN(WARN_UTF8), + ERR_DECODE_NOMAP, "utf8", (UV)*s); + } + if (check & ENCODE_RETURN_ON_ERR) { + break; + } + if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ + SV* perlqq = newSVpvf("\\x%02" UVXf, (UV)*s); + sv_catsv(dst, perlqq); + SvREFCNT_dec(perlqq); + } else { + sv_catpv(dst, FBCHAR_UTF8); + } + s++; + } + *SvEND(dst) = '\0'; + + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); + } + SvUTF8_on(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); +} + +void +Method_encode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src, slen); + U8 *e = (U8 *) SvEND(src); + SV *dst = newSV(slen); + if (SvUTF8(src)) { + /* Already encoded - trust it and just copy the octets */ + sv_setpvn(dst,(char *)s,(e-s)); + s = e; + } + else { + /* Native bytes - can always encode */ + U8 *d = (U8 *) SvGROW(dst,2*slen); + while (s < e) { + UV uv = NATIVE_TO_UNI((UV) *s++); + if (UNI_IS_INVARIANT(uv)) + *d++ = (U8)UTF_TO_NATIVE(uv); + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); + *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); + } + } + SvCUR_set(dst, d- (U8 *)SvPVX(dst)); + *SvEND(dst) = '\0'; + } + + /* Clear out translated part of source unless asked not to */ + if (check && !(check & ENCODE_LEAVE_SRC)){ + slen = e-s; + if (slen) { + sv_setpvn(src, (char*)s, slen); + } + SvCUR_set(src, slen); + } + SvPOK_only(dst); + SvUTF8_off(dst); + ST(0) = sv_2mortal(dst); + XSRETURN(1); +} + MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ PROTOTYPES: ENABLE diff --git a/ext/PerlIO/t/encoding.t b/ext/PerlIO/t/encoding.t index ce07fea..cf80af7 100644 --- a/ext/PerlIO/t/encoding.t +++ b/ext/PerlIO/t/encoding.t @@ -12,13 +12,14 @@ BEGIN { } } -print "1..13\n"; +print "1..14\n"; my $grk = "grk$$"; my $utf = "utf$$"; my $fail1 = "fa$$"; my $fail2 = "fb$$"; my $russki = "koi8r$$"; +my $threebyte = "3byte$$"; if (open(GRK, ">$grk")) { binmode(GRK, ":bytes"); @@ -131,6 +132,21 @@ if (!defined $warn) { print "$warn"; } +# Create a string of chars that are 3 bytes in UTF-8 +my $str = "\x{1f80}" x 2048; + +# Write them to a file +open(F,'>:utf8',$threebyte) || die "Cannot open $threebyte:$!"; +print F $str; +close(F); + +# Read file back as UTF-8 +open(F,'<:encoding(utf-8)',$threebyte) || die "Cannot open $threebyte:$!"; +my $dstr = ; +close(F); +print "not " unless ($dstr eq $str); +print "ok 14\n"; + END { - unlink($grk, $utf, $fail1, $fail2, $russki); + unlink($grk, $utf, $fail1, $fail2, $russki, $threebyte); }