From: Nick Ing-Simmons Date: Mon, 1 Jan 2001 21:58:50 +0000 (+0000) Subject: More rigor in UTF-8-ness of Encode's toUnicode X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a12c0f5690a38b5b84d767b0f8b7cc133a68affc;p=p5sagit%2Fp5-mst-13.2.git More rigor in UTF-8-ness of Encode's toUnicode XS export some of the utf8 internal functions. Test some of the functions. Fix pp_concat() bug shown by said tests. p4raw-id: //depot/perlio@8293 --- diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 42c9e84..1f4ffb1 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -22,6 +22,10 @@ require Exporter; off_utf8 utf_to_utf encodings + utf8_decode + utf8_encode + utf8_upgrade + utf8_downgrade ); bootstrap Encode (); @@ -409,13 +413,19 @@ sub getEncoding package Encode::Unicode; # Dummy package that provides the encode interface but leaves data -# as UTF-8 encoded. It is here so that from_to() +# as UTF-8 encoded. It is here so that from_to() works. sub name { 'Unicode' } -sub toUnicode { $_[1] } +sub toUnicode +{ + my ($obj,$str,$chk) = @_; + Encode::utf8_upgrade($str); + $_[1] = '' if $chk; + return $str; +} -sub fromUnicode { $_[1] } +*fromUnicode = \&toUnicode; package Encode::Table; @@ -550,6 +560,7 @@ sub toUnicode $uni .= chr($code); } $_[1] = $str if $chk; + Encode::utf8_upgrade($uni); return $uni; } diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index b61d89b..7ea22d4 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -415,6 +415,38 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check) return dst; } +MODULE = Encode PACKAGE = Encode PREFIX = sv_ + +void +valid_utf8(sv) +SV * sv +CODE: + { + STRLEN len; + char *s = SvPV(sv,len); + if (!SvUTF8(sv) || is_utf8_string(s,len)) + XSRETURN_YES; + else + XSRETURN_NO; + } + +void +sv_utf8_encode(sv) +SV * sv + +bool +sv_utf8_decode(sv) +SV * sv + +void +sv_utf8_upgrade(sv) +SV * sv + +bool +sv_utf8_downgrade(sv,failok=0) +SV * sv +bool failok + MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_ PROTOTYPES: ENABLE diff --git a/pp_hot.c b/pp_hot.c index f9c5960..795ba32 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -155,6 +155,10 @@ PP(pp_concat) left_utf8 = DO_UTF8(left); right_utf8 = DO_UTF8(right); + if (!left_utf8 && !right_utf8 && SvUTF8(TARG)) { + SvUTF8_off(TARG); + } + if (left_utf8 != right_utf8 && !IN_BYTE) { if (TARG == right && !right_utf8) { sv_utf8_upgrade(TARG); /* Now straight binary copy */ @@ -289,7 +293,7 @@ PP(pp_eq) if (SvIOK(TOPm1s)) { bool auvok = SvUOK(TOPm1s); bool buvok = SvUOK(TOPs); - + if (!auvok && !buvok) { /* ## IV == IV ## */ IV aiv = SvIVX(TOPm1s); IV biv = SvIVX(TOPs); @@ -416,7 +420,7 @@ PP(pp_add) if (SvIOK(TOPm1s)) { bool auvok = SvUOK(TOPm1s); bool buvok = SvUOK(TOPs); - + if (!auvok && !buvok) { /* ## IV + IV ## */ IV aiv = SvIVX(TOPm1s); IV biv = SvIVX(TOPs); @@ -459,7 +463,7 @@ PP(pp_add) aiv = SvIVX(TOPs); buv = SvUVX(TOPm1s); } - + if (aiv >= 0) { UV result = (UV)aiv + buv; if (result >= buv) { @@ -1627,7 +1631,7 @@ PP(pp_helem) STRLEN keylen; char *key = SvPV(keysv, keylen); save_delete(hv, key, keylen); - } else + } else save_helem(hv, keysv, svp); } } @@ -1857,7 +1861,7 @@ PP(pp_subst) if (PL_tainted) rxtainted |= 2; TAINT_NOT; - + force_it: if (!pm || !s) DIE(aTHX_ "panic: pp_subst"); diff --git a/t/lib/encode.t b/t/lib/encode.t index 34527d7..890be55 100644 --- a/t/lib/encode.t +++ b/t/lib/encode.t @@ -16,7 +16,7 @@ my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z'); my @source = qw(ascii iso8859-1 cp1250); my @destiny = qw(cp1047 cp37 posix-bc); my @ebcdic_sets = qw(cp1047 cp37 posix-bc); -plan test => 13+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256; +plan test => 21+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256; my $str = join('',map(chr($_),0x20..0x7E)); my $cpy = $str; ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong"); @@ -47,7 +47,7 @@ foreach my $enc (qw(symbol dingbats ascii),@encodings) # On ASCII based machines see if we can map several codepoints from # three distinct ASCII sets to three distinct EBCDIC coded character sets. -# On EBCDIC machines see if we can map from three EBCDIC sets to three +# On EBCDIC machines see if we can map from three EBCDIC sets to three # distinct ASCII sets. my @expectation = (240..249, 193..201,209..217,226..233, 129..137,145..153,162..169); @@ -91,3 +91,12 @@ foreach my $enc_eb (@ebcdic_sets) } } +for $i (256,128,129,256) + { + my $c = chr($i); + my $s = "$c\n".sprintf("%02X",$i); + ok(Encode::valid_utf8($s),1,"concat to utf8 botched"); + Encode::utf8_upgrade($s); + ok(Encode::valid_utf8($s),1,"concat to utf8 botched"); + } +