[ 8293]
More rigor in UTF-8-ness of Encode's toUnicode
XS export some of the utf8 internal functions.
Test some of the functions.
Fix pp_concat() bug shown by said tests.
p4raw-link: @8293 on //depot/perlio:
a12c0f5690a38b5b84d767b0f8b7cc133a68affc
p4raw-id: //depot/perl@8295
p4raw-integrated: from //depot/perlio@8292 'copy in'
ext/Encode/Encode.xs (@8285..) ext/Encode/Encode.pm (@8290..)
pp_hot.c t/lib/encode.t (@8293..)
off_utf8
utf_to_utf
encodings
+ utf8_decode
+ utf8_encode
+ utf8_upgrade
+ utf8_downgrade
);
bootstrap Encode ();
package Encode::Unicode;
# Dummy package that provides the encode interface but leaves data
-# as UTF-8 encoded. It is here so that from_to()
+# as UTF-8 encoded. It is here so that from_to() works.
sub name { 'Unicode' }
-sub toUnicode { $_[1] }
+sub toUnicode
+{
+ my ($obj,$str,$chk) = @_;
+ Encode::utf8_upgrade($str);
+ $_[1] = '' if $chk;
+ return $str;
+}
-sub fromUnicode { $_[1] }
+*fromUnicode = \&toUnicode;
package Encode::Table;
$uni .= chr($code);
}
$_[1] = $str if $chk;
+ Encode::utf8_upgrade($uni);
return $uni;
}
return dst;
}
+MODULE = Encode PACKAGE = Encode PREFIX = sv_
+
+void
+valid_utf8(sv)
+SV * sv
+CODE:
+ {
+ STRLEN len;
+ char *s = SvPV(sv,len);
+ if (!SvUTF8(sv) || is_utf8_string(s,len))
+ XSRETURN_YES;
+ else
+ XSRETURN_NO;
+ }
+
+void
+sv_utf8_encode(sv)
+SV * sv
+
+bool
+sv_utf8_decode(sv)
+SV * sv
+
+void
+sv_utf8_upgrade(sv)
+SV * sv
+
+bool
+sv_utf8_downgrade(sv,failok=0)
+SV * sv
+bool failok
+
MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_
PROTOTYPES: ENABLE
left_utf8 = DO_UTF8(left);
right_utf8 = DO_UTF8(right);
+ if (!left_utf8 && !right_utf8 && SvUTF8(TARG)) {
+ SvUTF8_off(TARG);
+ }
+
if (left_utf8 != right_utf8 && !IN_BYTE) {
if (TARG == right && !right_utf8) {
sv_utf8_upgrade(TARG); /* Now straight binary copy */
if (SvIOK(TOPm1s)) {
bool auvok = SvUOK(TOPm1s);
bool buvok = SvUOK(TOPs);
-
+
if (!auvok && !buvok) { /* ## IV == IV ## */
IV aiv = SvIVX(TOPm1s);
IV biv = SvIVX(TOPs);
if (SvIOK(TOPm1s)) {
bool auvok = SvUOK(TOPm1s);
bool buvok = SvUOK(TOPs);
-
+
if (!auvok && !buvok) { /* ## IV + IV ## */
IV aiv = SvIVX(TOPm1s);
IV biv = SvIVX(TOPs);
aiv = SvIVX(TOPs);
buv = SvUVX(TOPm1s);
}
-
+
if (aiv >= 0) {
UV result = (UV)aiv + buv;
if (result >= buv) {
STRLEN keylen;
char *key = SvPV(keysv, keylen);
save_delete(hv, key, keylen);
- } else
+ } else
save_helem(hv, keysv, svp);
}
}
if (PL_tainted)
rxtainted |= 2;
TAINT_NOT;
-
+
force_it:
if (!pm || !s)
DIE(aTHX_ "panic: pp_subst");
my @source = qw(ascii iso8859-1 cp1250);
my @destiny = qw(cp1047 cp37 posix-bc);
my @ebcdic_sets = qw(cp1047 cp37 posix-bc);
-plan test => 13+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256;
+plan test => 21+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256;
my $str = join('',map(chr($_),0x20..0x7E));
my $cpy = $str;
ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong");
# On ASCII based machines see if we can map several codepoints from
# three distinct ASCII sets to three distinct EBCDIC coded character sets.
-# On EBCDIC machines see if we can map from three EBCDIC sets to three
+# On EBCDIC machines see if we can map from three EBCDIC sets to three
# distinct ASCII sets.
my @expectation = (240..249, 193..201,209..217,226..233, 129..137,145..153,162..169);
}
}
+for $i (256,128,129,256)
+ {
+ my $c = chr($i);
+ my $s = "$c\n".sprintf("%02X",$i);
+ ok(Encode::valid_utf8($s),1,"concat of $i botched");
+ Encode::utf8_upgrade($s);
+ ok(Encode::valid_utf8($s),1,"concat of $i botched");
+ }
+