The C<chr()> and C<ord()> functions work on characters. This is like
C<pack("U")> and C<unpack("U")>, not like C<pack("C")> and
C<unpack("C")>. In fact, the latter are how you now emulate
-byte-oriented C<chr()> and C<ord()> under utf8.
+byte-oriented C<chr()> and C<ord()> for Unicode strings.
+(Note that this reveals the internal UTF-8 encoding of strings and
+you are not supposed to do that unless you know what you are doing.)
=item *
int natint; /* native integer */
int unatint; /* unsigned native integer */
#endif
+ bool do_utf8 = DO_UTF8(right);
if (gimme != G_ARRAY) { /* arrange to do first one only */
/*SUPPRESS 530*/
DIE(aTHX_ "'!' allowed only after types %s", natstr);
}
star = 0;
- if (pat >= patend)
+ if (pat > patend)
len = 1;
else if (*pat == '*') {
len = strend - strbeg; /* long enough */
}
break;
case 'C':
+ unpack_C: /* unpack U will jump here if not UTF-8 */
+ if (len == 0) {
+ do_utf8 = FALSE;
+ break;
+ }
if (len > strend - s)
len = strend - s;
if (checksum) {
}
break;
case 'U':
+ if (len == 0) {
+ do_utf8 = TRUE;
+ break;
+ }
+ if (!do_utf8)
+ goto unpack_C;
if (len > strend - s)
len = strend - s;
if (checksum) {
}
-print "1..161\n";
+print "1..169\n";
# Note: All test numbers in comments are off by 1 after the comment below..
sprintf "%vd", pack("C0U*",1,20,300,4000);
print "ok $test\n"; $test++;
-# 160
+# 161
print "not " unless join(" ", unpack("C*", chr(0x1e2)))
eq ((ord(A) == 193) ? "156 67" : "199 162");
print "ok $test\n"; $test++;
+
+# 162: does pack U create Unicode?
+print "not " unless ord(pack('U', 300)) == 300;
+print "ok $test\n"; $test++;
+
+# 163: does unpack U deref Unicode?
+print "not " unless (unpack('U', chr(300)))[0] == 300;
+print "ok $test\n"; $test++;
+
+# 164: is unpack U the reverse of pack U for Unicode string?
+print "not "
+ unless "@{[unpack('U*', pack('U*', 100, 200, 300))]}" eq "100 200 300";
+print "ok $test\n"; $test++;
+
+# 165: is unpack U the reverse of pack U for byte string?
+print "not "
+ unless "@{[unpack('U*', pack('U*', 100, 200))]}" eq "100 200";
+print "ok $test\n"; $test++;
+
+# 166: does unpack C unravel pack U?
+print "not " unless "@{[unpack('C*', pack('U*', 100, 200))]}" eq "100 195 136";
+print "ok $test\n"; $test++;
+
+# 167: does pack U0C create Unicode?
+print "not " unless "@{[pack('U0C*', 100, 195, 136)]}" eq v100.v200;
+print "ok $test\n"; $test++;
+
+# 168: does pack C0U create characters?
+print "not " unless "@{[pack('C0U*', 100, 200)]}" eq pack("C*", 100, 195, 136);
+print "ok $test\n"; $test++;
+
+# 169: does unpack U0U on byte data warn?
+{
+ local $SIG{__WARN__} = sub { $@ = "@_" };
+ my @null = unpack('U0U', chr(255));
+ print "not " unless $@ =~ /^Malformed UTF-8 character /;
+ print "ok $test\n"; $test++;
+}
+
# 104..181
{
- my $WARNCNT;
my $id;
- local $SIG{__WARN__} =
- sub {
- print "# $id: @_";
- $WARNCNT++;
- $WARNMSG = "@_";
- };
+ local $SIG{__WARN__} = sub {
+ print "# $id: @_";
+ $@ = "@_";
+ };
sub moan {
print "$id: @_";
}
- sub test_unpack_U {
- $WARNCNT = 0;
- $WARNMSG = "";
- unpack('U*', $_[0]);
+ sub warn_unpack_U {
+ $@ = '';
+ my @null = unpack('U0U*', $_[0]);
+ return $@;
}
for (@MK) {
# print "# $_\n";
} elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) {
$id = $1;
- my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $error) =
+ my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $experr) =
($2, $3, $4, $5, $6, $7, $8);
my @hex = split(/:/, $hex);
unless (@hex == $byteslen) {
moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
}
}
+ my $warn = warn_unpack_U($bytes);
if ($okay eq 'y') {
- test_unpack_U($bytes);
- if ($WARNCNT) {
- moan "unpack('U*') false negative\n";
+ if ($warn) {
+ moan "unpack('U0U*') false negative\n";
print "not ";
}
} elsif ($okay eq 'n') {
- test_unpack_U($bytes);
- if ($WARNCNT == 0 || ($error ne '' && $WARNMSG !~ /$error/)) {
- moan "unpack('U*') false positive\n";
+ if (not $warn || ($experr ne '' && $warn !~ /$experr/)) {
+ moan "unpack('U0U*') false positive\n";
print "not ";
}
}
- print "ok $test\n";
+ print "ok $test # $id $okay\n";
$test++;
} else {
moan "unknown format\n";