print "ok 8\n";
# the first octet of UTF-8 encoded 0x3af
-print "not " unless unpack("C", chr(0xdf)) == 0xce;
+print "not " unless unpack("U0 C", chr(0xdf)) == 0xce;
print "ok 9\n";
print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
}
# Better than no plan, because I was getting out of memory errors, at which
# point Test::More tidily prints up 1..79 as if I meant to finish there.
-use Test::More tests=>148;
+use Test::More tests=>144;
use bytes ();
my %utf8hash;
foreach my $i (@ords){
my $u = chr($i); utf8::upgrade($u);
# warn sprintf "%d,%d", bytes::length($u), is_utf8($u);
- my $b = pack("C*", unpack("C*", $u));
+ my $b = chr($i); utf8::encode($b);
# warn sprintf "%d,%d" ,bytes::length($b), is_utf8($b);
- isnt($u, $b,
- "equivalence - with utf8flag");
- is (pack("C*", unpack("C*", $u)), pack("C*", unpack("C*", $b)),
- "equivalence - without utf8flag");
+ isnt($u, $b, "equivalence - with utf8flag");
$utf8hash{$u} = $utf8hash{$b} = $i;
}
no warnings qw(utf8);
# U+3042 is 3-byte length (in UTF-8/UTF-EBCDIC)
-our $a = pack 'U0C', unpack 'C', "\x{3042}";
+our $a = pack 'U0C', unpack 'U0C', "\x{3042}";
print NFD($a) eq "\0"
? "ok" : "not ok", " 2\n";
shift() if @_ > 1 and ( ref($_[0]) || (defined $_[1] && $_[0] eq $CGI::DefaultClass));
my $toencode = shift;
return undef unless defined($toencode);
- # force bytes while preserving backward compatibility -- dankogai
- $toencode = pack("C*", unpack("C*", $toencode));
+ # we enforce UTF-8 encoding for URLs for no good reason except UTF-8 being the future
+ utf8::encode $toencode;
if ($EBCDIC) {
$toencode=~s/([^a-zA-Z0-9_.~-])/uc sprintf("%%%02x",$E2A[ord($1)])/eg;
} else {
H A hex string (high nybble first).
c A signed char (8-bit) value.
- C An unsigned C char (octet) even under Unicode. Should normally not
- be used. See U and W instead.
+ C An unsigned char (octet) value.
W An unsigned char value (can be greater than 255).
s A signed short (16-bit) value.
P A pointer to a structure (fixed-length string).
u A uuencoded string.
- U A Unicode character number. Encodes to UTF-8 internally
- (or UTF-EBCDIC in EBCDIC platforms).
+ U A Unicode character number. Encodes to a character in character mode
+ and UTF-8 (or UTF-EBCDIC in EBCDIC platforms) in byte mode.
w A BER compressed integer (not an ASN.1 BER, see perlpacktut for
details). Its bytes represent an unsigned integer in base 128,
* other pp*.c files for the rest of the pp_ functions.
*/
-
#include "EXTERN.h"
#define PERL_IN_PP_PACK_C
#include "perl.h"
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0,
- /* C */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE,
+ /* C */ sizeof(unsigned char),
#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
/* D */ LONG_DOUBLESIZE,
#else
/* w */ sizeof(char) | PACK_SIZE_UNPREDICTABLE | PACK_SIZE_CANNOT_CSUM,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* C */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE,
+ /* C */ sizeof(unsigned char),
#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE)
/* D */ LONG_DOUBLESIZE,
#else
XPUSHs(sv);
break;
}
+ case 'C':
+ if (len == 0) {
+ if (explicit_length)
+ /* Switch to "character" mode */
+ utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
+ break;
+ }
+ /* FALL THROUGH */
case 'c':
- while (len-- > 0) {
- int aint = SHIFT_BYTE(utf8, s, strend, datumtype);
- if (aint >= 128) /* fake up signed chars */
+ while (len-- > 0 && s < strend) {
+ int aint;
+ if (utf8)
+ {
+ STRLEN retlen;
+ aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ if (retlen == (STRLEN) -1 || retlen == 0)
+ Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
+ s += retlen;
+ }
+ else
+ aint = *(U8 *)(s)++;
+ if (aint >= 128 && datumtype != 'C') /* fake up signed chars */
aint -= 256;
if (!checksum)
PUSHs(sv_2mortal(newSViv((IV)aint)));
cuv += aint;
}
break;
- case 'C':
case 'W':
W_checksum:
- if (len == 0) {
- if (explicit_length && datumtype == 'C')
- /* Switch to "character" mode */
- utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
- break;
- }
- if (datumtype == 'C' ?
- (symptr->flags & FLAG_DO_UTF8) &&
- !(symptr->flags & FLAG_WAS_UTF8) : utf8) {
+ if (utf8) {
while (len-- > 0 && s < strend) {
STRLEN retlen;
const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0;
break;
}
- GROWING(0, cat, start, cur, len);
while (len-- > 0) {
IV aiv;
fromstr = NEXTFROM;
ckWARN(WARN_PACK))
Perl_warner(aTHX_ packWARN(WARN_PACK),
"Character in 'C' format wrapped in pack");
- *cur++ = (char)(aiv & 0xff);
+ PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff));
}
break;
case 'W': {
sub hexes {
no warnings 'utf8'; # avoid surrogate and beyond Unicode warnings
- join(" ",map{sprintf"%02x",$_}unpack("C*",chr($_[0])));
+ join(" ",unpack "U0 (H2)*", chr $_[0]);
}
# The following code points are some interesting steps in UTF-8.
my $no_signedness = $] > 5.009 ? '' :
"Signed/unsigned pack modifiers not available on this perl";
-plan tests => 14697;
+plan tests => 14696;
use strict;
use warnings qw(FATAL all);
isnt(v1.20.300.4000, sprintf "%vd", pack("C0U*",1,20,300,4000));
my $rslt = $Is_EBCDIC ? "156 67" : "199 162";
-is(join(" ", unpack("C*", chr(0x1e2))), $rslt);
+is(join(" ", unpack("U0 C*", chr(0x1e2))), $rslt);
# does pack U create Unicode?
is(ord(pack('U', 300)), 300);
SKIP: {
skip "Not for EBCDIC", 4 if $Is_EBCDIC;
- # does unpack C unravel pack U?
- is("@{[unpack('C*', pack('U*', 100, 200))]}", "100 195 136");
-
# does pack U0C create Unicode?
is("@{[pack('U0C*', 100, 195, 136)]}", v100.v200);
}
{
- # C is *not* neutral
+ # C *is* neutral
my $down = "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x05\x06";
my $up = $down;
utf8::upgrade($up);
is(pack("C*", @down), $down, "byte join");
my @up = unpack("C*", $up);
- my @expect_up = (0xc3, 0xb8, 0xc3, 0xb9, 0xc3, 0xba, 0xc3, 0xbb, 0xc3, 0xbc, 0xc3, 0xbd, 0xc3, 0xbe, 0xc3, 0xbf, 0x05, 0x06);
+ my @expect_up = (0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x05, 0x06);
is("@up", "@expect_up", "UTF-8 expand");
is(pack("U0C0C*", @up), $up, "UTF-8 join");
}
my $arg = $ENV{PATH}; # a tainted value
use constant UTF8 => "\x{1234}";
-sub is_utf8 {
- my $s = shift;
- return 0xB6 != unpack('C', chr(0xB6).$s);
-}
+*is_utf8 = \&utf8::is_utf8;
for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) {
my $encode = $ary->[0];