From: Ton Hospel Date: Sun, 6 Mar 2005 18:29:38 +0000 (+0000) Subject: Encoding neutral unpack X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f337b084e4f053c4222a0b9a773a9e12c0232e6d;p=p5sagit%2Fp5-mst-13.2.git Encoding neutral unpack Message-Id: p4raw-id: //depot/perl@24010 --- diff --git a/embed.fnc b/embed.fnc index 795f3fe..6fd4a1d 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1064,7 +1064,6 @@ s |I32 |measure_struct |tempsym_t* symptr s |char * |group_end |char *pat|char *patend|char ender s |char * |get_num |char *ppat|I32 * s |bool |next_symbol |tempsym_t* symptr -s |void |doencodes |SV* sv|char* s|I32 len s |SV* |is_an_int |char *s|STRLEN l s |int |div128 |SV *pnum|bool *done #endif diff --git a/embed.h b/embed.h index d5c5e40..00ae7a7 100644 --- a/embed.h +++ b/embed.h @@ -1469,9 +1469,6 @@ #define next_symbol S_next_symbol #endif #ifdef PERL_CORE -#define doencodes S_doencodes -#endif -#ifdef PERL_CORE #define is_an_int S_is_an_int #endif #ifdef PERL_CORE @@ -4069,9 +4066,6 @@ #define next_symbol(a) S_next_symbol(aTHX_ a) #endif #ifdef PERL_CORE -#define doencodes(a,b,c) S_doencodes(aTHX_ a,b,c) -#endif -#ifdef PERL_CORE #define is_an_int(a,b) S_is_an_int(aTHX_ a,b) #endif #ifdef PERL_CORE diff --git a/genpacksizetables.pl b/genpacksizetables.pl index e63a3aa..0fffe9b 100755 --- a/genpacksizetables.pl +++ b/genpacksizetables.pl @@ -4,13 +4,16 @@ use strict; use Encode; -my @lines = grep {!/^#/} ; +my @lines = grep { + s/#.*//; + /\S/; +} ; sub addline { - my ($arrays, $chrmap, $letter, $arrayname, $spare, $nocsum, $size, + my ($arrays, $chrmap, $letter, $arrayname, $unpredictable, $nocsum, $size, $condition) = @_; my $line = "/* $letter */ $size"; - $line .= " | PACK_SIZE_SPARE" if $spare; + $line .= " | PACK_SIZE_UNPREDICTABLE" if $unpredictable; $line .= " | PACK_SIZE_CANNOT_CSUM" if $nocsum; $line .= ","; # And then the hack @@ -24,7 +27,7 @@ sub output_tables { my $chrmap = shift; foreach (@_) { - my ($letter, $shriek, $spare, $nocsum, $size, $condition) + my ($letter, $shriek, $unpredictable, $nocsum, $size, $condition) = /^([A-Za-z])(!?)\t(\S*)\t(\S*)\t([^\t\n]+)(?:\t+(.*))?$/; die "Can't parse '$_'" unless $size; @@ -36,7 +39,7 @@ sub output_tables { } addline (\%arrays, $chrmap, $letter, $shriek ? 'shrieking' : 'normal', - $spare, $nocsum, $size, $condition); + $unpredictable, $nocsum, $size, $condition); } my %earliest; @@ -100,11 +103,12 @@ output_tables (\%ebcdicmap, @lines); print "#endif\n"; __DATA__ -#Symbol spare nocsum size +#Symbol unpredictable +# nocsum size c char -C unsigned char -W unsigned char -U char +C * unsigned char +W * unsigned char +U * char s! short s =SIZE16 S! unsigned short @@ -128,7 +132,7 @@ V! =SIZE32 PERL_PACK_CAN_SHRIEKSIGN N! =SIZE32 PERL_PACK_CAN_SHRIEKSIGN L =SIZE32 p * char * -w * char +w * * char q Quad_t HAS_QUAD Q Uquad_t HAS_QUAD f float diff --git a/lib/charnames.t b/lib/charnames.t index 49917c5..c53f54a 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -61,7 +61,7 @@ else { # EBCDIC where UTF-EBCDIC may be used (this may be 1047 specific since } sub to_bytes { - pack"a*", shift; + unpack"U0a*", shift; } { diff --git a/perl.h b/perl.h index b8a9642..ee51583 100644 --- a/perl.h +++ b/perl.h @@ -3700,7 +3700,7 @@ typedef enum { e_star /* asterisk */ } howlen_t; -typedef struct { +typedef struct tempsym { char* patptr; /* current template char */ char* patend; /* one after last char */ char* grpbeg; /* 1st char of ()-group */ @@ -3711,6 +3711,8 @@ typedef struct { int level; /* () nesting level */ U32 flags; /* /=4, comma=2, pack=1 */ /* and group modifiers */ + STRLEN strbeg; /* offset of group start */ + struct tempsym *previous; /* previous group */ } tempsym_t; #include "thread.h" diff --git a/pod/perldiag.pod b/pod/perldiag.pod index b2ea972..3687d41 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1148,7 +1148,7 @@ references can be weakened. with an assignment operator, which implies modifying the value itself. Perhaps you need to copy the value to a temporary, and repeat that. -=item Character in "C" format wrapped in pack +=item Character in 'C' format wrapped in pack (W pack) You said @@ -1163,7 +1163,19 @@ and so on) and not for Unicode characters, so Perl behaved as if you meant If you actually want to pack Unicode codepoints, use the C<"U"> format instead. -=item Character in "c" format wrapped in pack +=item Character in 'W' format wrapped in pack + +(W pack) You said + + pack("U0W", $x) + +where $x is either less than 0 or more than 255. However, C-mode expects +all values to fall in the interval [0, 255], so Perl behaved as if you +meant: + + pack("U0W", $x & 255) + +=item Character in 'c' format wrapped in pack (W pack) You said @@ -1178,6 +1190,42 @@ and so on) and not for Unicode characters, so Perl behaved as if you meant If you actually want to pack Unicode codepoints, use the C<"U"> format instead. +=item Character in '%c' format wrapped in unpack + +(W unpack) You tried something like + + unpack("H", "\x{2a1}") + +where the format expects to process a byte (a character with a value +below 256), but a higher value was provided instead. Perl uses the value +modulus 256 instead, as if you had provided: + + unpack("H", "\x{a1}") + +=item Character(s) in '%c' format wrapped in pack + +(W pack) You tried something like + + pack("u", "\x{1f3}b") + +where the format expects to process a sequence of bytes (character with a +value below 256), but some of the characters had a higher value. Perl +uses the character values modulus 256 instead, as if you had provided: + + pack("u", "\x{f3}b") + +=item Character(s) in '%c' format wrapped in unpack + +(W unpack) You tried something like + + unpack("s", "\x{1f3}b") + +where the format expects to process a sequence of bytes (character with a +value below 256), but some of the characters had a higher value. Perl +uses the character values modulus 256 instead, as if you had provided: + + unpack("s", "\x{f3}b") + =item close() on unopened filehandle %s (W unopened) You tried to close a filehandle that was never opened. @@ -1560,6 +1608,13 @@ you which section of the Perl source code is distressed. (F) Your machine apparently doesn't implement fcntl(). What is this, a PDP-11 or something? +=item Field too wide in 'u' format in pack + +(W pack) Each line in an uuencoded string start with a length indicator +which can't encode values above 63. So there is no point in asking for +a line length bigger than that. Perl will behave as if you specified +C as format. + =item Filehandle %s opened only for input (W io) You tried to write on a read-only filehandle. If you intended @@ -2113,6 +2168,21 @@ possibility is careless use of utf8::upgrade(). Perl thought it was reading UTF-16 encoded character data but while doing it Perl met a malformed Unicode surrogate. +=item Malformed UTF-8 string in pack + +(F) You tried to pack something that didn't comply with UTF-8 encoding +rules and perl was unable to guess how to make more progress. + +=item Malformed UTF-8 string in unpack + +(F) You tried to unpack something that didn't comply with UTF-8 encoding +rules and perl was unable to guess how to make more progress. + +=item Malformed UTF-8 string in '%c' format in unpack + +(F) You tried to unpack something that didn't comply with UTF-8 encoding +rules and perl was unable to guess how to make more progress. + =item %s matches null string many times in regex; marked by <-- HERE in m/%s/ (W regexp) The pattern you've specified would be an infinite loop if the @@ -2716,6 +2786,12 @@ otherwise. (F) You had a template that specified an absolute position outside the string being unpacked. See L. +=item '@' outside of string with malformed UTF-8 in unpack + +(F) You had a template that specified an absolute position outside +the string being unpacked. The string being unpacked was also invalid +UTF-8. See L. + =item %s package attribute may clash with future reserved word: %s (W reserved) A lowercase attribute name was used that had a diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index dc23b21..c15185e 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -2054,7 +2054,7 @@ addresses returned by the corresponding system library call. In the Internet domain, each address is four bytes long and you can unpack it by saying something like: - ($a,$b,$c,$d) = unpack('C4',$addr[0]); + ($a,$b,$c,$d) = unpack('W4',$addr[0]); The Socket library makes this slightly easier: @@ -3296,7 +3296,8 @@ Takes a LIST of values and converts it into a string using the rules given by the TEMPLATE. The resulting string is the concatenation of the converted values. Typically, each converted value looks like its machine-level representation. For example, on 32-bit machines -a converted integer may be represented by a sequence of 4 bytes. +an integer may be represented by a sequence of 4 bytes which will be +converted to a sequence of 4 characters. The TEMPLATE is a sequence of characters that give the order and type of values, as follows: @@ -3311,7 +3312,9 @@ of values, as follows: H A hex string (high nybble first). c A signed char (8-bit) value. - C An unsigned char value. Only does bytes. See U for Unicode. + C An unsigned C char (octet) even under Unicode. Should normally not + be used. See U and W instead. + W An unsigned char value (can be greater than 255). s A signed short (16-bit) value. S An unsigned short value. @@ -3414,71 +3417,72 @@ byte (so the packed result will be one longer than the byte C of the item). The repeat count for C is interpreted as the maximal number of bytes -to encode per line of output, with 0 and 1 replaced by 45. +to encode per line of output, with 0, 1 and 2 replaced by 45. The repeat +count should not be more than 65. =item * The C, C, and C types gobble just one value, but pack it as a string of length count, padding with nulls or spaces as necessary. When unpacking, C strips trailing spaces and nulls, C strips everything -after the first null, and C returns data verbatim. When packing, -C, and C are equivalent. +after the first null, and C returns data verbatim. If the value-to-pack is too long, it is truncated. If too long and an explicit count is provided, C packs only C<$count-1> bytes, followed -by a null byte. Thus C always packs a trailing null byte under -all circumstances. +by a null byte. Thus C always packs a trailing null (except when the +count is 0). =item * Likewise, the C and C fields pack a string that many bits long. -Each byte of the input field of pack() generates 1 bit of the result. +Each character of the input field of pack() generates 1 bit of the result. Each result bit is based on the least-significant bit of the corresponding -input byte, i.e., on C. In particular, bytes C<"0"> and -C<"1"> generate bits 0 and 1, as do bytes C<"\0"> and C<"\1">. +input character, i.e., on C. In particular, characters C<"0"> +and C<"1"> generate bits 0 and 1, as do characters C<"\0"> and C<"\1">. Starting from the beginning of the input string of pack(), each 8-tuple -of bytes is converted to 1 byte of output. With format C -the first byte of the 8-tuple determines the least-significant bit of a -byte, and with format C it determines the most-significant bit of -a byte. +of characters is converted to 1 character of output. With format C +the first character of the 8-tuple determines the least-significant bit of a +character, and with format C it determines the most-significant bit of +a character. If the length of the input string is not exactly divisible by 8, the -remainder is packed as if the input string were padded by null bytes +remainder is packed as if the input string were padded by null characters at the end. Similarly, during unpack()ing the "extra" bits are ignored. -If the input string of pack() is longer than needed, extra bytes are ignored. -A C<*> for the repeat count of pack() means to use all the bytes of -the input field. On unpack()ing the bits are converted to a string -of C<"0">s and C<"1">s. +If the input string of pack() is longer than needed, extra characters are +ignored. A C<*> for the repeat count of pack() means to use all the +characters of the input field. On unpack()ing the bits are converted to a +string of C<"0">s and C<"1">s. =item * The C and C fields pack a string that many nybbles (4-bit groups, representable as hexadecimal digits, 0-9a-f) long. -Each byte of the input field of pack() generates 4 bits of the result. -For non-alphabetical bytes the result is based on the 4 least-significant -bits of the input byte, i.e., on C. In particular, -bytes C<"0"> and C<"1"> generate nybbles 0 and 1, as do bytes -C<"\0"> and C<"\1">. For bytes C<"a".."f"> and C<"A".."F"> the result +Each character of the input field of pack() generates 4 bits of the result. +For non-alphabetical characters the result is based on the 4 least-significant +bits of the input character, i.e., on C. In particular, +characters C<"0"> and C<"1"> generate nybbles 0 and 1, as do bytes +C<"\0"> and C<"\1">. For characters C<"a".."f"> and C<"A".."F"> the result is compatible with the usual hexadecimal digits, so that C<"a"> and -C<"A"> both generate the nybble C<0xa==10>. The result for bytes +C<"A"> both generate the nybble C<0xa==10>. The result for characters C<"g".."z"> and C<"G".."Z"> is not well-defined. Starting from the beginning of the input string of pack(), each pair -of bytes is converted to 1 byte of output. With format C the -first byte of the pair determines the least-significant nybble of the -output byte, and with format C it determines the most-significant +of characters is converted to 1 character of output. With format C the +first character of the pair determines the least-significant nybble of the +output character, and with format C it determines the most-significant nybble. If the length of the input string is not even, it behaves as if padded -by a null byte at the end. Similarly, during unpack()ing the "extra" +by a null character at the end. Similarly, during unpack()ing the "extra" nybbles are ignored. -If the input string of pack() is longer than needed, extra bytes are ignored. -A C<*> for the repeat count of pack() means to use all the bytes of -the input field. On unpack()ing the bits are converted to a string +If the input string of pack() is longer than needed, extra characters are +ignored. +A C<*> for the repeat count of pack() means to use all the characters of +the input field. On unpack()ing the nybbles are converted to a string of hexadecimal digits. =item * @@ -3512,7 +3516,7 @@ I, but if you put in the '*' it will be ignored. For all other codes, C applies the length value to the next item, which must not have a repeat count. - unpack 'C/a', "\04Gurusamy"; gives 'Guru' + unpack 'W/a', "\04Gurusamy"; gives 'Guru' unpack 'a3/A* A*', '007 Bond J '; gives (' Bond','J') pack 'n/a* w/a*','hello,','world'; gives "\000\006hello,\005world" @@ -3581,7 +3585,7 @@ Some systems may have even weirder byte orders such as You can see your system's preference with print join(" ", map { sprintf "%#02x", $_ } - unpack("C*",pack("L",0x12345678))), "\n"; + unpack("W*",pack("L",0x12345678))), "\n"; The byteorder on the platform where Perl was built is also available via L: @@ -3649,21 +3653,21 @@ will not in general equal $foo). =item * -If the pattern begins with a C, the resulting string will be -treated as UTF-8-encoded Unicode. You can force UTF-8 encoding on in a -string with an initial C, and the bytes that follow will be -interpreted as Unicode characters. If you don't want this to happen, -you can begin your pattern with C (or anything else) to force Perl -not to UTF-8 encode your string, and then follow this with a C -somewhere in your pattern. +Pack and unpack can operate in two modes, character mode (C mode) where +the packed string is processed per character and UTF-8 mode (C mode) +where the packed string is processed in its UTF-8-encoded Unicode form on +a byte by byte basis. Character mode is the default unless the format string +starts with an C. You can switch mode at any moment with an explicit +C or C in the format. A mode is in effect until the next mode switch +or until the end of the ()-group in which it was entered. =item * You must yourself do any alignment or padding by inserting for example enough C<'x'>es while packing. There is no way to pack() and unpack() -could know where the bytes are going to or coming from. Therefore +could know where the characters are going to or coming from. Therefore C (and C) handle their output and input as flat -sequences of bytes. +sequences of characters. =item * @@ -3681,9 +3685,9 @@ is the string "\0a\0\0bc". C and C accept C modifier. In this case they act as alignment commands: they jump forward/back to the closest position -aligned at a multiple of C bytes. For example, to pack() or +aligned at a multiple of C characters. For example, to pack() or unpack() C's C one may need to -use the template C; this assumes that doubles must be +use the template C; this assumes that doubles must be aligned on the double's size. For alignment commands C of 0 is equivalent to C of 1; @@ -3713,20 +3717,27 @@ to pack() than actually given, extra arguments are ignored. Examples: - $foo = pack("CCCC",65,66,67,68); + $foo = pack("WWWW",65,66,67,68); # foo eq "ABCD" - $foo = pack("C4",65,66,67,68); + $foo = pack("W4",65,66,67,68); # same thing + $foo = pack("W4",0x24b6,0x24b7,0x24b8,0x24b9); + # same thing with Unicode circled letters. $foo = pack("U4",0x24b6,0x24b7,0x24b8,0x24b9); - # same thing with Unicode circled letters + # same thing with Unicode circled letters. You don't get the UTF-8 + # bytes because the U at the start of the format caused a switch to + # U0-mode, so the UTF-8 bytes get joined into characters + $foo = pack("C0U4",0x24b6,0x24b7,0x24b8,0x24b9); + # foo eq "\xe2\x92\xb6\xe2\x92\xb7\xe2\x92\xb8\xe2\x92\xb9" + # This is the UTF-8 encoding of the string in the previous example $foo = pack("ccxxcc",65,66,67,68); # foo eq "AB\0\0CD" - # note: the above examples featuring "C" and "c" are true + # note: the above examples featuring "W" and "c" are true # only on ASCII and ASCII-derived systems such as ISO Latin 1 # and UTF-8. In EBCDIC the first example would be - # $foo = pack("CCCC",193,194,195,196); + # $foo = pack("WWWW",193,194,195,196); $foo = pack("s2",1,2); # "\1\0\2\0" on little-endian @@ -6242,7 +6253,7 @@ If EXPR is omitted, unpacks the C<$_> string. The string is broken into chunks described by the TEMPLATE. Each chunk is converted separately to a value. Typically, either the string is a result -of C, or the bytes of the string represent a C structure of some +of C, or the characters of the string represent a C structure of some kind. The TEMPLATE has the same format as in the C function. @@ -6255,7 +6266,7 @@ Here's a subroutine that does substring: and then there's - sub ordinal { unpack("c",$_[0]); } # same as ord() + sub ordinal { unpack("W",$_[0]); } # same as ord() In addition to fields allowed in pack(), you may prefix a field with a % to indicate that @@ -6269,7 +6280,7 @@ computes the same number as the System V sum program: $checksum = do { local $/; # slurp! - unpack("%32C*",<>) % 65535; + unpack("%32W*",<>) % 65535; }; The following efficiently counts the number of set bits in a bit vector: diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 23bee6e..f1308be 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -571,25 +571,24 @@ that make the distinction. Most operators that deal with positions or lengths in a string will automatically switch to using character positions, including C, C, C, C, C, C, -C, C, and C. Operators that -specifically do not switch include C, C, and -C. Operators that really don't care include -operators that treats strings as a bucket of bits such as C, -and operators dealing with filenames. +C, C, and C. An operator that +specifically does not switch is C. Operators that really don't +care include operators that treat strings as a bucket of bits such as +C, and operators dealing with filenames. =item * -The C/C letters C and C do I change, -since they are often used for byte-oriented formats. Again, think -C in the C language. +The C/C letter C does I change, since it is often +used for byte-oriented formats. Again, think C in the C language. There is a new C specifier that converts between Unicode characters -and code points. +and code points. There is also a C specifier that is the equivalent of +C/C and properly handles character values even if they are above 255. =item * The C and C functions work on characters, similar to -C and C, I C and +C and C, I C and C. C and C are methods for emulating byte-oriented C and C on Unicode strings. While these methods reveal the internal encoding of Unicode strings, diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index 81efd6b..b0d5859 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -247,12 +247,12 @@ constants: you cannot use variables in them. if you want similar run-time functionality, use C and C. If you want to force the result to Unicode characters, use the special -C<"U0"> prefix. It consumes no arguments but forces the result to be -in Unicode characters, instead of bytes. +C<"U0"> prefix. It consumes no arguments but causes the following bytes +to be interpreted as the UTF-8 encoding of Unicode characters: - my $chars = pack("U0C*", 0x80, 0x42); + my $chars = pack("U0W*", 0x80, 0x42); -Likewise, you can force the result to be bytes by using the special +Likewise, you can stop such UTF-8 interpretation by using the special C<"C0"> prefix. =head2 Handling Unicode @@ -452,7 +452,7 @@ displayed as C<\x..>, and the rest of the characters as themselves: chr($_) =~ /[[:cntrl:]]/ ? # else if control character ... sprintf("\\x%02X", $_) : # \x.. quotemeta(chr($_)) # else quoted or as themselves - } unpack("U*", $_[0])); # unpack Unicode characters + } unpack("W*", $_[0])); # unpack Unicode characters } For example, @@ -492,11 +492,12 @@ explicitly-defined I/O layers). But if you must, there are two ways of looking behind the scenes. One way of peeking inside the internal encoding of Unicode characters -is to use C to get the bytes or C -to display the bytes: +is to use C to get the bytes of whatever the string +encoding happens to be, or C to get the bytes of the +UTF-8 encoding: # this prints c4 80 for the UTF-8 bytes 0xc4 0x80 - print join(" ", unpack("H*", pack("U", 0x100))), "\n"; + print join(" ", unpack("U0(H2)*", pack("U", 0x100))), "\n"; Yet another way would be to use the Devel::Peek module: @@ -675,15 +676,17 @@ For example, # invalid } -For UTF-8 only, you can use: +Or use C to try decoding it: use warnings; - @chars = unpack("U0U*", $string_of_bytes_that_I_think_is_utf8); + @chars = unpack("C0U*", $string_of_bytes_that_I_think_is_utf8); If invalid, a C -warning is produced. The "U0" means "expect strictly UTF-8 encoded -Unicode". Without that the C would accept also -data like C), similarly to the C as we saw earlier. +warning is produced. The "C0" means +"process the string character per character". Without that the +C would work in C mode (the default if the format +string starts with C) and it would return the bytes making up the UTF-8 +encoding of the target string, something that will always work. =item * @@ -725,8 +728,8 @@ Back to converting data. If you have (or want) data in your system's native 8-bit encoding (e.g. Latin-1, EBCDIC, etc.), you can use pack/unpack to convert to/from Unicode. - $native_string = pack("C*", unpack("U*", $Unicode_string)); - $Unicode_string = pack("U*", unpack("C*", $native_string)); + $native_string = pack("W*", unpack("U*", $Unicode_string)); + $Unicode_string = pack("U*", unpack("W*", $native_string)); If you have a sequence of bytes you B is valid UTF-8, but Perl doesn't know it yet, you can make Perl a believer, too: @@ -734,6 +737,10 @@ but Perl doesn't know it yet, you can make Perl a believer, too: use Encode 'decode_utf8'; $Unicode = decode_utf8($bytes); +or: + + $Unicode = pack("U0a*", $bytes); + You can convert well-formed UTF-8 to a sequence of bytes, but if you just want to convert random binary data into UTF-8, you can't. B. You can diff --git a/pp_pack.c b/pp_pack.c index d2ae072..c6e2d4e 100644 --- a/pp_pack.c +++ b/pp_pack.c @@ -32,8 +32,12 @@ #include "perl.h" #if PERL_VERSION >= 9 -#define PERL_PACK_CAN_BYTEORDER -#define PERL_PACK_CAN_SHRIEKSIGN +# define PERL_PACK_CAN_BYTEORDER +# define PERL_PACK_CAN_SHRIEKSIGN +#endif + +#ifndef CHAR_BIT +# define CHAR_BIT 8 #endif /* @@ -77,32 +81,52 @@ # define OFF32(p) ((char *) (p)) #endif -#define COPY16(s,p) Copy(s, OFF16(p), SIZE16, char) -#define COPY32(s,p) Copy(s, OFF32(p), SIZE32, char) -#define CAT16(sv,p) sv_catpvn(sv, OFF16(p), SIZE16) -#define CAT32(sv,p) sv_catpvn(sv, OFF32(p), SIZE32) +/* Only to be used inside a loop (see the break) */ +#define SHIFT16(utf8, s, strend, p, datumtype) STMT_START { \ + if (utf8) { \ + if (!uni_to_bytes(aTHX_ &(s), strend, OFF16(p), SIZE16, datumtype)) break; \ + } else { \ + Copy(s, OFF16(p), SIZE16, char); \ + (s) += SIZE16; \ + } \ +} STMT_END + +/* Only to be used inside a loop (see the break) */ +#define SHIFT32(utf8, s, strend, p, datumtype) STMT_START { \ + if (utf8) { \ + if (!uni_to_bytes(aTHX_ &(s), strend, OFF32(p), SIZE32, datumtype)) break; \ + } else { \ + Copy(s, OFF32(p), SIZE32, char); \ + (s) += SIZE32; \ + } \ +} STMT_END + +#define PUSH16(utf8, cur, p) PUSH_BYTES(utf8, cur, OFF16(p), SIZE16) +#define PUSH32(utf8, cur, p) PUSH_BYTES(utf8, cur, OFF32(p), SIZE32) /* Only to be used inside a loop (see the break) */ -#define COPYVAR(s,strend,utf8,var,format) \ +#define SHIFT_VAR(utf8, s, strend, var, datumtype) \ STMT_START { \ if (utf8) { \ - if (!next_uni_bytes(aTHX_ &s, strend, \ - (char *) &var, sizeof(var))) break; \ + if (!uni_to_bytes(aTHX_ &s, strend, \ + (char *) &var, sizeof(var), datumtype)) break;\ } else { \ Copy(s, (char *) &var, sizeof(var), char); \ s += sizeof(var); \ } \ - DO_BO_UNPACK(var, format); \ } STMT_END +#define PUSH_VAR(utf8, aptr, var) \ + PUSH_BYTES(utf8, aptr, (char *) &(var), sizeof(var)) + /* Avoid stack overflow due to pathological templates. 100 should be plenty. */ #define MAX_SUB_TEMPLATE_LEVEL 100 /* flags (note that type modifiers can also be used as flags!) */ -#define FLAG_UNPACK_WAS_UTF8 0x40 /* original had FLAG_UNPACK_DO_UTF8 */ -#define FLAG_UNPACK_PARSE_UTF8 0x20 /* Parse as utf8 */ +#define FLAG_WAS_UTF8 0x40 +#define FLAG_PARSE_UTF8 0x20 /* Parse as utf8 */ #define FLAG_UNPACK_ONLY_ONE 0x10 -#define FLAG_UNPACK_DO_UTF8 0x08 /* The underlying string is utf8 */ +#define FLAG_DO_UTF8 0x08 /* The underlying string is utf8 */ #define FLAG_SLASH 0x04 #define FLAG_COMMA 0x02 #define FLAG_PACK 0x01 @@ -151,6 +175,7 @@ S_mul128(pTHX_ SV *sv, U8 m) #define TYPE_IS_SHRIEKING 0x100 #define TYPE_IS_BIG_ENDIAN 0x200 #define TYPE_IS_LITTLE_ENDIAN 0x400 +#define TYPE_IS_PACK 0x800 #define TYPE_ENDIANNESS_MASK (TYPE_IS_BIG_ENDIAN|TYPE_IS_LITTLE_ENDIAN) #define TYPE_MODIFIERS(t) ((t) & ~0xFF) #define TYPE_NO_MODIFIERS(t) ((t) & 0xFF) @@ -177,7 +202,7 @@ S_mul128(pTHX_ SV *sv, U8 m) # define DO_BO_UNPACK_P(var) # define DO_BO_PACK_P(var) -#else +#else /* PERL_PACK_CAN_BYTEORDER */ # define TYPE_ENDIANNESS(t) ((t) & TYPE_ENDIANNESS_MASK) # define TYPE_NO_ENDIANNESS(t) ((t) & ~TYPE_ENDIANNESS_MASK) @@ -281,10 +306,10 @@ S_mul128(pTHX_ SV *sv, U8 m) # define DO_BO_PACK_N(var, type) BO_CANT_DOIT(pack, type) # endif -#endif +#endif /* PERL_PACK_CAN_BYTEORDER */ #define PACK_SIZE_CANNOT_CSUM 0x80 -#define PACK_SIZE_SPARE 0x40 +#define PACK_SIZE_UNPREDICTABLE 0x40 /* Not a fixed size element */ #define PACK_SIZE_MASK 0x3F @@ -326,9 +351,9 @@ unsigned char size_normal[53] = { 0, /* S */ SIZE16, 0, - /* U */ sizeof(char), + /* U */ sizeof(char) | PACK_SIZE_UNPREDICTABLE, /* V */ SIZE32, - /* W */ sizeof(unsigned char), + /* W */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c */ sizeof(char), /* d */ sizeof(double), @@ -352,7 +377,7 @@ unsigned char size_normal[53] = { /* s */ SIZE16, 0, 0, /* v */ SIZE16, - /* w */ sizeof(char) | PACK_SIZE_CANNOT_CSUM, + /* w */ sizeof(char) | PACK_SIZE_UNPREDICTABLE | PACK_SIZE_CANNOT_CSUM, }; unsigned char size_shrieking[46] = { /* I */ sizeof(unsigned int), @@ -421,7 +446,7 @@ unsigned char size_normal[100] = { /* s */ SIZE16, 0, 0, /* v */ SIZE16, - /* w */ sizeof(char) | PACK_SIZE_CANNOT_CSUM, + /* w */ sizeof(char) | PACK_SIZE_UNPREDICTABLE | PACK_SIZE_CANNOT_CSUM, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C */ sizeof(unsigned char), @@ -449,9 +474,9 @@ unsigned char size_normal[100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, /* S */ SIZE16, 0, - /* U */ sizeof(char), + /* U */ sizeof(char) | PACK_SIZE_UNPREDICTABLE, /* V */ SIZE32, - /* W */ sizeof(unsigned char), + /* W */ sizeof(unsigned char) | PACK_SIZE_UNPREDICTABLE, }; unsigned char size_shrieking[93] = { /* i */ sizeof(int), @@ -498,34 +523,35 @@ struct packsize_t packsize[2] = { #endif STATIC U8 -next_uni_byte(pTHX_ char **s, const char *end, I32 datumtype) +uni_to_byte(pTHX_ char **s, const char *end, I32 datumtype) { UV val; STRLEN retlen; - val = - UNI_TO_NATIVE(utf8n_to_uvuni((U8*)*s, end-*s, &retlen, - ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)); + val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); /* We try to process malformed UTF-8 as much as possible (preferrably with warnings), but these two mean we make no progress in the string and might enter an infinite loop */ if (retlen == (STRLEN) -1 || retlen == 0) - Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); + Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack", + (int) TYPE_NO_MODIFIERS(datumtype)); if (val >= 0x100) { + if (ckWARN(WARN_UNPACK)) Perl_warner(aTHX_ packWARN(WARN_UNPACK), "Character in '%c' format wrapped in unpack", - (int) datumtype); + (int) TYPE_NO_MODIFIERS(datumtype)); val &= 0xff; } *s += retlen; return val; } -#define NEXT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \ - next_uni_byte(aTHX_ &(s), (strend), (datumtype)) : \ +#define SHIFT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \ + uni_to_byte(aTHX_ &(s), (strend), (datumtype)) : \ *(U8 *)(s)++) STATIC bool -next_uni_bytes(pTHX_ char **s, char *end, char *buf, int buf_len) +uni_to_bytes(pTHX_ char **s, char *end, char *buf, int buf_len, I32 datumtype) { UV val; STRLEN retlen; @@ -535,7 +561,7 @@ next_uni_bytes(pTHX_ char **s, char *end, char *buf, int buf_len) UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY); for (;buf_len > 0; buf_len--) { if (from >= end) return FALSE; - val = UNI_TO_NATIVE(utf8n_to_uvuni((U8*)from, end-from, &retlen, flags)); + val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); if (retlen == (STRLEN) -1 || retlen == 0) { from += UTF8SKIP(from); bad |= 1; @@ -554,13 +580,16 @@ next_uni_bytes(pTHX_ char **s, char *end, char *buf, int buf_len) flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) { if (ptr >= end) break; - utf8n_to_uvuni((U8*)ptr, end-ptr, &retlen, flags); + utf8n_to_uvuni((U8 *) ptr, end-ptr, &retlen, flags); } if (from > end) from = end; } if ((bad & 2) && ckWARN(WARN_UNPACK)) - Perl_warner(aTHX_ packWARN(WARN_UNPACK), - "Character(s) wrapped in unpack"); + Perl_warner(aTHX_ packWARN(datumtype & TYPE_IS_PACK ? + WARN_PACK : WARN_UNPACK), + "Character(s) in '%c' format wrapped in %s", + (int) TYPE_NO_MODIFIERS(datumtype), + datumtype & TYPE_IS_PACK ? "pack" : "unpack"); } *s = from; return TRUE; @@ -571,43 +600,118 @@ next_uni_uu(pTHX_ char **s, const char *end, I32 *out) { UV val; STRLEN retlen; - char *from = *s; - val = UNI_TO_NATIVE(utf8n_to_uvuni((U8*)*s, end-*s, &retlen, UTF8_CHECK_ONLY)); + val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, UTF8_CHECK_ONLY); if (val >= 0x100 || !ISUUCHAR(val) || retlen == (STRLEN) -1 || retlen == 0) { *out = 0; return FALSE; } *out = PL_uudmap[val] & 077; - *s = from; + *s += retlen; return TRUE; } +STATIC void +bytes_to_uni(pTHX_ U8 *start, STRLEN len, char **dest) { + U8 buffer[UTF8_MAXLEN]; + U8 *end = start + len; + char *d = *dest; + while (start < end) { + int length = + uvuni_to_utf8_flags(buffer, NATIVE_TO_UNI(*start), 0) - buffer; + switch(length) { + case 1: + *d++ = buffer[0]; + break; + case 2: + *d++ = buffer[0]; + *d++ = buffer[1]; + break; + default: + Perl_croak(aTHX_ "Perl bug: value %d UTF-8 expands to %d bytes", + *start, length); + } + start++; + } + *dest = d; +} + +#define PUSH_BYTES(utf8, cur, buf, len) \ +STMT_START { \ + if (utf8) bytes_to_uni(aTHX_ buf, len, &(cur)); \ + else { \ + Copy(buf, cur, len, char); \ + (cur) += (len); \ + } \ +} STMT_END + +#define GROWING(utf8, cat, start, cur, in_len) \ +STMT_START { \ + STRLEN glen = (in_len); \ + if (utf8) glen *= 2; \ + if ((cur) + glen >= (start) + SvLEN(cat)) { \ + (start) = sv_exp_grow(aTHX_ cat, glen); \ + (cur) = (start) + SvCUR(cat); \ + } \ +} STMT_END + +#define PUSH_GROWING_BYTES(utf8, cat, start, cur, buf, in_len) \ +STMT_START { \ + STRLEN glen = (in_len); \ + STRLEN gl = glen; \ + if (utf8) gl *= 2; \ + if ((cur) + gl >= (start) + SvLEN(cat)) { \ + *cur = '\0'; \ + SvCUR(cat) = (cur) - (start); \ + (start) = sv_exp_grow(aTHX_ cat, gl); \ + (cur) = (start) + SvCUR(cat); \ + } \ + PUSH_BYTES(utf8, cur, buf, glen); \ +} STMT_END + +#define PUSH_BYTE(utf8, s, byte) \ +STMT_START { \ + if (utf8) { \ + U8 au8 = (byte); \ + bytes_to_uni(aTHX_ &au8, 1, &(s)); \ + } else *(U8 *)(s)++ = (byte); \ +} STMT_END + +/* Only to be used inside a loop (see the break) */ +#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \ +STMT_START { \ + STRLEN retlen; \ + if (str >= end) break; \ + val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \ + if (retlen == (STRLEN) -1 || retlen == 0) { \ + *cur = '\0'; \ + Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); \ + } \ + str += retlen; \ +} STMT_END + /* Returns the sizeof() struct described by pat */ STATIC I32 -S_measure_struct(pTHX_ register tempsym_t* symptr) +S_measure_struct(pTHX_ tempsym_t* symptr) { - register I32 len = 0; - register I32 total = 0; - int star; - - register int size; + I32 total = 0; while (next_symbol(symptr)) { - int which = (symptr->code & TYPE_IS_SHRIEKING) - ? PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL; - int offset - = TYPE_NO_MODIFIERS(symptr->code) - packsize[which].first; - - switch( symptr->howlen ){ - case e_no_len: - case e_number: - len = symptr->length; - break; + I32 len; + int star, size; + int which = (symptr->code & TYPE_IS_SHRIEKING) ? + PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL; + int offset = TYPE_NO_MODIFIERS(symptr->code) - packsize[which].first; + + switch (symptr->howlen) { case e_star: Perl_croak(aTHX_ "Within []-length '*' not allowed in %s", symptr->flags & FLAG_PACK ? "pack" : "unpack" ); break; + default: + /* e_no_len and e_number */ + len = symptr->length; + break; } if ((offset >= 0) && (offset < packsize[which].size)) @@ -746,10 +850,11 @@ S_get_num(pTHX_ register char *patptr, I32 *lenptr ) * locates next template code and count */ STATIC bool -S_next_symbol(pTHX_ register tempsym_t* symptr ) +S_next_symbol(pTHX_ tempsym_t* symptr ) { - register char* patptr = symptr->patptr; - register char* patend = symptr->patend; + char* patptr = symptr->patptr; + char* patend = symptr->patend; + const char *allowed = ""; symptr->flags &= ~FLAG_SLASH; @@ -797,7 +902,6 @@ S_next_symbol(pTHX_ register tempsym_t* symptr ) /* look for modifiers */ while (patptr < patend) { - const char *allowed; I32 modifier = 0; switch (*patptr) { case '!': @@ -813,7 +917,7 @@ S_next_symbol(pTHX_ register tempsym_t* symptr ) modifier = TYPE_IS_LITTLE_ENDIAN; allowed = ENDIANNESS_ALLOWED_TYPES; break; -#endif +#endif /* PERL_PACK_CAN_BYTEORDER */ default: break; } @@ -965,23 +1069,23 @@ and ocnt are not used. This call should not be used, use unpackstring instead. =cut */ I32 -Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *strbeg, char *strend, char **new_s, I32 ocnt, U32 flags) +Perl_unpack_str(pTHX_ char *pat, char *patend, char *s, char *strbeg, char *strend, char **new_s, I32 ocnt, U32 flags) { tempsym_t sym = { 0 }; - if (flags & FLAG_UNPACK_DO_UTF8) flags |= FLAG_UNPACK_WAS_UTF8; + if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8; else if (need_utf8(pat, patend)) { /* We probably should try to avoid this in case a scalar context call wouldn't get to the "U0" */ STRLEN len = strend - s; - s = (char*)bytes_to_utf8((U8*)s, &len); + s = (char *) bytes_to_utf8(s, &len); SAVEFREEPV(s); strend = s + len; - flags |= FLAG_UNPACK_DO_UTF8; + flags |= FLAG_DO_UTF8; } - if (first_symbol(pat, patend) != 'U' && (flags & FLAG_UNPACK_DO_UTF8)) - flags |= FLAG_UNPACK_PARSE_UTF8; + if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8)) + flags |= FLAG_PARSE_UTF8; sym.patptr = pat; sym.patend = patend; @@ -1000,23 +1104,23 @@ Issue C before and C after the call to this function. =cut */ I32 -Perl_unpackstring(pTHX_ char *pat, register char *patend, register char *s, char *strend, U32 flags) +Perl_unpackstring(pTHX_ char *pat, char *patend, char *s, char *strend, U32 flags) { tempsym_t sym = { 0 }; - if (flags & FLAG_UNPACK_DO_UTF8) flags |= FLAG_UNPACK_WAS_UTF8; + if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8; else if (need_utf8(pat, patend)) { /* We probably should try to avoid this in case a scalar context call wouldn't get to the "U0" */ STRLEN len = strend - s; - s = (char*)bytes_to_utf8((U8*)s, &len); + s = (char *) bytes_to_utf8(s, &len); SAVEFREEPV(s); strend = s + len; - flags |= FLAG_UNPACK_DO_UTF8; + flags |= FLAG_DO_UTF8; } - if (first_symbol(pat, patend) != 'U' && (flags & FLAG_UNPACK_DO_UTF8)) - flags |= FLAG_UNPACK_PARSE_UTF8; + if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8)) + flags |= FLAG_PARSE_UTF8; sym.patptr = pat; sym.patend = patend; @@ -1030,8 +1134,6 @@ I32 S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char **new_s ) { dSP; - I32 datumtype, ai32; - I32 len = 0; SV *sv; I32 start_sp_offset = SP - PL_stack_base; howlen_t howlen; @@ -1039,15 +1141,16 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char I32 checksum = 0; UV cuv = 0; NV cdouble = 0.0; - const int bits_in_uv = 8 * sizeof(cuv); + const int bits_in_uv = CHAR_BIT * sizeof(cuv); char* strrelbeg = s; bool beyond = FALSE; bool explicit_length; bool unpack_only_one = (symptr->flags & FLAG_UNPACK_ONLY_ONE) != 0; - bool utf8 = (symptr->flags & FLAG_UNPACK_PARSE_UTF8) ? 1 : 0; + bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; while (next_symbol(symptr)) { - datumtype = symptr->code; + I32 len, ai32; + I32 datumtype = symptr->code; /* do first one only unless in list context / is implemented by unpacking the count, then popping it from the stack, so must check that we're not in the middle of a / */ @@ -1056,28 +1159,29 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char && (datumtype != '/') ) /* XXX can this be omitted */ break; - switch( howlen = symptr->howlen ){ - case e_no_len: - case e_number: - len = symptr->length; - break; + switch (howlen = symptr->howlen) { case e_star: len = strend - strbeg; /* long enough */ break; + default: + /* e_no_len and e_number */ + len = symptr->length; + break; } explicit_length = TRUE; redo_switch: beyond = s >= strend; { - int which = (symptr->code & TYPE_IS_SHRIEKING) - ? PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL; + struct packsize_t *pack_props = + &packsize[(symptr->code & TYPE_IS_SHRIEKING) ? + PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL]; const int rawtype = TYPE_NO_MODIFIERS(datumtype); - int offset = rawtype - packsize[which].first; + int offset = rawtype - pack_props->first; - if (offset >= 0 && offset < packsize[which].size) { + if (offset >= 0 && offset < pack_props->size) { /* Data about this template letter */ - unsigned char data = packsize[which].array[offset]; + unsigned char data = pack_props->array[offset]; if (data) { /* data nonzero means we can process this letter. */ @@ -1116,8 +1220,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char PUTBACK; while (len--) { symptr->patptr = savsym.grpbeg; - if (utf8) symptr->flags |= FLAG_UNPACK_PARSE_UTF8; - else symptr->flags &= ~FLAG_UNPACK_PARSE_UTF8; + if (utf8) symptr->flags |= FLAG_PARSE_UTF8; + else symptr->flags &= ~FLAG_PARSE_UTF8; unpack_rec(symptr, s, strbeg, strend, &s); if (s == strend && savsym.howlen == e_star) break; /* No way to continue */ @@ -1150,22 +1254,28 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char len = 1; if (utf8) { char *hop, *last; - I32 l; - for (l=len, hop = strbeg; hop < s; l++, hop += UTF8SKIP(hop)) - if (l == len) { + I32 l = len; + hop = last = strbeg; + while (hop < s) { + hop += UTF8SKIP(hop); + if (--l == 0) { last = hop; - l = 0; + l = len; + } } + if (last > s) + Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); s = last; break; - } else len = (s - strbeg) % len; + } + len = (s - strbeg) % len; /* FALL THROUGH */ case 'X': if (utf8) { while (len > 0) { if (s <= strbeg) Perl_croak(aTHX_ "'X' outside of string in unpack"); - while (UTF8_IS_CONTINUATION(*--s)) { + while (--s, UTF8_IS_CONTINUATION(*s)) { if (s <= strbeg) Perl_croak(aTHX_ "'X' outside of string in unpack"); } @@ -1180,14 +1290,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'x' | TYPE_IS_SHRIEKING: if (!len) /* Avoid division by 0 */ len = 1; - if (utf8) { - char *hop = strbeg; - I32 l = 0; - for (hop = strbeg; hop < s; hop += UTF8SKIP(hop)) l++; - if (s != hop) - Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); - ai32 = l % len; - } else ai32 = (s - strbeg) % len; + if (utf8) ai32 = utf8_length(strbeg, s) % len; + else ai32 = (s - strbeg) % len; if (ai32 == 0) break; len -= ai32; /* FALL THROUGH */ @@ -1203,7 +1307,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (len > strend - s) Perl_croak(aTHX_ "'x' outside of string in unpack"); s += len; - }; + } break; case '/': Perl_croak(aTHX_ "'/' must follow a numeric type in unpack"); @@ -1234,8 +1338,9 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (datumtype == 'Z') { /* 'Z' strips stuff after first null */ - char *ptr; - for (ptr = s; ptr < strend; ptr++) if (*ptr == 0) break; + char *ptr, *end; + end = s + len; + for (ptr = s; ptr < end; ptr++) if (*ptr == 0) break; sv = newSVpvn(s, ptr-s); if (howlen == e_star) /* exact for 'Z*' */ len = ptr-s + (ptr != strend ? 1 : 0); @@ -1251,7 +1356,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (utf8) { SvUTF8_on(sv); /* Undo any upgrade done due to need_utf8() */ - if (!(symptr->flags & FLAG_UNPACK_WAS_UTF8)) + if (!(symptr->flags & FLAG_WAS_UTF8)) sv_utf8_downgrade(sv, 0); } XPUSHs(sv_2mortal(sv)); @@ -1277,32 +1382,30 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (bits & 128) PL_bitcount[bits]++; } } - if (utf8) { + if (utf8) while (len >= 8 && s < strend) { - cuv += PL_bitcount[next_uni_byte(aTHX_ &s, strend, datumtype)]; + cuv += PL_bitcount[uni_to_byte(aTHX_ &s, strend, datumtype)]; len -= 8; } - } else { + else while (len >= 8) { cuv += PL_bitcount[*(U8 *)s++]; len -= 8; } - } if (len && s < strend) { U8 bits; - bits = NEXT_BYTE(utf8, s, strend, datumtype); - if (datumtype == 'b') { + bits = SHIFT_BYTE(utf8, s, strend, datumtype); + if (datumtype == 'b') while (len-- > 0) { if (bits & 1) cuv++; bits >>= 1; } - } else { + else while (len-- > 0) { if (bits & 0x80) cuv++; bits <<= 1; } } - } break; } @@ -1310,24 +1413,24 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char SvPOK_on(sv); str = SvPVX(sv); if (datumtype == 'b') { - U8 bits; + U8 bits = 0; ai32 = len; for (len = 0; len < ai32; len++) { if (len & 7) bits >>= 1; else if (utf8) { if (s >= strend) break; - bits = next_uni_byte(aTHX_ &s, strend, datumtype); + bits = uni_to_byte(aTHX_ &s, strend, datumtype); } else bits = *(U8 *) s++; *str++ = bits & 1 ? '1' : '0'; } } else { - U8 bits; + U8 bits = 0; ai32 = len; for (len = 0; len < ai32; len++) { if (len & 7) bits <<= 1; else if (utf8) { if (s >= strend) break; - bits = next_uni_byte(aTHX_ &s, strend, datumtype); + bits = uni_to_byte(aTHX_ &s, strend, datumtype); } else bits = *(U8 *) s++; *str++ = bits & 0x80 ? '1' : '0'; } @@ -1347,24 +1450,24 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char SvPOK_on(sv); str = SvPVX(sv); if (datumtype == 'h') { - U8 bits; + U8 bits = 0; ai32 = len; for (len = 0; len < ai32; len++) { if (len & 1) bits >>= 4; else if (utf8) { if (s >= strend) break; - bits = next_uni_byte(aTHX_ &s, strend, datumtype); + bits = uni_to_byte(aTHX_ &s, strend, datumtype); } else bits = * (U8 *) s++; *str++ = PL_hexdigit[bits & 15]; } } else { - U8 bits; + U8 bits = 0; ai32 = len; for (len = 0; len < ai32; len++) { if (len & 1) bits <<= 4; else if (utf8) { if (s >= strend) break; - bits = next_uni_byte(aTHX_ &s, strend, datumtype); + bits = uni_to_byte(aTHX_ &s, strend, datumtype); } else bits = *(U8 *) s++; *str++ = PL_hexdigit[(bits >> 4) & 15]; } @@ -1376,7 +1479,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char } case 'c': while (len-- > 0) { - int aint = NEXT_BYTE(utf8, s, strend, datumtype); + int aint = SHIFT_BYTE(utf8, s, strend, datumtype); if (aint >= 128) /* fake up signed chars */ aint -= 256; if (!checksum) @@ -1393,18 +1496,17 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (len == 0) { if (explicit_length && datumtype == 'C') /* Switch to "character" mode */ - utf8 = (symptr->flags & FLAG_UNPACK_DO_UTF8) ? 1 : 0; + utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; break; } if (datumtype == 'C' ? - (symptr->flags & FLAG_UNPACK_DO_UTF8) && - !(symptr->flags & FLAG_UNPACK_WAS_UTF8) : utf8) { + (symptr->flags & FLAG_DO_UTF8) && + !(symptr->flags & FLAG_WAS_UTF8) : utf8) { while (len-- > 0 && s < strend) { UV val; STRLEN retlen; - val = - UNI_TO_NATIVE(utf8n_to_uvuni((U8*)s, strend-s, &retlen, - ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)); + val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); if (retlen == (STRLEN) -1 || retlen == 0) Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); s += retlen; @@ -1429,7 +1531,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (len == 0) { if (explicit_length) { /* Switch to "bytes in UTF-8" mode */ - if (symptr->flags & FLAG_UNPACK_DO_UTF8) utf8 = 0; + if (symptr->flags & FLAG_DO_UTF8) utf8 = 0; else /* Should be impossible due to the need_utf8() test */ Perl_croak(aTHX_ "U0 mode on a byte string"); @@ -1452,11 +1554,11 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char ptr = s; /* Bug: warns about bad utf8 even if we are short on bytes and will break out of the loop */ - if (!next_uni_bytes(aTHX_ &ptr, strend, (char*)result, 1)) + if (!uni_to_bytes(aTHX_ &ptr, strend, result, 1, 'U')) break; len = UTF8SKIP(result); - if (!next_uni_bytes(aTHX_ &ptr, strend, (char*)&result[1], len-1)) - break; + if (!uni_to_bytes(aTHX_ &ptr, strend, + &result[1], len-1, 'U')) break; auv = utf8n_to_uvuni(result, len, &retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV); s = ptr; } else { @@ -1477,7 +1579,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if SHORTSIZE != SIZE16 while (len-- > 0) { short ashort; - COPYVAR(s, strend, utf8, ashort, s); + SHIFT_VAR(utf8, s, strend, ashort, datumtype); + DO_BO_UNPACK(ashort, s); if (!checksum) PUSHs(sv_2mortal(newSViv((IV)ashort))); else if (checksum > bits_in_uv) @@ -1496,13 +1599,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if U16SIZE > SIZE16 ai16 = 0; #endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF16(&ai16), SIZE16)) break; - } else { - COPY16(s, &ai16); - s += SIZE16; - } + SHIFT16(utf8, s, strend, &ai16, datumtype); DO_BO_UNPACK(ai16, 16); #if U16SIZE > SIZE16 if (ai16 > 32767) @@ -1520,7 +1617,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if SHORTSIZE != SIZE16 while (len-- > 0) { unsigned short aushort; - COPYVAR(s, strend, utf8, aushort, s); + SHIFT_VAR(utf8, s, strend, aushort, datumtype); + DO_BO_UNPACK(aushort, s); if (!checksum) PUSHs(sv_2mortal(newSVuv((UV) aushort))); else if (checksum > bits_in_uv) @@ -1540,13 +1638,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if U16SIZE > SIZE16 au16 = 0; #endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF16(&au16), SIZE16)) break; - } else { - COPY16(s, &au16); - s += SIZE16; - } + SHIFT16(utf8, s, strend, &au16, datumtype); DO_BO_UNPACK(au16, 16); #ifdef HAS_NTOHS if (datumtype == 'n') @@ -1559,7 +1651,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char if (!checksum) PUSHs(sv_2mortal(newSVuv((UV)au16))); else if (checksum > bits_in_uv) - cdouble += (NV)au16; + cdouble += (NV) au16; else cuv += au16; } @@ -1572,13 +1664,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char # if U16SIZE > SIZE16 ai16 = 0; # endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - (char *) &ai16, sizeof(ai16))) break; - } else { - COPY16(s, &ai16); - s += SIZE16; - } + SHIFT16(utf8, s, strend, &ai16, datumtype); # ifdef HAS_NTOHS if (datumtype == ('n' | TYPE_IS_SHRIEKING)) ai16 = (I16) PerlSock_ntohs((U16) ai16); @@ -1600,7 +1686,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'i' | TYPE_IS_SHRIEKING: while (len-- > 0) { int aint; - COPYVAR(s, strend, utf8, aint, i); + SHIFT_VAR(utf8, s, strend, aint, datumtype); + DO_BO_UNPACK(aint, i); if (!checksum) PUSHs(sv_2mortal(newSViv((IV)aint))); else if (checksum > bits_in_uv) @@ -1613,7 +1700,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'I' | TYPE_IS_SHRIEKING: while (len-- > 0) { unsigned int auint; - COPYVAR(s, strend, utf8, auint, i); + SHIFT_VAR(utf8, s, strend, auint, datumtype); + DO_BO_UNPACK(auint, i); if (!checksum) PUSHs(sv_2mortal(newSVuv((UV)auint))); else if (checksum > bits_in_uv) @@ -1625,12 +1713,13 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'j': while (len-- > 0) { IV aiv; + SHIFT_VAR(utf8, s, strend, aiv, datumtype); #if IVSIZE == INTSIZE - COPYVAR(s, strend, utf8, aiv, i); + DO_BO_UNPACK(aiv, i); #elif IVSIZE == LONGSIZE - COPYVAR(s, strend, utf8, aiv, l); + DO_BO_UNPACK(aiv, l); #elif defined(HAS_QUAD) && IVSIZE == U64SIZE - COPYVAR(s, strend, utf8, aiv, 64); + DO_BO_UNPACK(aiv, 64); #else Perl_croak(aTHX_ "'j' not supported on this platform"); #endif @@ -1645,12 +1734,13 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'J': while (len-- > 0) { UV auv; + SHIFT_VAR(utf8, s, strend, auv, datumtype); #if IVSIZE == INTSIZE - COPYVAR(s, strend, utf8, auv, i); + DO_BO_UNPACK(auv, i); #elif IVSIZE == LONGSIZE - COPYVAR(s, strend, utf8, auv, l); + DO_BO_UNPACK(auv, l); #elif defined(HAS_QUAD) && IVSIZE == U64SIZE - COPYVAR(s, strend, utf8, auv, 64); + DO_BO_UNPACK(auv, 64); #else Perl_croak(aTHX_ "'J' not supported on this platform"); #endif @@ -1666,7 +1756,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if LONGSIZE != SIZE32 while (len-- > 0) { long along; - COPYVAR(s, strend, utf8, along, l); + SHIFT_VAR(utf8, s, strend, along, datumtype); + DO_BO_UNPACK(along, l); if (!checksum) PUSHs(sv_2mortal(newSViv((IV)along))); else if (checksum > bits_in_uv) @@ -1684,13 +1775,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if U32SIZE > SIZE32 ai32 = 0; #endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF32(&ai32), SIZE32)) break; - } else { - COPY32(s, &ai32); - s += SIZE32; - } + SHIFT32(utf8, s, strend, &ai32, datumtype); DO_BO_UNPACK(ai32, 32); #if U32SIZE > SIZE32 if (ai32 > 2147483647) ai32 -= 4294967296; @@ -1707,7 +1792,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if LONGSIZE != SIZE32 while (len-- > 0) { unsigned long aulong; - COPYVAR(s, strend, utf8, aulong, l); + SHIFT_VAR(utf8, s, strend, aulong, datumtype); + DO_BO_UNPACK(aulong, l); if (!checksum) PUSHs(sv_2mortal(newSVuv((UV)aulong))); else if (checksum > bits_in_uv) @@ -1727,13 +1813,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char #if U32SIZE > SIZE32 au32 = 0; #endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF32(&au32), SIZE32)) break; - } else { - COPY32(s, &au32); - s += SIZE32; - } + SHIFT32(utf8, s, strend, &au32, datumtype); DO_BO_UNPACK(au32, 32); #ifdef HAS_NTOHL if (datumtype == 'N') @@ -1759,13 +1839,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char # if U32SIZE > SIZE32 ai32 = 0; # endif - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - OFF32(&ai32), SIZE32)) break; - } else { - COPY32(s, &ai32); - s += SIZE32; - } + SHIFT32(utf8, s, strend, &ai32, datumtype); # ifdef HAS_NTOHL if (datumtype == ('N' | TYPE_IS_SHRIEKING)) ai32 = (I32)PerlSock_ntohl((U32)ai32); @@ -1786,13 +1860,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'p': while (len-- > 0) { char *aptr; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - (char *) &aptr, sizeof(aptr))) break; - } else { - Copy(s, &aptr, 1, char*); - s += sizeof(aptr); - } + SHIFT_VAR(utf8, s, strend, aptr, datumtype); DO_BO_UNPACK_P(aptr); /* newSVpv generates undef if aptr is NULL */ PUSHs(sv_2mortal(newSVpv(aptr, 0))); @@ -1805,7 +1873,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char while (len > 0 && s < strend) { U8 ch; - ch = NEXT_BYTE(utf8, s, strend, 'w'); + ch = SHIFT_BYTE(utf8, s, strend, datumtype); auv = (auv << 7) | (ch & 0x7f); /* UTF8_IS_XXXXX not right here - using constant 0x80 */ if (ch < 0x80) { @@ -1821,7 +1889,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char sv = Perl_newSVpvf(aTHX_ "%.*"UVf, (int)TYPE_DIGITS(UV), auv); while (s < strend) { - ch = NEXT_BYTE(utf8, s, strend, 'w'); + ch = SHIFT_BYTE(utf8, s, strend, datumtype); sv = mul128(sv, (U8)(ch & 0x7f)); if (!(ch & 0x80)) { bytes = 0; @@ -1847,13 +1915,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char EXTEND(SP, 1); if (sizeof(char*) <= strend - s) { char *aptr; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, (char *) &aptr, - sizeof(aptr))) break; - } else { - Copy(s, &aptr, 1, char*); - s += sizeof(aptr); - } + SHIFT_VAR(utf8, s, strend, aptr, datumtype); DO_BO_UNPACK_P(aptr); /* newSVpvn generates undef if aptr is NULL */ PUSHs(sv_2mortal(newSVpvn(aptr, len))); @@ -1863,7 +1925,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'q': while (len-- > 0) { Quad_t aquad; - COPYVAR(s, strend, utf8, aquad, 64); + SHIFT_VAR(utf8, s, strend, aquad, datumtype); + DO_BO_UNPACK(aquad, 64); if (!checksum) PUSHs(sv_2mortal(aquad >= IV_MIN && aquad <= IV_MAX ? newSViv((IV)aquad) : newSVnv((NV)aquad))); @@ -1876,7 +1939,8 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'Q': while (len-- > 0) { Uquad_t auquad; - COPYVAR(s, strend, utf8, auquad, 64); + SHIFT_VAR(utf8, s, strend, auquad, datumtype); + DO_BO_UNPACK(auquad, 64); if (!checksum) PUSHs(sv_2mortal(auquad <= UV_MAX ? newSVuv((UV)auquad):newSVnv((NV)auquad))); @@ -1891,13 +1955,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'f': while (len-- > 0) { float afloat; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, (char *) &afloat, - sizeof(afloat))) break; - } else { - Copy(s, &afloat, 1, float); - s += sizeof(float); - } + SHIFT_VAR(utf8, s, strend, afloat, datumtype); DO_BO_UNPACK_N(afloat, float); if (!checksum) PUSHs(sv_2mortal(newSVnv((NV)afloat))); @@ -1908,13 +1966,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'd': while (len-- > 0) { double adouble; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, (char *) &adouble, - sizeof(adouble))) break; - } else { - Copy(s, &adouble, 1, double); - s += sizeof(double); - } + SHIFT_VAR(utf8, s, strend, adouble, datumtype); DO_BO_UNPACK_N(adouble, double); if (!checksum) PUSHs(sv_2mortal(newSVnv((NV)adouble))); @@ -1925,13 +1977,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'F': while (len-- > 0) { NV anv; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, - (char *) &anv, sizeof(anv))) break; - } else { - Copy(s, &anv, 1, NV); - s += NVSIZE; - } + SHIFT_VAR(utf8, s, strend, anv, datumtype); DO_BO_UNPACK_N(anv, NV); if (!checksum) PUSHs(sv_2mortal(newSVnv(anv))); @@ -1943,13 +1989,7 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, char *s, char *strbeg, char *strend, char case 'D': while (len-- > 0) { long double aldouble; - if (utf8) { - if (!next_uni_bytes(aTHX_ &s, strend, (char *) &aldouble, - sizeof(aldouble))) break; - } else { - Copy(s, &aldouble, 1, long double); - s += LONG_DOUBLESIZE; - } + SHIFT_VAR(utf8, s, strend, aldouble, datumtype); DO_BO_UNPACK_N(aldouble, long double); if (!checksum) PUSHs(sv_2mortal(newSVnv((NV)aldouble))); @@ -2120,7 +2160,7 @@ PP(pp_unpack) PUTBACK; cnt = unpackstring(pat, patend, s, strend, ((gimme == G_SCALAR) ? FLAG_UNPACK_ONLY_ONE : 0) - | (DO_UTF8(right) ? FLAG_UNPACK_DO_UTF8 : 0)); + | (DO_UTF8(right) ? FLAG_DO_UTF8 : 0)); SPAGAIN; if ( !cnt && gimme == G_SCALAR ) @@ -2128,32 +2168,27 @@ PP(pp_unpack) RETURN; } -STATIC void -S_doencodes(pTHX_ register SV *sv, register char *s, register I32 len) +STATIC U8 * +doencodes(U8 *h, char *s, I32 len) { - char hunk[5]; - - *hunk = PL_uuemap[len]; - sv_catpvn(sv, hunk, 1); - hunk[4] = '\0'; + *h++ = PL_uuemap[len]; while (len > 2) { - hunk[0] = PL_uuemap[(077 & (*s >> 2))]; - hunk[1] = PL_uuemap[(077 & (((*s << 4) & 060) | ((s[1] >> 4) & 017)))]; - hunk[2] = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))]; - hunk[3] = PL_uuemap[(077 & (s[2] & 077))]; - sv_catpvn(sv, hunk, 4); + *h++ = PL_uuemap[(077 & (s[0] >> 2))]; + *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((s[1] >> 4) & 017)))]; + *h++ = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))]; + *h++ = PL_uuemap[(077 & (s[2] & 077))]; s += 3; len -= 3; } if (len > 0) { char r = (len > 1 ? s[1] : '\0'); - hunk[0] = PL_uuemap[(077 & (*s >> 2))]; - hunk[1] = PL_uuemap[(077 & (((*s << 4) & 060) | ((r >> 4) & 017)))]; - hunk[2] = PL_uuemap[(077 & ((r << 2) & 074))]; - hunk[3] = PL_uuemap[0]; - sv_catpvn(sv, hunk, 4); + *h++ = PL_uuemap[(077 & (s[0] >> 2))]; + *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((r >> 4) & 017)))]; + *h++ = PL_uuemap[(077 & ((r << 2) & 074))]; + *h++ = PL_uuemap[0]; } - sv_catpvn(sv, "\n", 1); + *h++ = '\n'; + return h; } STATIC SV * @@ -2266,449 +2301,757 @@ The engine implementing pack() Perl function. void Perl_packlist(pTHX_ SV *cat, char *pat, register char *patend, register SV **beglist, SV **endlist ) { + STRLEN no_len; tempsym_t sym = { 0 }; + sym.patptr = pat; sym.patend = patend; sym.flags = FLAG_PACK; + /* We're going to do changes through SvPVX(cat). Make sure it's valid. + Also make sure any UTF8 flag is loaded */ + SvPV_force(cat, no_len); + if (DO_UTF8(cat)) sym.flags |= FLAG_PARSE_UTF8 | FLAG_DO_UTF8; + (void)pack_rec( cat, &sym, beglist, endlist ); } +/* like sv_utf8_upgrade, but also repoint the group start markers */ +STATIC void +marked_upgrade(pTHX_ SV *sv, tempsym_t *sym_ptr) { + STRLEN len; + tempsym_t *group; + char *from_ptr, *to_start, *to_ptr, **marks, **m, *from_start, *from_end; + + if (SvUTF8(sv)) return; + + from_start = SvPVX(sv); + from_end = from_start + SvCUR(sv); + for (from_ptr = from_start; from_ptr < from_end; from_ptr++) + if (!NATIVE_IS_INVARIANT(*from_ptr)) break; + if (from_ptr == from_end) { + /* Simple case: no character needs to be changed */ + SvUTF8_on(sv); + return; + } + + /* We assume a char translates to at most 2 UTF-8 bytes */ + len = (from_end-from_ptr)*2+(from_ptr-from_start)+1; + New('U', to_start, len, char); + Copy(from_start, to_start, from_ptr-from_start, char); + to_ptr = to_start + (from_ptr-from_start); + + New('U', marks, sym_ptr->level+2, char *); + for (group=sym_ptr; group; group = group->previous) + marks[group->level] = from_start + group->strbeg; + marks[sym_ptr->level+1] = from_end+1; + for (m = marks; *m < from_ptr; m++) + *m = to_start + (*m-from_start); + + for (;from_ptr < from_end; from_ptr++) { + while (*m == from_ptr) *m++ = to_ptr; + to_ptr = uvchr_to_utf8(to_ptr, *(U8 *) from_ptr); + } + *to_ptr = 0; + + while (*m == from_ptr) *m++ = to_ptr; + if (m != marks + sym_ptr->level+1) { + Safefree(marks); + Safefree(to_start); + Perl_croak(aTHX_ "Assertion: marks beyond string end"); + } + for (group=sym_ptr; group; group = group->previous) + group->strbeg = marks[group->level] - to_start; + Safefree(marks); + + if (SvOOK(sv)) { + if (SvIVX(sv)) { + SvLEN(sv) += SvIVX(sv); + from_start -= SvIVX(sv); + SvIV_set(sv, 0); + } + SvFLAGS(sv) &= ~SVf_OOK; + } + if (SvLEN(sv) != 0) + Safefree(from_start); + SvPVX(sv) = to_start; + SvCUR(sv) = to_ptr - to_start; + SvLEN(sv) = len; + SvUTF8_on(sv); +} + +/* Exponential string grower. Makes string extension effectively O(n) + needed says how many extra bytes we need (not counting the final '\0') + Only grows the string if there is an actual lack of space +*/ +STATIC char * +sv_exp_grow(pTHX_ SV *sv, STRLEN needed) { + STRLEN cur = SvCUR(sv); + STRLEN len = SvLEN(sv); + STRLEN extend; + if (len - cur > needed) return SvPVX(sv); + extend = needed > len ? needed : len; + return SvGROW(sv, len+extend+1); +} STATIC SV ** -S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV **endlist ) +S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) { - register I32 items; - STRLEN fromlen; - register I32 len = 0; - SV *fromstr; - /*SUPPRESS 442*/ - static char null10[] = {0,0,0,0,0,0,0,0,0,0}; - static char *space10 = " "; - bool found; - - /* These must not be in registers: */ - char achar; - I16 ai16; - U16 au16; - I32 ai32; - U32 au32; -#ifdef HAS_QUAD - Quad_t aquad; - Uquad_t auquad; -#endif -#if SHORTSIZE != SIZE16 - short ashort; - unsigned short aushort; -#endif - int aint; - unsigned int auint; -#if LONGSIZE != SIZE32 - long along; - unsigned long aulong; -#endif - char *aptr; - float afloat; - double adouble; -#if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) - long double aldouble; -#endif - IV aiv; - UV auv; - NV anv; - - int strrelbeg = SvCUR(cat); tempsym_t lookahead; - - items = endlist - beglist; - found = next_symbol( symptr ); - -#ifndef PACKED_IS_OCTETS - if (symptr->level == 0 && found && symptr->code == 'U' ){ - SvUTF8_on(cat); + I32 items = endlist - beglist; + bool found = next_symbol(symptr); + bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; + + if (symptr->level == 0 && found && symptr->code == 'U') { + marked_upgrade(aTHX_ cat, symptr); + symptr->flags |= FLAG_DO_UTF8; + utf8 = 0; } -#endif + symptr->strbeg = SvCUR(cat); while (found) { + SV *fromstr; + STRLEN fromlen; + I32 len; SV *lengthcode = Nullsv; -#define NEXTFROM ( lengthcode ? lengthcode : items-- > 0 ? *beglist++ : &PL_sv_no) - I32 datumtype = symptr->code; - howlen_t howlen; + howlen_t howlen = symptr->howlen; + char *start = SvPVX(cat); + char *cur = start + SvCUR(cat); - switch( howlen = symptr->howlen ){ - case e_no_len: - case e_number: - len = symptr->length; - break; +#define NEXTFROM (lengthcode ? lengthcode : items-- > 0 ? *beglist++ : &PL_sv_no) + + switch (howlen) { case e_star: - len = strchr("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ? 0 : items; + len = strchr("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ? + 0 : items; + break; + default: + /* e_no_len and e_number */ + len = symptr->length; break; } + if (len) { + struct packsize_t *pack_props = + &packsize[(symptr->code & TYPE_IS_SHRIEKING) ? + PACK_SIZE_SHRIEKING : PACK_SIZE_NORMAL]; + const int rawtype = TYPE_NO_MODIFIERS(datumtype); + int offset = rawtype - pack_props->first; + + if (offset >= 0 && offset < pack_props->size) { + /* Data about this template letter */ + unsigned char data = pack_props->array[offset]; + + if (data && !(data & PACK_SIZE_UNPREDICTABLE)) { + /* We can process this letter. */ + STRLEN size = data & PACK_SIZE_MASK; + GROWING(utf8, cat, start, cur, (STRLEN) len * size); + } + } + + } + /* Look ahead for next symbol. Do we have code/code? */ lookahead = *symptr; found = next_symbol(&lookahead); if ( symptr->flags & FLAG_SLASH ) { - if (found){ + if (!found) Perl_croak(aTHX_ "Code missing after '/' in pack"); if ( 0 == strchr( "aAZ", lookahead.code ) || e_star != lookahead.howlen ) Perl_croak(aTHX_ "'/' must be followed by 'a*', 'A*' or 'Z*' in pack"); - lengthcode = sv_2mortal(newSViv(sv_len(items > 0 - ? *beglist : &PL_sv_no) - + (lookahead.code == 'Z' ? 1 : 0))); - } else { - Perl_croak(aTHX_ "Code missing after '/' in pack"); - } + lengthcode = + sv_2mortal(newSViv((items > 0 ? DO_UTF8(*beglist) ? sv_len_utf8(*beglist) : sv_len(*beglist) : 0) + (lookahead.code == 'Z' ? 1 : 0))); } + /* Code inside the switch must take care to properly update + cat (CUR length and '\0' termination) if it updated *cur and + doesn't simply leave using break */ switch(TYPE_NO_ENDIANNESS(datumtype)) { default: - Perl_croak(aTHX_ "Invalid type '%c' in pack", (int)TYPE_NO_MODIFIERS(datumtype)); + Perl_croak(aTHX_ "Invalid type '%c' in pack", + (int) TYPE_NO_MODIFIERS(datumtype)); case '%': Perl_croak(aTHX_ "'%%' may not be used in pack"); case '@': - len += strrelbeg - SvCUR(cat); - if (len > 0) - goto grow; + if (utf8) { + char *s = start + symptr->strbeg; + while (len > 0 && s < cur) { + s += UTF8SKIP(s); + len--; + } + if (s > cur) + Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); + if (len > 0) { + grow: + GROWING(0, cat, start, cur, len); + Zero(cur, len, char); + cur += len; + } else if (s < cur) cur = s; + else goto no_change; + } else { + len -= cur - (start+symptr->strbeg); + if (len > 0) goto grow; len = -len; - if (len > 0) - goto shrink; + if (len > 0) goto shrink; + else goto no_change; + } break; - case '(': - { + case '(': { tempsym_t savsym = *symptr; U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags); symptr->flags |= group_modifiers; symptr->patend = savsym.grpend; symptr->level++; + symptr->previous = &lookahead; while (len--) { + U32 was_utf8; + if (utf8) symptr->flags |= FLAG_PARSE_UTF8; + else symptr->flags &= ~FLAG_PARSE_UTF8; + was_utf8 = SvUTF8(cat); symptr->patptr = savsym.grpbeg; - beglist = pack_rec(cat, symptr, beglist, endlist ); + beglist = pack_rec(cat, symptr, beglist, endlist); + if (SvUTF8(cat) != was_utf8) + /* This had better be an upgrade while in utf8==0 mode */ + utf8 = 1; + if (savsym.howlen == e_star && beglist == endlist) break; /* No way to continue */ } - symptr->flags &= ~group_modifiers; - lookahead.flags = symptr->flags; - *symptr = savsym; - break; + lookahead.flags = symptr->flags & ~group_modifiers; + goto no_change; } case 'X' | TYPE_IS_SHRIEKING: if (!len) /* Avoid division by 0 */ len = 1; - len = (SvCUR(cat)) % len; + if (utf8) { + char *hop, *last; + I32 l = len; + hop = last = start; + while (hop < cur) { + hop += UTF8SKIP(hop); + if (--l == 0) { + last = hop; + l = len; + } + } + if (last > cur) + Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); + cur = last; + break; + } + len = (cur-start) % len; /* FALL THROUGH */ case 'X': + if (utf8) { + if (len < 1) goto no_change; + while (len > 0) { + if (cur <= start) + Perl_croak(aTHX_ "'X' outside of string in pack"); + while (--cur, UTF8_IS_CONTINUATION(*cur)) { + if (cur <= start) + Perl_croak(aTHX_ "'X' outside of string in pack"); + } + len--; + } + } else { shrink: - if ((I32)SvCUR(cat) < len) + if (cur - start < len) Perl_croak(aTHX_ "'X' outside of string in pack"); - SvCUR(cat) -= len; - *SvEND(cat) = '\0'; + cur -= len; + } + if (cur < start+symptr->strbeg) { + /* Make sure group starts don't point into the void */ + tempsym_t *group; + STRLEN length = cur-start; + for (group = symptr; + group && length < group->strbeg; + group = group->previous) group->strbeg = length; + lookahead.strbeg = length; + } break; - case 'x' | TYPE_IS_SHRIEKING: + case 'x' | TYPE_IS_SHRIEKING: { + I32 ai32; if (!len) /* Avoid division by 0 */ len = 1; - aint = (SvCUR(cat)) % len; - if (aint) /* Other portable ways? */ - len = len - aint; - else - len = 0; + if (utf8) ai32 = utf8_length(start, cur) % len; + else ai32 = (cur - start) % len; + if (ai32 == 0) goto no_change; + len -= ai32; + } /* FALL THROUGH */ - case 'x': - grow: - while (len >= 10) { - sv_catpvn(cat, null10, 10); - len -= 10; - } - sv_catpvn(cat, null10, len); - break; + goto grow; case 'A': case 'Z': - case 'a': + case 'a': { + char *aptr; + fromstr = NEXTFROM; aptr = SvPV(fromstr, fromlen); + if (DO_UTF8(fromstr)) { + char *end, *s; + + if (!utf8 && !SvUTF8(cat)) { + marked_upgrade(aTHX_ cat, symptr); + lookahead.flags |= FLAG_DO_UTF8; + lookahead.strbeg = symptr->strbeg; + utf8 = 1; + start = SvPVX(cat); + cur = start + SvCUR(cat); + } if (howlen == e_star) { + if (utf8) goto string_copy; + len = fromlen+1; + } + s = aptr; + end = aptr + fromlen; + fromlen = datumtype == 'Z' ? len-1 : len; + while ((I32) fromlen > 0 && s < end) { + s += UTF8SKIP(s); + fromlen--; + } + if (s > end) + Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); + if (utf8) { len = fromlen; - if (datumtype == 'Z') - ++len; - } - if ((I32)fromlen >= len) { - sv_catpvn(cat, aptr, len); - if (datumtype == 'Z' && len > 0) - *(SvEND(cat)-1) = '\0'; + if (datumtype == 'Z') len++; + fromlen = s-aptr; + len += fromlen; + + goto string_copy; + } + fromlen = len - fromlen; + if (datumtype == 'Z') fromlen--; + if (howlen == e_star) { + len = fromlen; + if (datumtype == 'Z') len++; } - else { - sv_catpvn(cat, aptr, fromlen); + GROWING(0, cat, start, cur, len); + if (!uni_to_bytes(aTHX_ &aptr, end, cur, fromlen, + datumtype | TYPE_IS_PACK)) + Perl_croak(aTHX_ "Perl bug: predicted utf8 length not available"); + cur += fromlen; len -= fromlen; - if (datumtype == 'A') { - while (len >= 10) { - sv_catpvn(cat, space10, 10); - len -= 10; - } - sv_catpvn(cat, space10, len); + } else if (utf8) { + if (howlen == e_star) { + len = fromlen; + if (datumtype == 'Z') len++; } - else { - while (len >= 10) { - sv_catpvn(cat, null10, 10); - len -= 10; + if (len <= (I32) fromlen) { + fromlen = len; + if (datumtype == 'Z' && fromlen > 0) fromlen--; + } + /* assumes a byte expands to at most 2 bytes on upgrade: + expected_length <= from_len*2 + (len-from_len) */ + GROWING(0, cat, start, cur, fromlen+len); + len -= fromlen; + while (fromlen > 0) { + cur = uvchr_to_utf8(cur, * (U8 *) aptr); + aptr++; + fromlen--; } - sv_catpvn(cat, null10, len); + } else { + string_copy: + if (howlen == e_star) { + len = fromlen; + if (datumtype == 'Z') len++; + } + if (len <= (I32) fromlen) { + fromlen = len; + if (datumtype == 'Z' && fromlen > 0) fromlen--; } + GROWING(0, cat, start, cur, len); + Copy(aptr, cur, fromlen, char); + cur += fromlen; + len -= fromlen; } + memset(cur, datumtype == 'A' ? ' ' : '\0', len); + cur += len; break; + } case 'B': - case 'b': - { - register char *str; - I32 saveitems; + case 'b': { + char *str, *end; + I32 l, field_len; + U8 bits; + bool utf8_source; + U32 utf8_flags; fromstr = NEXTFROM; - saveitems = items; str = SvPV(fromstr, fromlen); - if (howlen == e_star) - len = fromlen; - aint = SvCUR(cat); - SvCUR(cat) += (len+7)/8; - SvGROW(cat, SvCUR(cat) + 1); - aptr = SvPVX(cat) + aint; - if (len > (I32)fromlen) - len = fromlen; - aint = len; - items = 0; - if (datumtype == 'B') { - for (len = 0; len++ < aint;) { - items |= *str++ & 1; - if (len & 7) - items <<= 1; + end = str + fromlen; + if (DO_UTF8(fromstr)) { + utf8_source = TRUE; + utf8_flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; + } else { + utf8_source = FALSE; + utf8_flags = 0; /* Unused, but keep compilers happy */ + } + if (howlen == e_star) len = fromlen; + field_len = (len+7)/8; + GROWING(utf8, cat, start, cur, field_len); + if (len > (I32)fromlen) len = fromlen; + bits = 0; + l = 0; + if (datumtype == 'B') + while (l++ < len) { + if (utf8_source) { + UV val; + NEXT_UNI_VAL(val, cur, str, end, utf8_flags); + bits |= val & 1; + } else bits |= *str++ & 1; + if (l & 7) bits <<= 1; else { - *aptr++ = items & 0xff; - items = 0; - } + PUSH_BYTE(utf8, cur, bits); + bits = 0; } } + else + /* datumtype == 'b' */ + while (l++ < len) { + if (utf8_source) { + UV val; + NEXT_UNI_VAL(val, cur, str, end, utf8_flags); + if (val & 1) bits |= 0x80; + } else if (*str++ & 1) + bits |= 0x80; + if (l & 7) bits >>= 1; else { - for (len = 0; len++ < aint;) { - if (*str++ & 1) - items |= 128; - if (len & 7) - items >>= 1; - else { - *aptr++ = items & 0xff; - items = 0; - } + PUSH_BYTE(utf8, cur, bits); + bits = 0; } } - if (aint & 7) { + l--; + if (l & 7) { if (datumtype == 'B') - items <<= 7 - (aint & 7); + bits <<= 7 - (l & 7); else - items >>= 7 - (aint & 7); - *aptr++ = items & 0xff; - } - str = SvPVX(cat) + SvCUR(cat); - while (aptr <= str) - *aptr++ = '\0'; - - items = saveitems; + bits >>= 7 - (l & 7); + PUSH_BYTE(utf8, cur, bits); + l += 7; } + /* Determine how many chars are left in the requested field */ + l /= 8; + if (howlen == e_star) field_len = 0; + else field_len -= l; + Zero(cur, field_len, char); + cur += field_len; break; + } case 'H': - case 'h': - { - register char *str; - I32 saveitems; + case 'h': { + char *str, *end; + I32 l, field_len; + U8 bits; + bool utf8_source; + U32 utf8_flags; fromstr = NEXTFROM; - saveitems = items; str = SvPV(fromstr, fromlen); - if (howlen == e_star) - len = fromlen; - aint = SvCUR(cat); - SvCUR(cat) += (len+1)/2; - SvGROW(cat, SvCUR(cat) + 1); - aptr = SvPVX(cat) + aint; - if (len > (I32)fromlen) - len = fromlen; - aint = len; - items = 0; - if (datumtype == 'H') { - for (len = 0; len++ < aint;) { - if (isALPHA(*str)) - items |= ((*str++ & 15) + 9) & 15; + end = str + fromlen; + if (DO_UTF8(fromstr)) { + utf8_source = TRUE; + utf8_flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; + } else { + utf8_source = FALSE; + utf8_flags = 0; /* Unused, but keep compilers happy */ + } + if (howlen == e_star) len = fromlen; + field_len = (len+1)/2; + GROWING(utf8, cat, start, cur, field_len); + if (!utf8 && len > (I32)fromlen) len = fromlen; + bits = 0; + l = 0; + if (datumtype == 'H') + while (l++ < len) { + if (utf8_source) { + UV val; + NEXT_UNI_VAL(val, cur, str, end, utf8_flags); + if (val < 256 && isALPHA(val)) + bits |= (val + 9) & 0xf; else - items |= *str++ & 15; - if (len & 1) - items <<= 4; + bits |= val & 0xf; + } else if (isALPHA(*str)) + bits |= (*str++ + 9) & 0xf; + else + bits |= *str++ & 0xf; + if (l & 1) bits <<= 4; else { - *aptr++ = items & 0xff; - items = 0; - } + PUSH_BYTE(utf8, cur, bits); + bits = 0; } } - else { - for (len = 0; len++ < aint;) { - if (isALPHA(*str)) - items |= (((*str++ & 15) + 9) & 15) << 4; + else + while (l++ < len) { + if (utf8_source) { + UV val; + NEXT_UNI_VAL(val, cur, str, end, utf8_flags); + if (val < 256 && isALPHA(val)) + bits |= ((val + 9) & 0xf) << 4; else - items |= (*str++ & 15) << 4; - if (len & 1) - items >>= 4; - else { - *aptr++ = items & 0xff; - items = 0; + bits |= (val & 0xf) << 4; + } else if (isALPHA(*str)) + bits |= ((*str++ + 9) & 0xf) << 4; + else + bits |= (*str++ & 0xf) << 4; + if (l & 1) bits >>= 4; + else { + PUSH_BYTE(utf8, cur, bits); + bits = 0; } } + l--; + if (l & 1) { + PUSH_BYTE(utf8, cur, bits); + l++; + } + /* Determine how many chars are left in the requested field */ + l /= 2; + if (howlen == e_star) field_len = 0; + else field_len -= l; + Zero(cur, field_len, char); + cur += field_len; + break; } - if (aint & 1) - *aptr++ = items & 0xff; - str = SvPVX(cat) + SvCUR(cat); - while (aptr <= str) - *aptr++ = '\0'; - - items = saveitems; + case 'c': + while (len-- > 0) { + IV aiv; + fromstr = NEXTFROM; + aiv = SvIV(fromstr); + if ((-128 > aiv || aiv > 127) && + ckWARN(WARN_PACK)) + Perl_warner(aTHX_ packWARN(WARN_PACK), + "Character in 'c' format wrapped in pack"); + PUSH_BYTE(utf8, cur, aiv & 0xff); } break; case 'C': - case 'c': + if (len == 0) { + utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; + break; + } + GROWING(0, cat, start, cur, len); while (len-- > 0) { + IV aiv; fromstr = NEXTFROM; - switch (TYPE_NO_MODIFIERS(datumtype)) { - case 'C': - aint = SvIV(fromstr); - if ((aint < 0 || aint > 255) && + aiv = SvIV(fromstr); + if ((0 > aiv || aiv > 0xff) && ckWARN(WARN_PACK)) Perl_warner(aTHX_ packWARN(WARN_PACK), "Character in 'C' format wrapped in pack"); - achar = aint & 255; - sv_catpvn(cat, &achar, sizeof(char)); + *cur++ = aiv & 0xff; + } break; - case 'c': - aint = SvIV(fromstr); - if ((aint < -128 || aint > 127) && - ckWARN(WARN_PACK)) + case 'W': { + char *end; + U8 in_bytes = IN_BYTES; + + end = start+SvLEN(cat)-1; + if (utf8) end -= UTF8_MAXLEN-1; + while (len-- > 0) { + UV auv; + fromstr = NEXTFROM; + auv = SvUV(fromstr); + if (in_bytes) auv = auv % 0x100; + if (utf8) { + W_utf8: + if (cur > end) { + *cur = '\0'; + SvCUR(cat) = cur - start; + + GROWING(0, cat, start, cur, len+UTF8_MAXLEN); + end = start+SvLEN(cat)-UTF8_MAXLEN; + } + cur = uvuni_to_utf8_flags(cur, NATIVE_TO_UNI(auv), + ckWARN(WARN_UTF8) ? + 0 : UNICODE_ALLOW_ANY); + } else { + if (auv >= 0x100) { + if (!SvUTF8(cat)) { + *cur = '\0'; + SvCUR(cat) = cur - start; + marked_upgrade(aTHX_ cat, symptr); + lookahead.flags |= FLAG_DO_UTF8; + lookahead.strbeg = symptr->strbeg; + utf8 = 1; + start = SvPVX(cat); + cur = start + SvCUR(cat); + end = start+SvLEN(cat)-UTF8_MAXLEN; + goto W_utf8; + } + if (ckWARN(WARN_PACK)) Perl_warner(aTHX_ packWARN(WARN_PACK), - "Character in 'c' format wrapped in pack" ); - achar = aint & 255; - sv_catpvn(cat, &achar, sizeof(char)); - break; + "Character in 'W' format wrapped in pack"); + auv &= 0xff; + } + if (cur >= end) { + *cur = '\0'; + SvCUR(cat) = cur - start; + GROWING(0, cat, start, cur, len+1); + end = start+SvLEN(cat)-1; + } + *(U8 *) cur++ = auv; } } break; - case 'U': + } + case 'U': { + char *end; + + if (len == 0) { + if (!(symptr->flags & FLAG_DO_UTF8)) { + marked_upgrade(aTHX_ cat, symptr); + lookahead.flags |= FLAG_DO_UTF8; + lookahead.strbeg = symptr->strbeg; + } + utf8 = 0; + goto no_change; + } + + end = start+SvLEN(cat); + if (!utf8) end -= UTF8_MAXLEN; while (len-- > 0) { + UV auv; fromstr = NEXTFROM; - auint = UNI_TO_NATIVE(SvUV(fromstr)); - SvGROW(cat, SvCUR(cat) + UTF8_MAXBYTES + 1); - SvCUR_set(cat, - (char*)uvchr_to_utf8_flags((U8*)SvEND(cat), - auint, + auv = SvUV(fromstr); + if (utf8) { + char buffer[UTF8_MAXLEN], *end; + end = uvuni_to_utf8_flags(buffer, auv, + ckWARN(WARN_UTF8) ? + 0 : UNICODE_ALLOW_ANY); + if (cur >= end-(end-buffer)*2) { + *cur = '\0'; + SvCUR(cat) = cur - start; + GROWING(0, cat, start, cur, len+(end-buffer)*2); + end = start+SvLEN(cat)-UTF8_MAXLEN; + } + bytes_to_uni(aTHX_ buffer, end-buffer, &cur); + } else { + if (cur >= end) { + *cur = '\0'; + SvCUR(cat) = cur - start; + GROWING(0, cat, start, cur, len+UTF8_MAXLEN); + end = start+SvLEN(cat)-UTF8_MAXLEN; + } + cur = uvuni_to_utf8_flags(cur, auv, ckWARN(WARN_UTF8) ? - 0 : UNICODE_ALLOW_ANY) - - SvPVX(cat)); + 0 : UNICODE_ALLOW_ANY); + } } - *SvEND(cat) = '\0'; break; + } /* Float and double added by gnb@melba.bby.oz.au 22/11/89 */ case 'f': while (len-- > 0) { + float afloat; + NV anv; fromstr = NEXTFROM; + anv = SvNV(fromstr); #ifdef __VOS__ -/* VOS does not automatically map a floating-point overflow + /* VOS does not automatically map a floating-point overflow during conversion from double to float into infinity, so we do it by hand. This code should either be generalized for any OS that needs it, or removed if and when VOS implements posix-976 (suggestion to support mapping to infinity). Paul.Green@stratus.com 02-04-02. */ - if (SvNV(fromstr) > FLT_MAX) + if (anv > FLT_MAX) afloat = _float_constants[0]; /* single prec. inf. */ - else if (SvNV(fromstr) < -FLT_MAX) + else if (anv < -FLT_MAX) afloat = _float_constants[0]; /* single prec. inf. */ - else afloat = (float)SvNV(fromstr); -#else + else afloat = (float) anv; +#else /* __VOS__ */ # if defined(VMS) && !defined(__IEEE_FP) -/* IEEE fp overflow shenanigans are unavailable on VAX and optional + /* IEEE fp overflow shenanigans are unavailable on VAX and optional * on Alpha; fake it if we don't have them. */ - if (SvNV(fromstr) > FLT_MAX) + if (anv > FLT_MAX) afloat = FLT_MAX; - else if (SvNV(fromstr) < -FLT_MAX) + else if (anv < -FLT_MAX) afloat = -FLT_MAX; - else afloat = (float)SvNV(fromstr); + else afloat = (float)anv; # else - afloat = (float)SvNV(fromstr); + afloat = (float)anv; # endif -#endif +#endif /* __VOS__ */ DO_BO_PACK_N(afloat, float); - sv_catpvn(cat, (char *)&afloat, sizeof (float)); + PUSH_VAR(utf8, cur, afloat); } break; case 'd': while (len-- > 0) { + double adouble; + NV anv; fromstr = NEXTFROM; + anv = SvNV(fromstr); #ifdef __VOS__ -/* VOS does not automatically map a floating-point overflow + /* VOS does not automatically map a floating-point overflow during conversion from long double to double into infinity, so we do it by hand. This code should either be generalized for any OS that needs it, or removed if and when VOS implements posix-976 (suggestion to support mapping to infinity). Paul.Green@stratus.com 02-04-02. */ - if (SvNV(fromstr) > DBL_MAX) + if (anv > DBL_MAX) adouble = _double_constants[0]; /* double prec. inf. */ - else if (SvNV(fromstr) < -DBL_MAX) + else if (anv < -DBL_MAX) adouble = _double_constants[0]; /* double prec. inf. */ - else adouble = (double)SvNV(fromstr); -#else + else adouble = (double) anv; +#else /* __VOS__ */ # if defined(VMS) && !defined(__IEEE_FP) -/* IEEE fp overflow shenanigans are unavailable on VAX and optional + /* IEEE fp overflow shenanigans are unavailable on VAX and optional * on Alpha; fake it if we don't have them. */ - if (SvNV(fromstr) > DBL_MAX) + if (anv > DBL_MAX) adouble = DBL_MAX; - else if (SvNV(fromstr) < -DBL_MAX) + else if (anv < -DBL_MAX) adouble = -DBL_MAX; - else adouble = (double)SvNV(fromstr); + else adouble = (double)anv; # else - adouble = (double)SvNV(fromstr); + adouble = (double)anv; # endif -#endif +#endif /* __VOS__ */ DO_BO_PACK_N(adouble, double); - sv_catpvn(cat, (char *)&adouble, sizeof (double)); + PUSH_VAR(utf8, cur, adouble); } break; - case 'F': + case 'F': { + NV anv; Zero(&anv, 1, NV); /* can be long double with unused bits */ while (len-- > 0) { fromstr = NEXTFROM; anv = SvNV(fromstr); DO_BO_PACK_N(anv, NV); - sv_catpvn(cat, (char *)&anv, NVSIZE); + PUSH_VAR(utf8, cur, anv); } break; + } #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) - case 'D': + case 'D': { + long double aldouble; /* long doubles can have unused bits, which may be nonzero */ Zero(&aldouble, 1, long double); while (len-- > 0) { fromstr = NEXTFROM; aldouble = (long double)SvNV(fromstr); DO_BO_PACK_N(aldouble, long double); - sv_catpvn(cat, (char *)&aldouble, LONG_DOUBLESIZE); + PUSH_VAR(utf8, cur, aldouble); } break; + } #endif #ifdef PERL_PACK_CAN_SHRIEKSIGN case 'n' | TYPE_IS_SHRIEKING: #endif case 'n': while (len-- > 0) { + I16 ai16; fromstr = NEXTFROM; ai16 = (I16)SvIV(fromstr); #ifdef HAS_HTONS ai16 = PerlSock_htons(ai16); #endif - CAT16(cat, &ai16); + PUSH16(utf8, cur, &ai16); } break; #ifdef PERL_PACK_CAN_SHRIEKSIGN @@ -2716,48 +3059,45 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 'v': while (len-- > 0) { + I16 ai16; fromstr = NEXTFROM; ai16 = (I16)SvIV(fromstr); #ifdef HAS_HTOVS ai16 = htovs(ai16); #endif - CAT16(cat, &ai16); + PUSH16(utf8, cur, &ai16); } break; case 'S' | TYPE_IS_SHRIEKING: #if SHORTSIZE != SIZE16 - { while (len-- > 0) { + unsigned short aushort; fromstr = NEXTFROM; aushort = SvUV(fromstr); DO_BO_PACK(aushort, s); - sv_catpvn(cat, (char *)&aushort, sizeof(unsigned short)); - } + PUSH_VAR(utf8, cur, aushort); } break; #else /* Fall through! */ #endif case 'S': - { while (len-- > 0) { + U16 au16; fromstr = NEXTFROM; au16 = (U16)SvUV(fromstr); DO_BO_PACK(au16, 16); - CAT16(cat, &au16); - } - + PUSH16(utf8, cur, &au16); } break; case 's' | TYPE_IS_SHRIEKING: #if SHORTSIZE != SIZE16 - { while (len-- > 0) { + short ashort; fromstr = NEXTFROM; ashort = SvIV(fromstr); DO_BO_PACK(ashort, s); - sv_catpvn(cat, (char *)&ashort, sizeof(short)); - } + PUSH_VAR(utf8, cur, ashort); } break; #else @@ -2765,23 +3105,26 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 's': while (len-- > 0) { + I16 ai16; fromstr = NEXTFROM; ai16 = (I16)SvIV(fromstr); DO_BO_PACK(ai16, 16); - CAT16(cat, &ai16); + PUSH16(utf8, cur, &ai16); } break; case 'I': case 'I' | TYPE_IS_SHRIEKING: while (len-- > 0) { + unsigned int auint; fromstr = NEXTFROM; auint = SvUV(fromstr); DO_BO_PACK(auint, i); - sv_catpvn(cat, (char*)&auint, sizeof(unsigned int)); + PUSH_VAR(utf8, cur, auint); } break; case 'j': while (len-- > 0) { + IV aiv; fromstr = NEXTFROM; aiv = SvIV(fromstr); #if IVSIZE == INTSIZE @@ -2790,12 +3133,15 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV DO_BO_PACK(aiv, l); #elif defined(HAS_QUAD) && IVSIZE == U64SIZE DO_BO_PACK(aiv, 64); +#else + Perl_croak(aTHX_ "'j' not supported on this platform"); #endif - sv_catpvn(cat, (char*)&aiv, IVSIZE); + PUSH_VAR(utf8, cur, aiv); } break; case 'J': while (len-- > 0) { + UV auv; fromstr = NEXTFROM; auv = SvUV(fromstr); #if UVSIZE == INTSIZE @@ -2804,26 +3150,31 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV DO_BO_PACK(auv, l); #elif defined(HAS_QUAD) && UVSIZE == U64SIZE DO_BO_PACK(auv, 64); +#else + Perl_croak(aTHX_ "'J' not supported on this platform"); #endif - sv_catpvn(cat, (char*)&auv, UVSIZE); + PUSH_VAR(utf8, cur, auv); } break; case 'w': while (len-- > 0) { + NV anv; fromstr = NEXTFROM; anv = SvNV(fromstr); - if (anv < 0) + if (anv < 0) { + *cur = '\0'; + SvCUR(cat) = cur - start; Perl_croak(aTHX_ "Cannot compress negative numbers in pack"); + } /* 0xFFFFFFFFFFFFFFFF may cast to 18446744073709551616.0, which is == UV_MAX_P1. IOK is fine (instead of UV_only), as any negative IVs will have already been got by the croak() above. IOK is untrue for fractions, so we test them against UV_MAX_P1. */ - if (SvIOK(fromstr) || anv < UV_MAX_P1) - { - char buf[(sizeof(UV)*8)/7+1]; + if (SvIOK(fromstr) || anv < UV_MAX_P1) { + char buf[(sizeof(UV)*CHAR_BIT)/7+1]; char *in = buf + sizeof(buf); UV auv = SvUV(fromstr); @@ -2832,29 +3183,10 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV auv >>= 7; } while (auv); buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ - sv_catpvn(cat, in, (buf + sizeof(buf)) - in); - } - else if (SvPOKp(fromstr)) { /* decimal string arithmetics */ - char *from, *result, *in; - SV *norm; - STRLEN len; - bool done; - - /* Copy string and check for compliance */ - from = SvPV(fromstr, len); - if ((norm = is_an_int(from, len)) == NULL) - Perl_croak(aTHX_ "Can only compress unsigned integers in pack"); - - New('w', result, len, char); - in = result + len; - done = FALSE; - while (!done) - *--in = div128(norm, &done) | 0x80; - result[len - 1] &= 0x7F; /* clear continue bit */ - sv_catpvn(cat, in, (result + len) - in); - Safefree(result); - SvREFCNT_dec(norm); /* free norm */ - } + PUSH_GROWING_BYTES(utf8, cat, start, cur, + in, (buf + sizeof(buf)) - in); + } else if (SvPOKp(fromstr)) + goto w_string; else if (SvNOKp(fromstr)) { /* 10**NV_MAX_10_EXP is the largest power of 10 so 10**(NV_MAX_10_EXP+1) is definately unrepresentable @@ -2867,10 +3199,10 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV floating-point value. */ #ifdef NV_MAX_10_EXP -/* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */ + /* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */ char buf[1 + (int)((NV_MAX_10_EXP + 1) / 2)]; /* valid C */ #else -/* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */ + /* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */ char buf[1 + (int)((308 + 1) / 2)]; /* valid C */ #endif char *in = buf + sizeof(buf); @@ -2884,14 +3216,15 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV anv = next; } while (anv > 0); buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ - sv_catpvn(cat, in, (buf + sizeof(buf)) - in); - } - else { + PUSH_GROWING_BYTES(utf8, cat, start, cur, + in, (buf + sizeof(buf)) - in); + } else { char *from, *result, *in; SV *norm; STRLEN len; bool done; + w_string: /* Copy string and check for compliance */ from = SvPV(fromstr, len); if ((norm = is_an_int(from, len)) == NULL) @@ -2900,10 +3233,10 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV New('w', result, len, char); in = result + len; done = FALSE; - while (!done) - *--in = div128(norm, &done) | 0x80; + while (!done) *--in = div128(norm, &done) | 0x80; result[len - 1] &= 0x7F; /* clear continue bit */ - sv_catpvn(cat, in, (result + len) - in); + PUSH_GROWING_BYTES(utf8, cat, start, cur, + in, (result + len) - in); Safefree(result); SvREFCNT_dec(norm); /* free norm */ } @@ -2912,10 +3245,11 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV case 'i': case 'i' | TYPE_IS_SHRIEKING: while (len-- > 0) { + int aint; fromstr = NEXTFROM; aint = SvIV(fromstr); DO_BO_PACK(aint, i); - sv_catpvn(cat, (char*)&aint, sizeof(int)); + PUSH_VAR(utf8, cur, aint); } break; #ifdef PERL_PACK_CAN_SHRIEKSIGN @@ -2923,12 +3257,13 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 'N': while (len-- > 0) { + U32 au32; fromstr = NEXTFROM; au32 = SvUV(fromstr); #ifdef HAS_HTONL au32 = PerlSock_htonl(au32); #endif - CAT32(cat, &au32); + PUSH32(utf8, cur, &au32); } break; #ifdef PERL_PACK_CAN_SHRIEKSIGN @@ -2936,47 +3271,45 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 'V': while (len-- > 0) { + U32 au32; fromstr = NEXTFROM; au32 = SvUV(fromstr); #ifdef HAS_HTOVL au32 = htovl(au32); #endif - CAT32(cat, &au32); + PUSH32(utf8, cur, &au32); } break; case 'L' | TYPE_IS_SHRIEKING: #if LONGSIZE != SIZE32 - { while (len-- > 0) { + unsigned long aulong; fromstr = NEXTFROM; aulong = SvUV(fromstr); DO_BO_PACK(aulong, l); - sv_catpvn(cat, (char *)&aulong, sizeof(unsigned long)); - } + PUSH_VAR(utf8, cur, aulong); } break; #else /* Fall though! */ #endif case 'L': - { while (len-- > 0) { + U32 au32; fromstr = NEXTFROM; au32 = SvUV(fromstr); DO_BO_PACK(au32, 32); - CAT32(cat, &au32); - } + PUSH32(utf8, cur, &au32); } break; case 'l' | TYPE_IS_SHRIEKING: #if LONGSIZE != SIZE32 - { while (len-- > 0) { + long along; fromstr = NEXTFROM; along = SvIV(fromstr); DO_BO_PACK(along, l); - sv_catpvn(cat, (char *)&along, sizeof(long)); - } + PUSH_VAR(utf8, cur, along); } break; #else @@ -2984,35 +3317,41 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV #endif case 'l': while (len-- > 0) { + I32 ai32; fromstr = NEXTFROM; ai32 = SvIV(fromstr); DO_BO_PACK(ai32, 32); - CAT32(cat, &ai32); + PUSH32(utf8, cur, &ai32); } break; #ifdef HAS_QUAD case 'Q': while (len-- > 0) { + Uquad_t auquad; fromstr = NEXTFROM; - auquad = (Uquad_t)SvUV(fromstr); + auquad = (Uquad_t) SvUV(fromstr); DO_BO_PACK(auquad, 64); - sv_catpvn(cat, (char*)&auquad, sizeof(Uquad_t)); + PUSH_VAR(utf8, cur, auquad); } break; case 'q': while (len-- > 0) { + Quad_t aquad; fromstr = NEXTFROM; aquad = (Quad_t)SvIV(fromstr); DO_BO_PACK(aquad, 64); - sv_catpvn(cat, (char*)&aquad, sizeof(Quad_t)); + PUSH_VAR(utf8, cur, aquad); } break; -#endif +#endif /* HAS_QUAD */ case 'P': len = 1; /* assume SV is correct length */ + GROWING(utf8, cat, start, cur, sizeof(char *)); /* Fall through! */ case 'p': while (len-- > 0) { + char *aptr; + fromstr = NEXTFROM; SvGETMAGIC(fromstr); if (!SvOK(fromstr)) aptr = NULL; @@ -3023,10 +3362,9 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV * of pack() (and all copies of the result) are * gone. */ - if (ckWARN(WARN_PACK) && (SvTEMP(fromstr) - || (SvPADTMP(fromstr) - && !SvREADONLY(fromstr)))) - { + if (ckWARN(WARN_PACK) && + (SvTEMP(fromstr) || (SvPADTMP(fromstr) && + !SvREADONLY(fromstr)))) { Perl_warner(aTHX_ packWARN(WARN_PACK), "Attempt to pack pointer to temporary value"); } @@ -3036,30 +3374,59 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV aptr = SvPV_force_flags(fromstr, n_a, 0); } DO_BO_PACK_P(aptr); - sv_catpvn(cat, (char*)&aptr, sizeof(char*)); + PUSH_VAR(utf8, cur, aptr); } break; - case 'u': + case 'u': { + char *aptr, *aend; + bool from_utf8; + fromstr = NEXTFROM; + if (len <= 2) len = 45; + else len = len / 3 * 3; + if (len >= 64) { + Perl_warner(aTHX_ packWARN(WARN_PACK), + "Field too wide in 'u' format in pack"); + len = 63; + } aptr = SvPV(fromstr, fromlen); - SvGROW(cat, fromlen * 4 / 3); - if (len <= 2) - len = 45; - else - len = len / 3 * 3; + from_utf8 = DO_UTF8(fromstr); + if (from_utf8) { + aend = aptr + fromlen; + fromlen = sv_len_utf8(fromstr); + } else aend = NULL; /* Unused, but keep compilers happy */ + GROWING(utf8, cat, start, cur, (fromlen+2) / 3 * 4 + (fromlen+len-1)/len * 2); while (fromlen > 0) { + U8 *end; I32 todo; + U8 hunk[1+63/3*4+1]; if ((I32)fromlen > len) todo = len; else todo = fromlen; - doencodes(cat, aptr, todo); - fromlen -= todo; + if (from_utf8) { + char buffer[64]; + if (!uni_to_bytes(aTHX_ &aptr, aend, buffer, todo, + 'u' | TYPE_IS_PACK)) { + *cur = '\0'; + SvCUR(cat) = cur - start; + Perl_croak(aTHX_ "Assertion: string is shorter than advertised"); + } + end = doencodes(hunk, buffer, todo); + } else { + end = doencodes(hunk, aptr, todo); aptr += todo; } + PUSH_BYTES(utf8, cur, hunk, end-hunk); + fromlen -= todo; + } break; } + } + *cur = '\0'; + SvCUR(cat) = cur - start; + no_change: *symptr = lookahead; } return beglist; @@ -3077,6 +3444,7 @@ PP(pp_pack) MARK++; sv_setpvn(cat, "", 0); + SvUTF8_off(cat); packlist(cat, pat, patend, MARK, SP + 1); diff --git a/proto.h b/proto.h index f99ab1c..2a21592 100644 --- a/proto.h +++ b/proto.h @@ -1021,7 +1021,6 @@ STATIC I32 S_measure_struct(pTHX_ tempsym_t* symptr); STATIC char * S_group_end(pTHX_ char *pat, char *patend, char ender); STATIC char * S_get_num(pTHX_ char *ppat, I32 *); STATIC bool S_next_symbol(pTHX_ tempsym_t* symptr); -STATIC void S_doencodes(pTHX_ SV* sv, char* s, I32 len); STATIC SV* S_is_an_int(pTHX_ char *s, STRLEN l); STATIC int S_div128(pTHX_ SV *pnum, bool *done); #endif diff --git a/t/op/pack.t b/t/op/pack.t index 28aece7..7f6bbed 100755 --- a/t/op/pack.t +++ b/t/op/pack.t @@ -12,7 +12,7 @@ my $no_endianness = $] > 5.009 ? '' : my $no_signedness = $] > 5.009 ? '' : "Signed/unsigned pack modifiers not available on this perl"; -plan tests => 13864; +plan tests => 14604; use strict; use warnings; @@ -422,11 +422,11 @@ while (my ($base, $expect) = splice @lengths, 0, 2) { print "# test unpack-pack lengths\n"; -my @templates = qw(c C i I s S l L n N v V f d q Q); +my @templates = qw(c C W i I s S l L n N v V f d q Q); foreach my $base (@templates) { my @tmpl = ($base); - $base =~ /^[cnv]/i or push @tmpl, "$base>", "$base<"; + $base =~ /^[cwnv]/i or push @tmpl, "$base>", "$base<"; foreach my $t (@tmpl) { SKIP: { my @t = eval { unpack("$t*", pack("$t*", 12, 34)) }; @@ -640,6 +640,7 @@ sub numbers_with_total { numbers ('c', -128, -1, 0, 1, 127); numbers ('C', 0, 1, 127, 128, 255); +numbers ('W', 0, 1, 127, 128, 255, 256, 0x7ff, 0x800, 0xfffd); numbers ('s', -32768, -1, 0, 1, 32767); numbers ('S', 0, 1, 32767, 32768, 65535); numbers ('i', -2147483648, -1, 0, 1, 2147483647); @@ -1303,7 +1304,7 @@ SKIP: { } { # Repeat count [SUBEXPR] - my @codes = qw( x A Z a c C B b H h s v n S i I l V N L p P f F d + my @codes = qw( x A Z a c C W B b H h s v n S i I l V N L p P f F d s! S! i! I! l! L! j J); my $G; if (eval { pack 'q', 1 } ) { @@ -1323,6 +1324,7 @@ SKIP: { @val{@codes} = map { / [Xx] (?{ undef }) | [AZa] (?{ 'something' }) | C (?{ 214 }) + | W (?{ 8188 }) | c (?{ 114 }) | [Bb] (?{ '101' }) | [Hh] (?{ 'b8' }) @@ -1509,6 +1511,8 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_ my (@x) = unpack("a(U0)U", "b\341\277\274"); is($x[0], 'b', 'before scope'); is($x[1], 8188, 'after scope'); + + is(pack("a(U0)U", "b", 8188), "b\341\277\274"); } { @@ -1525,3 +1529,244 @@ is(unpack('c'), 65, "one-arg unpack (change #18751)"); # defaulting to $_ my (@x) = unpack("C*", pack("CZ0", 1, "b")); is(join(',', @x), '1', 'pack Z0 doesn\'t destroy the character before'); } + +{ + # Encoding neutrality + # String we will pull apart and rebuild in several ways: + my $down = "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x05\x06"; + my $up = $down; + utf8::upgrade($up); + + my %expect = + # [expected result, + # how many chars it should progress, + # (optional) expected result of pack] + (a5 => ["\xf8\xf9\xfa\xfb\xfc", 5], + A5 => ["\xf8\xf9\xfa\xfb\xfc", 5], + Z5 => ["\xf8\xf9\xfa\xfb\xfc", 5, "\xf8\xf9\xfa\xfb\x00\xfd"], + b21 => ["000111111001111101011", 3, "\xf8\xf9\x1a\xfb"], + B21 => ["111110001111100111111", 3, "\xf8\xf9\xf8\xfb"], + H5 => ["f8f9f", 3, "\xf8\xf9\xf0\xfb"], + h5 => ["8f9fa", 3, "\xf8\xf9\x0a\xfb"], + "s<" => [-1544, 2], + "s>" => [-1799, 2], + "S<" => [0xf9f8, 2], + "S>" => [0xf8f9, 2], + "l<" => [-67438088, 4], + "l>" => [-117835013, 4], + "L>" => [0xf8f9fafb, 4], + "L<" => [0xfbfaf9f8, 4], + n => [0xf8f9, 2], + N => [0xf8f9fafb, 4], + v => [63992, 2], + V => [0xfbfaf9f8, 4], + c => [-8, 1], + U0U => [0xf8, 1], + w => ["8715569050387726213", 9], + q => ["-283686952306184", 8], + Q => ["18446460386757245432", 8], + ); + + for my $string ($down, $up) { + for my $format (sort {lc($a) cmp lc($b) || $a cmp $b } keys %expect) { + SKIP: { + my $expect = $expect{$format}; + # unpack upgraded and downgraded string + my @result = eval { unpack("$format C0 W", $string) }; + skip "cannot pack/unpack '$format C0 W' on this perl", 5 if + $@ && is_valid_error($@); + is(@result, 2, "Two results from unpack $format C0 W"); + + # pack to downgraded + my $new = pack("$format C0 W", @result); + is(length($new), $expect->[1]+1, + "pack $format C0 W should give $expect->[1]+1 chars"); + is($new, $expect->[2] || substr($string, 0, length $new), + "pack $format C0 W returns expected value"); + + # pack to upgraded + $new = pack("a0 $format C0 W", chr(256), @result); + is(length($new), $expect->[1]+1, + "pack a0 $format C0 W should give $expect->[1]+1 chars"); + is($new, $expect->[2] || substr($string, 0, length $new), + "pack a0 $format C0 W returns expected value"); + } + } + } +} + +{ + # Encoding neutrality, numbers + my $val = -2.68; + for my $format (qw(s S i I l L j J f d F D q Q + s! S! i! I! l! L! n! N! v! V!)) { + SKIP: { + my $down = eval { pack($format, $val) }; + skip "cannot pack/unpack $format on this perl", 9 if + $@ && is_valid_error($@); + ok(!utf8::is_utf8($down), "Simple $format pack doesn't get upgraded"); + my $up = pack("a0 $format", chr(256), $val); + ok(utf8::is_utf8($up), "a0 $format with high char leads to upgrade"); + is($down, $up, "$format generated strings are equal though"); + my @down_expanded = unpack("$format W", $down . chr(0xce)); + is(@down_expanded, 2, "Expand to two values"); + is($down_expanded[1], 0xce, + "unpack $format left us at the expected position"); + my @up_expanded = unpack("$format W", $up . chr(0xce)); + is(@up_expanded, 2, "Expand to two values"); + is($up_expanded[1], 0xce, + "unpack $format left us at the expected position"); + is($down_expanded[0], $up_expanded[0], "$format unpack was neutral"); + is(pack($format, $down_expanded[0]), $down, "Pack $format undoes unpack $format"); + } + } +} + +{ + # C is *not* neutral + my $down = "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x05\x06"; + my $up = $down; + utf8::upgrade($up); + my @down = unpack("C*", $down); + my @expect_down = (0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x05, 0x06); + is("@down", "@expect_down", "byte expand"); + is(pack("C*", @down), $down, "byte join"); + + my @up = unpack("C*", $up); + my @expect_up = (0xc3, 0xb8, 0xc3, 0xb9, 0xc3, 0xba, 0xc3, 0xbb, 0xc3, 0xbc, 0xc3, 0xbd, 0xc3, 0xbe, 0xc3, 0xbf, 0x05, 0x06); + is("@up", "@expect_up", "UTF-8 expand"); + is(pack("U0C0C*", @up), $up, "UTF-8 join"); +} + +{ + # Harder cases for the neutrality test + + # u format + my $down = "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x05\x06"; + my $up = $down; + utf8::upgrade($up); + is(pack("u", $down), pack("u", $up), "u pack is neutral"); + is(unpack("u", pack("u", $down)), $down, "u unpack to downgraded works"); + is(unpack("U0C0u", pack("u", $down)), $up, "u unpack to upgraded works"); + + # p/P format + # This actually only tests something if the address contains a byte >= 0x80 + my $str = "abc\xa5\x00\xfede"; + $down = pack("p", $str); + is(pack("P", $str), $down); + is(pack("U0C0p", $str), $down); + is(pack("U0C0P", $str), $down); + is(unpack("p", $down), "abc\xa5", "unpack p downgraded"); + $up = $down; + utf8::upgrade($up); + is(unpack("p", $up), "abc\xa5", "unpack p upgraded"); + + is(unpack("P7", $down), "abc\xa5\x00\xfed", "unpack P downgraded"); + is(unpack("P7", $up), "abc\xa5\x00\xfed", "unpack P upgraded"); + + # x, X and @ + $down = "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x05\x06"; + $up = $down; + utf8::upgrade($up); + + is(unpack('@4W', $down), 0xfc, "\@positioning on downgraded string"); + is(unpack('@4W', $up), 0xfc, "\@positioning on upgraded string"); + + is(unpack('@4x2W', $down), 0xfe, "x moving on downgraded string"); + is(unpack('@4x2W', $up), 0xfe, "x moving on upgraded string"); + is(unpack('@4x!4W', $down), 0xfc, "x! moving on downgraded string"); + is(unpack('@4x!4W', $up), 0xfc, "x! moving on upgraded string"); + is(unpack('@5x!4W', $down), 0x05, "x! moving on downgraded string"); + is(unpack('@5x!4W', $up), 0x05, "x! moving on upgraded string"); + + is(unpack('@4X2W', $down), 0xfa, "X moving on downgraded string"); + is(unpack('@4X2W', $up), 0xfa, "X moving on upgraded string"); + is(unpack('@4X!4W', $down), 0xfc, "X! moving on downgraded string"); + is(unpack('@4X!4W', $up), 0xfc, "X! moving on upgraded string"); + is(unpack('@5X!4W', $down), 0xfc, "X! moving on downgraded string"); + is(unpack('@5X!4W', $up), 0xfc, "X! moving on upgraded string"); + is(unpack('@5X!8W', $down), 0xf8, "X! moving on downgraded string"); + is(unpack('@5X!8W', $up), 0xf8, "X! moving on upgraded string"); + + is(pack("W2x", 0xfa, 0xe3), "\xfa\xe3\x00", "x on downgraded string"); + is(pack("W2x!4", 0xfa, 0xe3), "\xfa\xe3\x00\x00", + "x! on downgraded string"); + is(pack("W2x!2", 0xfa, 0xe3), "\xfa\xe3", "x! on downgraded string"); + is(pack("U0C0W2x", 0xfa, 0xe3), "\xfa\xe3\x00", "x on upgraded string"); + is(pack("U0C0W2x!4", 0xfa, 0xe3), "\xfa\xe3\x00\x00", + "x! on upgraded string"); + is(pack("U0C0W2x!2", 0xfa, 0xe3), "\xfa\xe3", "x! on upgraded string"); + is(pack("W2X", 0xfa, 0xe3), "\xfa", "X on downgraded string"); + is(pack("U0C0W2X", 0xfa, 0xe3), "\xfa", "X on upgraded string"); + is(pack("W2X!2", 0xfa, 0xe3), "\xfa\xe3", "X! on downgraded string"); + is(pack("U0C0W2X!2", 0xfa, 0xe3), "\xfa\xe3", "X! on upgraded string"); + is(pack("W3X!2", 0xfa, 0xe3, 0xa6), "\xfa\xe3", "X! on downgraded string"); + is(pack("U0C0W3X!2", 0xfa, 0xe3, 0xa6), "\xfa\xe3", + "X! on upgraded string"); + + # backward eating through a ( moves the group starting point backwards + is(pack("a*(Xa)", "abc", "q"), "abq", + "eating before strbeg moves it back"); + is(pack("a*(Xa)", "ab" . chr(512), "q"), "abq", + "eating before strbeg moves it back"); + + # Check marked_upgrade + is(pack('W(W(Wa@3W)@6W)@9W', 0xa1, 0xa2, 0xa3, "a", 0xa4, 0xa5, 0xa6), + "\xa1\xa2\xa3a\x00\xa4\x00\xa5\x00\xa6"); + $up = "a"; + utf8::upgrade($up); + is(pack('W(W(Wa@3W)@6W)@9W', 0xa1, 0xa2, 0xa3, $up, 0xa4, 0xa5, 0xa6), + "\xa1\xa2\xa3a\x00\xa4\x00\xa5\x00\xa6", "marked upgrade caused by a"); + is(pack('W(W(WW@3W)@6W)@9W', 0xa1, 0xa2, 0xa3, 256, 0xa4, 0xa5, 0xa6), + "\xa1\xa2\xa3\x{100}\x00\xa4\x00\xa5\x00\xa6", + "marked upgrade caused by W"); + is(pack('W(W(WU0aC0@3W)@6W)@9W', 0xa1, 0xa2, 0xa3, "a", 0xa4, 0xa5, 0xa6), + "\xa1\xa2\xa3a\x00\xa4\x00\xa5\x00\xa6", "marked upgrade caused by U0"); + + # a, A and Z + $down = "\xa4\xa6\xa7"; + $up = $down; + utf8::upgrade($up); + utf8::upgrade(my $high = "\xfeb"); + + for my $format ("a0", "A0", "Z0", "U0a0C0", "U0A0C0", "U0Z0C0") { + is(pack("a* $format a*", "ab", $down, "cd"), "abcd", + "$format format on plain string"); + is(pack("a* $format a*", "ab", $up, "cd"), "abcd", + "$format format on upgraded string"); + is(pack("a* $format a*", $high, $down, "cd"), "\xfebcd", + "$format format on plain string"); + is(pack("a* $format a*", $high, $up, "cd"), "\xfebcd", + "$format format on upgraded string"); + my @down = unpack("a1 $format a*", "\xfeb"); + is("@down", "\xfe b", "unpack $format"); + my @up = unpack("a1 $format a*", $high); + is("@up", "\xfe b", "unpack $format"); + } + is(pack("a1", $high), "\xfe"); + is(pack("A1", $high), "\xfe"); + is(pack("Z1", $high), "\x00"); + is(pack("a2", $high), "\xfeb"); + is(pack("A2", $high), "\xfeb"); + is(pack("Z2", $high), "\xfe\x00"); + is(pack("a5", $high), "\xfeb\x00\x00\x00"); + is(pack("A5", $high), "\xfeb "); + is(pack("Z5", $high), "\xfeb\x00\x00\x00"); + is(pack("a*", $high), "\xfeb"); + is(pack("A*", $high), "\xfeb"); + is(pack("Z*", $high), "\xfeb\x00"); + + utf8::upgrade($high = "\xc3\xbeb"); + is(pack("U0a2", $high), "\xfe"); + is(pack("U0A2", $high), "\xfe"); + is(pack("U0Z1", $high), "\x00"); + is(pack("U0a3", $high), "\xfeb"); + is(pack("U0A3", $high), "\xfeb"); + is(pack("U0Z3", $high), "\xfe\x00"); + is(pack("U0a6", $high), "\xfeb\x00\x00\x00"); + is(pack("U0A6", $high), "\xfeb "); + is(pack("U0Z6", $high), "\xfeb\x00\x00\x00"); + is(pack("U0a*", $high), "\xfeb"); + is(pack("U0A*", $high), "\xfeb"); + is(pack("U0Z*", $high), "\xfeb\x00"); +} diff --git a/t/op/utftaint.t b/t/op/utftaint.t index cd44503..0edb2f2 100644 --- a/t/op/utftaint.t +++ b/t/op/utftaint.t @@ -31,7 +31,7 @@ use constant UTF8 => "\x{1234}"; sub is_utf8 { my $s = shift; - return 0xB6 != ord pack('a*', chr(0xB6).$s); + return 0xB6 != unpack('C', chr(0xB6).$s); } for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) { @@ -82,7 +82,7 @@ for my $ary ([ascii => 'perl'], [latin1 => "\xB6"], [utf8 => "\x{100}"]) { my $encode = $ary->[0]; my $utf8 = pack('U*') . $ary->[1]; - my $byte = pack('C0a*', $utf8); + my $byte = unpack('U0a*', $utf8); my $taint = $arg; substr($taint, 0) = $utf8; utf8::encode($taint); @@ -120,7 +120,7 @@ for my $ary ([ascii => 'perl'], [latin1 => "\xB6"]) { my $encode = $ary->[0]; my $up = pack('U*') . $ary->[1]; - my $down = pack('C0a*', $ary->[1]); + my $down = pack("a*", $ary->[1]); my $taint = $arg; substr($taint, 0) = $up; utf8::upgrade($taint);