Converts a string C<s> of length C<len> from UTF8 into byte encoding.
Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
-the newly-created string, and updates C<len> to contain the new length.
-Returns the original string if no conversion occurs, C<len> and
-C<is_utf8> are unchanged. Do nothing if C<is_utf8> points to 0. Sets
-C<is_utf8> to 0 if C<s> is converted or malformed .
+the newly-created string, and updates C<len> to contain the new
+length. Returns the original string if no conversion occurs, in which
+case C<len> is unchanged. Does nothing if C<is_utf8> points to 0. Sets
+C<is_utf8> to 0 if C<s> is converted or contains only 7-bit characters.
NOTE: this function is experimental and may change or be
removed without notice.
push @INC, '../lib';
}
-print "1..25\n";
+print "1..26\n";
$h{'abc'} = 'ABC';
$h{'def'} = 'DEF';
print "#$u{$_}\n" for keys %u; # Used to core dump before change #8056.
print "ok 24\n";
-%u = (qu"\xe3\x81\x82" => "downglade");
+$d = qu"\xe3\x81\x82";
+%u = ($d => "downgrade");
for (keys %u) {
use bytes;
print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
print "ok 25\n";
}
+{
+ use bytes;
+ print "not " if length($d) ne 6 or $d ne qu"\xe3\x81\x82";
+ print "ok 26\n";
+}
Converts a string C<s> of length C<len> from UTF8 into byte encoding.
Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
-the newly-created string, and updates C<len> to contain the new length.
-Returns the original string if no conversion occurs, C<len> and
-C<is_utf8> are unchanged. Do nothing if C<is_utf8> points to 0. Sets
-C<is_utf8> to 0 if C<s> is converted or malformed .
+the newly-created string, and updates C<len> to contain the new
+length. Returns the original string if no conversion occurs, in which
+case C<len> is unchanged. Does nothing if C<is_utf8> points to 0. Sets
+C<is_utf8> to 0 if C<s> is converted or contains only 7-bit characters.
=cut */
if (!*is_utf8)
return start;
- /* ensure valid UTF8 and chars < 256 before updating string */
+ /* ensure valid UTF8 and chars < 256 before converting string */
for (send = s + *len; s < send;) {
U8 c = *s++;
if (!UTF8_IS_ASCII(c)) {
if (UTF8_IS_CONTINUATION(c) || s >= send ||
- !UTF8_IS_CONTINUATION(*s)) {
- *is_utf8 = 0;
- return start;
- }
- if ((c & 0xfc) != 0xc0)
+ !UTF8_IS_CONTINUATION(*s) || (c & 0xfc) != 0xc0)
return start;
s++, count++;
}
return start;
Newz(801, d, (*len) - count + 1, U8);
- d = s = start;
+ s = start; start = d;
while (s < send) {
U8 c = *s++;
if (UTF8_IS_ASCII(c))