SvTAINTED_on(sv);
}
-/* XXX SvUTF8 support missing! */
+/* currently converts input to bytes if needed and croaks if a character
+ > 255 is encountered */
UV
Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
{
return retnum;
if (size < 1 || (size & (size-1))) /* size < 1 or not a power of two */
Perl_croak(aTHX_ "Illegal number of bits in vec");
+
+ if (SvUTF8(sv)) {
+ if (Perl_utf8_to_bytes(aTHX_ (U8*) s, &srclen)) {
+ SvUTF8_off(sv);
+ SvCUR_set(sv, srclen);
+ }
+ else
+ Perl_croak(aTHX_ "Character > 255 in vec()");
+ }
+
offset *= size; /* turn into bit offset */
len = (offset + size + 7) / 8; /* required number of bytes */
if (len > srclen) {
return retnum;
}
-/* XXX SvUTF8 support missing! */
+/* currently converts input to bytes if needed and croaks if a character
+ > 255 is encountered */
void
Perl_do_vecset(pTHX_ SV *sv)
{
if (!targ)
return;
s = (unsigned char*)SvPV_force(targ, targlen);
+ if (SvUTF8(targ)) {
+ if (Perl_utf8_to_bytes(aTHX_ (U8*) s, &targlen)) {
+ /* SvUTF8_off(targ); SvPOK_only below ensures this */
+ SvCUR_set(targ, targlen);
+ }
+ else
+ Perl_croak(aTHX_ "Character > 255 in vec()");
+ }
+
(void)SvPOK_only(targ);
lval = SvUV(sv);
offset = LvTARGOFF(sv);
Ap |U8* |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
Ap |I32 |utf8_distance |U8 *a|U8 *b
Ap |U8* |utf8_hop |U8 *s|I32 off
-ApM |U8* |utf8_to_bytes |U8 *s|STRLEN len
+ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len
ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
Ap |UV |utf8_to_uv |U8 *s|I32* retlen
Ap |U8* |uv_to_utf8 |U8 *d|UV uv
#undef Perl_utf8_to_bytes
U8*
-Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN len)
+Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN *len)
{
return ((CPerlObj*)pPerl)->Perl_utf8_to_bytes(s, len);
}
=item utf8_to_bytes
-Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
-Unlike C<bytes_to_utf8>, this over-writes the original string.
-Returns zero on failure after converting as much as possible.
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string, and
+updates len to contain the new length.
+Returns zero on failure leaving the string and len unchanged
- U8 * utf8_to_bytes(U8 *s, STRLEN len)
+ U8 * utf8_to_bytes(U8 *s, STRLEN *len)
=for hackers
Found in file utf8.c
with an assignment operator, which implies modifying the value itself.
Perhaps you need to copy the value to a temporary, and repeat that.
+=item Character > 255 in vec()
+
+(F) You applied the vec() function to a UTF8 string which contained
+a character > 255. vec() currently only operates on characters < 256.
+
=item chmod() mode argument is missing initial 0
(W chmod) A novice will sometimes say
extend the string with sufficiently many zero bytes. It is an error
to try to write off the beginning of the string (i.e. negative OFFSET).
+The string must not contain any character with value > 255 (which
+can only happen if you're using UTF8 encoding).
+
Strings created with C<vec> can also be manipulated with the logical
operators C<|>, C<&>, C<^>, and C<~>. These operators will assume a bit
vector operation is desired when both operands are strings.
PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
-PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN len);
+PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
#!./perl
-print "1..23\n";
+print "1..30\n";
print vec($foo,0,1) == 0 ? "ok 1\n" : "not ok 1\n";
print length($foo) == 0 ? "ok 2\n" : "not ok 2\n";
print "ok 22\n";
print "not " if vec('abcd', 7, 8);
print "ok 23\n";
+
+# UTF8
+# N.B. currently curiously coded to circumvent bugs elswhere in UTF8 handling
+
+$foo = "\x{100}" . "\xff\xfe";
+$x = substr $foo, 1;
+print "not " if vec($x, 0, 8) != 255;
+print "ok 24\n";
+eval { vec($foo, 1, 8) };
+print "not " unless $@ =~ /^Character > 255 in vec\(\) /;
+print "ok 25\n";
+eval { vec($foo, 1, 8) = 13 };
+print "not " unless $@ =~ /^Character > 255 in vec\(\) /;
+print "ok 26\n";
+print "not " if $foo ne "\x{100}" . "\xff\xfe";
+print "ok 27\n";
+$x = substr $foo, 1;
+vec($x, 2, 4) = 7;
+print "not " if $x ne "\xff\xf7";
+print "ok 28\n";
+
+# mixed magic
+
+$foo = "\x61\x62\x63\x64\x65\x66";
+print "not " if vec(substr($foo, 2, 2), 0, 16) != 25444;
+print "ok 29\n";
+vec(substr($foo, 1,3), 5, 4) = 3;
+print "not " if $foo ne "\x61\x62\x63\x34\x65\x66";
+print "ok 30\n";
return uv;
}
-/* utf8_distance(a,b) is intended to be a - b in pointer arithmetic */
+/* utf8_distance(a,b) returns the number of UTF8 characters between
+ the pointers a and b */
I32
Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
}
/*
-=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN len
+=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
-Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
-Unlike C<bytes_to_utf8>, this over-writes the original string.
-Returns zero on failure after converting as much as possible.
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string, and
+updates len to contain the new length.
+Returns zero on failure leaving the string and len unchanged
=cut
*/
U8 *
-Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN len)
+Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
{
dTHR;
U8 *send;
U8 *d;
U8 *save;
- send = s + len;
+ send = s + *len;
d = save = s;
+
+ /* ensure valid UTF8 and chars < 256 before updating string */
+ while (s < send) {
+ U8 c = *s++;
+ if (c >= 0x80 &&
+ ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2)))
+ return 0;
+ }
+ s = save;
while (s < send) {
if (*s < 0x80)
*d++ = *s++;
else {
I32 ulen;
- UV uv = utf8_to_uv(s, &ulen);
- if (uv > 255) {
- *d = '\0';
- return 0;
- }
+ *d++ = (U8)utf8_to_uv(s, &ulen);
s += ulen;
- *d++ = (U8)uv;
}
}
*d = '\0';
+ *len = d - save;
return save;
}