Fix vec() / utf8 (was Re: bitvec ops still broken with utf8 -- or not?)

diff --git a/doop.c b/doop.c

index 3c86690..46ffc1b 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -537,7 +537,8 @@ Perl_do_sprintf(pTHX_ SV *sv, I32 len, SV **sarg)
        SvTAINTED_on(sv);
 }
 
-/* XXX SvUTF8 support missing! */
+/* currently converts input to bytes if needed and croaks if a character
+   > 255 is encountered                                                        */
 UV
 Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
 {
@@ -549,6 +550,16 @@ Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
        return retnum;
     if (size < 1 || (size & (size-1))) /* size < 1 or not a power of two */ 
        Perl_croak(aTHX_ "Illegal number of bits in vec");
+
+    if (SvUTF8(sv)) {
+       if (Perl_utf8_to_bytes(aTHX_ (U8*) s, &srclen)) {
+           SvUTF8_off(sv);
+           SvCUR_set(sv, srclen);
+       }
+       else
+           Perl_croak(aTHX_ "Character > 255 in vec()");
+    }
+
     offset *= size;    /* turn into bit offset */
     len = (offset + size + 7) / 8;     /* required number of bytes */
     if (len > srclen) {
@@ -670,7 +681,8 @@ Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
     return retnum;
 }
 
-/* XXX SvUTF8 support missing! */
+/* currently converts input to bytes if needed and croaks if a character
+   > 255 is encountered                                                        */
 void
 Perl_do_vecset(pTHX_ SV *sv)
 {
@@ -686,6 +698,15 @@ Perl_do_vecset(pTHX_ SV *sv)
     if (!targ)
        return;
     s = (unsigned char*)SvPV_force(targ, targlen);
+    if (SvUTF8(targ)) {
+       if (Perl_utf8_to_bytes(aTHX_ (U8*) s, &targlen)) {
+       /*  SvUTF8_off(targ);   SvPOK_only below ensures this  */
+           SvCUR_set(targ, targlen);
+       }
+       else
+           Perl_croak(aTHX_ "Character > 255 in vec()");
+    }
+
     (void)SvPOK_only(targ);
     lval = SvUV(sv);
     offset = LvTARGOFF(sv);
diff --git a/embed.pl b/embed.pl

index 9353435..b99a59f 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -2071,7 +2071,7 @@ Ap        |U8*    |utf16_to_utf8  |U8* p|U8 *d|I32 bytelen|I32 *newlen
 Ap     |U8*    |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
 Ap     |I32    |utf8_distance  |U8 *a|U8 *b
 Ap     |U8*    |utf8_hop       |U8 *s|I32 off
-ApM    |U8*    |utf8_to_bytes  |U8 *s|STRLEN len
+ApM    |U8*    |utf8_to_bytes  |U8 *s|STRLEN *len
 ApM    |U8*    |bytes_to_utf8  |U8 *s|STRLEN *len
 Ap     |UV     |utf8_to_uv     |U8 *s|I32* retlen
 Ap     |U8*    |uv_to_utf8     |U8 *d|UV uv
diff --git a/perlapi.c b/perlapi.c

index 2fca6bc..3257fec 100644 (file)
--- a/perlapi.c
+++ b/perlapi.c
@@ -3359,7 +3359,7 @@ Perl_utf8_hop(pTHXo_ U8 *s, I32 off)
 
 #undef  Perl_utf8_to_bytes
 U8*
-Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN len)
+Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN *len)
 {
     return ((CPerlObj*)pPerl)->Perl_utf8_to_bytes(s, len);
 }
diff --git a/pod/perlapi.pod b/pod/perlapi.pod

index 3eda765..b6dab89 100644 (file)
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -3115,11 +3115,12 @@ Found in file utf8.c
 
 =item utf8_to_bytes
 
-Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
-Unlike C<bytes_to_utf8>, this over-writes the original string.
-Returns zero on failure after converting as much as possible.
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string, and
+updates len to contain the new length.
+Returns zero on failure leaving the string and len unchanged
 
-       U8 *    utf8_to_bytes(U8 *s, STRLEN len)
+       U8 *    utf8_to_bytes(U8 *s, STRLEN *len)
 
 =for hackers
 Found in file utf8.c
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 27711f5..960170d 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1043,6 +1043,11 @@ references can be weakened.
 with an assignment operator, which implies modifying the value itself.
 Perhaps you need to copy the value to a temporary, and repeat that.
 
+=item Character > 255 in vec()
+
+(F) You applied the vec() function to a UTF8 string which contained
+a character > 255.   vec() currently only operates on characters < 256.
+
 =item chmod() mode argument is missing initial 0
 
 (W chmod) A novice will sometimes say
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod

index a1381f1..c3ba736 100644 (file)
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -5510,6 +5510,9 @@ If an element off the end of the string is written to, Perl will first
 extend the string with sufficiently many zero bytes.   It is an error
 to try to write off the beginning of the string (i.e. negative OFFSET).
 
+The string must not contain any character with value > 255 (which
+can only happen if you're using UTF8 encoding).
+
 Strings created with C<vec> can also be manipulated with the logical
 operators C<|>, C<&>, C<^>, and C<~>.  These operators will assume a bit
 vector operation is desired when both operands are strings.
diff --git a/proto.h b/proto.h

index 841e32a..931997c 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -814,7 +814,7 @@ PERL_CALLCONV U8*   Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newle
 PERL_CALLCONV U8*      Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
 PERL_CALLCONV I32      Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
 PERL_CALLCONV U8*      Perl_utf8_hop(pTHX_ U8 *s, I32 off);
-PERL_CALLCONV U8*      Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN len);
+PERL_CALLCONV U8*      Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
 PERL_CALLCONV U8*      Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
 PERL_CALLCONV UV       Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
 PERL_CALLCONV U8*      Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
diff --git a/t/op/vec.t b/t/op/vec.t

index 52b20cd..b75bebf 100755 (executable)
--- a/t/op/vec.t
+++ b/t/op/vec.t
@@ -1,6 +1,6 @@
 #!./perl
 
-print "1..23\n";
+print "1..30\n";
 
 print vec($foo,0,1) == 0 ? "ok 1\n" : "not ok 1\n";
 print length($foo) == 0 ? "ok 2\n" : "not ok 2\n";
@@ -48,3 +48,32 @@ print "not " if defined $x or $@ !~ /^Assigning to negative offset in vec/;
 print "ok 22\n";
 print "not " if vec('abcd', 7, 8);
 print "ok 23\n";
+
+# UTF8
+# N.B. currently curiously coded to circumvent bugs elswhere in UTF8 handling
+
+$foo = "\x{100}" . "\xff\xfe";
+$x = substr $foo, 1;
+print "not " if vec($x, 0, 8) != 255;
+print "ok 24\n";
+eval { vec($foo, 1, 8) };
+print "not " unless $@ =~ /^Character > 255 in vec\(\) /;
+print "ok 25\n";
+eval { vec($foo, 1, 8) = 13 };
+print "not " unless $@ =~ /^Character > 255 in vec\(\) /;
+print "ok 26\n";
+print "not " if $foo ne "\x{100}" . "\xff\xfe";
+print "ok 27\n";
+$x = substr $foo, 1;
+vec($x, 2, 4) = 7;
+print "not " if $x ne "\xff\xf7";
+print "ok 28\n";
+
+# mixed magic
+
+$foo = "\x61\x62\x63\x64\x65\x66";
+print "not " if vec(substr($foo, 2, 2), 0, 16) != 25444;
+print "ok 29\n";
+vec(substr($foo, 1,3), 5, 4) = 3;
+print "not " if $foo ne "\x61\x62\x63\x34\x65\x66";
+print "ok 30\n";
diff --git a/utf8.c b/utf8.c

index e86c49f..c109f65 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -204,7 +204,8 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
     return uv;
 }
 
-/* utf8_distance(a,b) is intended to be a - b in pointer arithmetic */
+/* utf8_distance(a,b) returns the number of UTF8 characters between
+   the pointers a and b                                                        */
 
 I32
 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
@@ -247,40 +248,46 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 }
 
 /*
-=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN len
+=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 
-Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
-Unlike C<bytes_to_utf8>, this over-writes the original string.
-Returns zero on failure after converting as much as possible.
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string, and
+updates len to contain the new length.
+Returns zero on failure leaving the string and len unchanged
 
 =cut
 */
 
 U8 *
-Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN len)
+Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 {
     dTHR;
     U8 *send;
     U8 *d;
     U8 *save;
 
-    send = s + len;
+    send = s + *len;
     d = save = s;
+
+    /* ensure valid UTF8 and chars < 256 before updating string */
+    while (s < send) {
+       U8 c = *s++;
+        if (c >= 0x80 &&
+           ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2)))
+           return 0;    
+    }
+    s = save;
     while (s < send) {
         if (*s < 0x80)
             *d++ = *s++;
         else {
             I32 ulen;
-            UV uv = utf8_to_uv(s, &ulen);
-            if (uv > 255) {
-                *d = '\0';
-                return 0;
-            }
+            *d++ = (U8)utf8_to_uv(s, &ulen);
             s += ulen;
-            *d++ = (U8)uv;
         }
     }
     *d = '\0';
+    *len = d - save;
     return save;
 }
doop.c		patch \| blob \| blame \| history
embed.pl		patch \| blob \| blame \| history
perlapi.c		patch \| blob \| blame \| history
pod/perlapi.pod		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
pod/perlfunc.pod		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
t/op/vec.t		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history