Make pack("C", 0x100) to create Unicode, unless under the
Jarkko Hietaniemi [Wed, 21 Feb 2001 00:24:22 +0000 (00:24 +0000)]
evil influence of 'use bytes'.  Similarly, unpack("C", ...)
will understand Unicode, unless you under know what.

p4raw-id: //depot/perl@8865

pp.c
t/op/pack.t
t/op/qu.t

diff --git a/pp.c b/pp.c
index a0361da..b3d769a 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -4064,6 +4064,7 @@ PP(pp_unpack)
     U16 aushort;
     unsigned int auint;
     U32 aulong;
+    UV auv;
 #ifdef HAS_QUAD
     Uquad_t auquad;
 #endif
@@ -4331,20 +4332,44 @@ PP(pp_unpack)
            if (len > strend - s)
                len = strend - s;
            if (checksum) {
-             uchar_checksum:
-               while (len-- > 0) {
-                   auint = *s++ & 255;
-                   culong += auint;
+               if (DO_UTF8(right)) {
+                   while (len > 0) {
+                       STRLEN l;
+                       auv = utf8_to_uv((U8*)s, len, &l, UTF8_ALLOW_ANYUV);
+                       culong += auv;
+                       s += l;
+                       len -= l;
+                   }
+               }
+               else {
+               uchar_checksum:
+                   while (len-- > 0) {
+                       auint = *s++ & 0xFF;
+                       culong += auint;
+                   }
                }
            }
            else {
                EXTEND(SP, len);
                EXTEND_MORTAL(len);
-               while (len-- > 0) {
-                   auint = *s++ & 255;
-                   sv = NEWSV(37, 0);
-                   sv_setiv(sv, (IV)auint);
-                   PUSHs(sv_2mortal(sv));
+               if (DO_UTF8(right)) {
+                   while (len > 0) {
+                       STRLEN l;
+                       auv = utf8_to_uv((U8*)s, len, &l, UTF8_ALLOW_ANYUV);
+                       sv = NEWSV(37, 0);
+                       sv_setuv(sv, auv);
+                       PUSHs(sv_2mortal(sv));
+                       s += l;
+                       len -= l;
+                   }
+               }
+               else {
+                   while (len-- > 0) {
+                       auint = *s++ & 0xFF;
+                       sv = NEWSV(37, 0);
+                       sv_setuv(sv, auint);
+                       PUSHs(sv_2mortal(sv));
+                   }
                }
            }
            break;
@@ -5145,6 +5170,7 @@ PP(pp_pack)
     unsigned int auint;
     I32 along;
     U32 aulong;
+    UV auv;
 #ifdef HAS_QUAD
     Quad_t aquad;
     Uquad_t auquad;
@@ -5156,6 +5182,7 @@ PP(pp_pack)
 #ifdef PERL_NATINT_PACK
     int natint;                /* native integer */
 #endif
+    bool has_utf8;
 
     items = SP - MARK;
     MARK++;
@@ -5392,7 +5419,6 @@ PP(pp_pack)
                items = saveitems;
            }
            break;
-       case 'C':
        case 'c':
            while (len-- > 0) {
                fromstr = NEXTFROM;
@@ -5401,12 +5427,41 @@ PP(pp_pack)
                sv_catpvn(cat, &achar, sizeof(char));
            }
            break;
+       case 'C':
+           has_utf8 = SvUTF8(cat);
+           while (len-- > 0) {
+               fromstr = NEXTFROM;
+               auv = SvUV(fromstr);
+               if (!has_utf8 && auv > 0xFF && !IN_BYTE) {
+                   has_utf8 = TRUE;
+                   if (SvCUR(cat))
+                       sv_utf8_upgrade(cat);
+                   else
+                       SvUTF8_on(cat); /* There will be UTF8. */
+               }
+               if (has_utf8) {
+                   SvGROW(cat, SvCUR(cat) + UNISKIP(auv) + 1);
+                   SvCUR_set(cat, (char*)uv_to_utf8((U8*)SvEND(cat),auv)
+                                  - SvPVX(cat));
+               }
+               else {
+                   achar = auv;
+                   sv_catpvn(cat, &achar, sizeof(char));
+               }
+           }
+           *SvEND(cat) = '\0';
+           break;
        case 'U':
+           has_utf8 = SvUTF8(cat);
            while (len-- > 0) {
                fromstr = NEXTFROM;
-               auint = SvUV(fromstr);
-               SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1);
-               SvCUR_set(cat, (char*)uv_to_utf8((U8*)SvEND(cat),auint)
+               auv = SvUV(fromstr);
+               if (!has_utf8 && auv > 0x80) {
+                   has_utf8 = TRUE;
+                   sv_utf8_upgrade(cat);
+               }
+               SvGROW(cat, SvCUR(cat) + UNISKIP(auv) + 1);
+               SvCUR_set(cat, (char*)uv_to_utf8((U8*)SvEND(cat),auv)
                               - SvPVX(cat));
            }
            *SvEND(cat) = '\0';
index 67bd547..06e47f2 100755 (executable)
@@ -6,7 +6,7 @@ BEGIN {
     require Config; import Config;
 }
 
-print "1..159\n";
+print "1..163\n";
 
 $format = "c2 x5 C C x s d i l a6";
 # Need the expression in here to force ary[5] to be numeric.  This avoids
@@ -22,7 +22,13 @@ $out2=join(':',@ary2);
 # Using long double NVs may introduce greater accuracy than wanted.
 $out1 =~ s/:9\.87654321097999\d*:/:9.87654321098:/;
 $out2 =~ s/:9\.87654321097999\d*:/:9.87654321098:/;
-print ($out1 eq $out2? "ok 2\n" : "not ok 2\n");
+if ($out1 eq $out2) {
+    print "ok 2\n";
+} else {
+    print "# out1: $out1\n";
+    print "# out2: $out2\n";
+    print "not ok 2\n";
+}
 
 print ($foo =~ /def/ ? "ok 3\n" : "not ok 3\n");
 
@@ -407,6 +413,8 @@ $z = pack <<EOP,'string','etc';
 EOP
 print 'not ' unless $z eq "\000\006string\003etc"; print "ok $test\n"; $test++;
 
+# 157..169: ??? 
+
 print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000); 
 print "ok $test\n"; $test++;
 print 'not ' unless "1.20.300.4000" eq 
@@ -416,3 +424,32 @@ print 'not ' unless v1.20.300.4000 ne
                     sprintf "%vd", pack("C0U*",1,20,300,4000); 
 print "ok $test\n"; $test++;
 
+# 160: unpack("C") and ord() equivalence for Unicode
+
+#print "not " unless unpack("C", chr(0x100)) eq ord(chr(0x100)) &&
+#                    ord(chr(0x100)) == 0x100;
+print "ok $test\n"; $test++;
+
+# 161: use bytes + unpack C == UTF-8 unraveling
+
+{
+    use bytes;
+    my @bytes = unpack("C*", pack("U", 0x100));
+    print "not " unless "@bytes" eq "196 128";
+    print "ok $test\n"; $test++;
+}
+
+# 162: pack C > 255
+
+print "not " unless ord(pack("C", 0x100)) == 0x100;
+print "ok $test\n"; $test++;
+
+# 163: pack C > 255 + use bytes == wraparound
+
+{
+    use bytes;
+
+    print "not " unless ord(pack("C", 0x100 + 0xab)) == 0xab;
+    print "ok $test\n"; $test++;
+}
+
index 2800204..c24b507 100644 (file)
--- a/t/op/qu.t
+++ b/t/op/qu.t
@@ -1,5 +1,11 @@
 print "1..6\n";
 
+
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+}
+
 my $foo = "foo";
 
 print "not " unless qu(abc$foo) eq "abcfoo";
@@ -16,9 +22,15 @@ print "ok 3\n";
 print "not " unless qu(\x{41}\x{100}\x61\x{200}) eq "A\x{100}a\x{200}";
 print "ok 4\n";
 
+{
+
+use bytes;
+
 print "not " unless join(" ", unpack("C*", qu(\x80))) eq "194 128";
 print "ok 5\n";
 
 print "not " unless join(" ", unpack("C*", qu(\x{100}))) eq "196 128";
 print "ok 6\n";
 
+}
+