From: Simon Cozens <simon@netthink.co.uk>
Date: Sat, 14 Oct 2000 20:52:13 +0000 (+0100)
Subject: Make ~(chr(a).chr(b)) eq chr(~a).chr(~b) on utf8.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1d68d6cd5ca16f48de9798b0758052a6de564551;p=p5sagit%2Fp5-mst-13.2.git

Make ~(chr(a).chr(b)) eq chr(~a).chr(~b) on utf8.
Subject: [PATCH] Re: [ID 20000918.005] ~ on wide chars
Message-ID: <20001014205213.A9645@pembro4.pmb.ox.ac.uk>

p4raw-id: //depot/perl@7235
---

diff --git a/pp.c b/pp.c
index 03609e8..72d9dee 100644
--- a/pp.c
+++ b/pp.c
@@ -1476,6 +1476,38 @@ PP(pp_complement)
 	SvSetSV(TARG, sv);
 	tmps = SvPV_force(TARG, len);
 	anum = len;
+	if (SvUTF8(TARG)) {
+	  /* Calculate exact length, let's not estimate */
+	  STRLEN targlen = 0;
+	  U8 *result;
+	  char *send;
+
+	  send = tmps + len;
+	  while (tmps < send) {
+	    I32 l;
+	    UV c = utf8_to_uv(tmps, &l);
+	    c = (UV)~c;
+	    tmps += UTF8SKIP(tmps);
+	    targlen += UTF8LEN(c);
+	  }
+
+	  /* Now rewind strings and write them. */
+	  tmps -= len;
+	  Newz(0, result, targlen + 1, U8);
+	  while (tmps < send) {
+	    I32 l;
+	    UV c = utf8_to_uv(tmps, &l);
+	    tmps += UTF8SKIP(tmps);
+	    result = uv_to_utf8(result,(UV)~c);
+	  }
+	  *result = '\0';
+	  result -= targlen;
+	  sv_setpvn(TARG, result, targlen);
+	  SvUTF8_on(TARG);
+	  Safefree(result);
+	  SETs(TARG);
+	  RETURN;
+	}
 #ifdef LIBERAL
 	for ( ; anum && (unsigned long)tmps % sizeof(long); anum--, tmps++)
 	    *tmps = ~*tmps;
diff --git a/t/op/bop.t b/t/op/bop.t
index 92baa67..4bdc26b 100755
--- a/t/op/bop.t
+++ b/t/op/bop.t
@@ -9,7 +9,7 @@ BEGIN {
     @INC = '../lib';
 }
 
-print "1..35\n";
+print "1..37\n";
 
 # numerics
 print ((0xdead & 0xbeef) == 0x9ead ? "ok 1\n" : "not ok 1\n");
@@ -82,9 +82,9 @@ print "ok 28\n" if sprintf("%vd", v4095.801.4095 & v801.4095) eq '801.801';
 print "ok 29\n" if sprintf("%vd", v4095.801.4095 | v801.4095) eq '4095.4095.4095';
 print "ok 30\n" if sprintf("%vd", v801.4095 ^ v4095.801.4095) eq '3294.3294.4095';
 #
-print "ok 31\n" if sprintf("%vd", v120.v300 & v200.400) eq '72.256';
-print "ok 32\n" if sprintf("%vd", v120.v300 | v200.400) eq '248.444';
-print "ok 33\n" if sprintf("%vd", v120.v300 ^ v200.400) eq '176.188';
+print "ok 31\n" if sprintf("%vd", v120.300 & v200.400) eq '72.256';
+print "ok 32\n" if sprintf("%vd", v120.300 | v200.400) eq '248.444';
+print "ok 33\n" if sprintf("%vd", v120.300 ^ v200.400) eq '176.188';
 #
 my $a = v120.300;
 my $b = v200.400;
@@ -94,3 +94,20 @@ my $a = v120.300;
 my $b = v200.400;
 $a |= $b;
 print "ok 35\n" if sprintf("%vd", $a) eq '248.444';
+#
+# UTF8 ~ behaviour
+for (0x100...0xFFF) {
+  $a = ~(chr $_);
+  print "not" if $a ne chr(~$_) or length($a) != 1 or ~$a ne chr($_);
+}
+print "ok 36\n";
+
+for my $i (0xEEE...0xF00) {
+  for my $j (0x0..0x120) {
+    $a = ~(chr ($i) . chr $j);
+    print "not" if $a ne chr(~$i).chr(~$j) 
+                or length($a) != 2 
+		or ~$a ne chr($i).chr($j);
+  }
+}
+print "ok 37\n";
diff --git a/utf8.h b/utf8.h
index 32173ea..7407335 100644
--- a/utf8.h
+++ b/utf8.h
@@ -35,6 +35,24 @@ END_EXTERN_C
 
 #define UTF8SKIP(s) PL_utf8skip[*(U8*)s]
 
+#ifdef HAS_QUAD
+#define UTF8LEN(uv) ( (uv) < 0x80           ? 1 : \
+		      (uv) < 0x800          ? 2 : \
+		      (uv) < 0x10000        ? 3 : \
+		      (uv) < 0x200000       ? 4 : \
+		      (uv) < 0x4000000      ? 5 : \
+		      (uv) < 0x80000000     ? 6 : \
+                      (uv) < 0x1000000000LL ? 7 : 13 ) 
+#else
+/* No, I'm not even going to *TRY* putting #ifdef inside a #define */
+#define UTF8LEN(uv) ( (uv) < 0x80           ? 1 : \
+		      (uv) < 0x800          ? 2 : \
+		      (uv) < 0x10000        ? 3 : \
+		      (uv) < 0x200000       ? 4 : \
+		      (uv) < 0x4000000      ? 5 : \
+		      (uv) < 0x80000000     ? 6 : 7 )
+#endif
+
 /*
  * Note: we try to be careful never to call the isXXX_utf8() functions
  * unless we're pretty sure we've seen the beginning of a UTF-8 character