From: Simon Cozens Date: Sat, 14 Oct 2000 20:52:13 +0000 (+0100) Subject: Make ~(chr(a).chr(b)) eq chr(~a).chr(~b) on utf8. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1d68d6cd5ca16f48de9798b0758052a6de564551;p=p5sagit%2Fp5-mst-13.2.git Make ~(chr(a).chr(b)) eq chr(~a).chr(~b) on utf8. Subject: [PATCH] Re: [ID 20000918.005] ~ on wide chars Message-ID: <20001014205213.A9645@pembro4.pmb.ox.ac.uk> p4raw-id: //depot/perl@7235 --- diff --git a/pp.c b/pp.c index 03609e8..72d9dee 100644 --- a/pp.c +++ b/pp.c @@ -1476,6 +1476,38 @@ PP(pp_complement) SvSetSV(TARG, sv); tmps = SvPV_force(TARG, len); anum = len; + if (SvUTF8(TARG)) { + /* Calculate exact length, let's not estimate */ + STRLEN targlen = 0; + U8 *result; + char *send; + + send = tmps + len; + while (tmps < send) { + I32 l; + UV c = utf8_to_uv(tmps, &l); + c = (UV)~c; + tmps += UTF8SKIP(tmps); + targlen += UTF8LEN(c); + } + + /* Now rewind strings and write them. */ + tmps -= len; + Newz(0, result, targlen + 1, U8); + while (tmps < send) { + I32 l; + UV c = utf8_to_uv(tmps, &l); + tmps += UTF8SKIP(tmps); + result = uv_to_utf8(result,(UV)~c); + } + *result = '\0'; + result -= targlen; + sv_setpvn(TARG, result, targlen); + SvUTF8_on(TARG); + Safefree(result); + SETs(TARG); + RETURN; + } #ifdef LIBERAL for ( ; anum && (unsigned long)tmps % sizeof(long); anum--, tmps++) *tmps = ~*tmps; diff --git a/t/op/bop.t b/t/op/bop.t index 92baa67..4bdc26b 100755 --- a/t/op/bop.t +++ b/t/op/bop.t @@ -9,7 +9,7 @@ BEGIN { @INC = '../lib'; } -print "1..35\n"; +print "1..37\n"; # numerics print ((0xdead & 0xbeef) == 0x9ead ? "ok 1\n" : "not ok 1\n"); @@ -82,9 +82,9 @@ print "ok 28\n" if sprintf("%vd", v4095.801.4095 & v801.4095) eq '801.801'; print "ok 29\n" if sprintf("%vd", v4095.801.4095 | v801.4095) eq '4095.4095.4095'; print "ok 30\n" if sprintf("%vd", v801.4095 ^ v4095.801.4095) eq '3294.3294.4095'; # -print "ok 31\n" if sprintf("%vd", v120.v300 & v200.400) eq '72.256'; -print "ok 32\n" if sprintf("%vd", v120.v300 | v200.400) eq '248.444'; -print "ok 33\n" if sprintf("%vd", v120.v300 ^ v200.400) eq '176.188'; +print "ok 31\n" if sprintf("%vd", v120.300 & v200.400) eq '72.256'; +print "ok 32\n" if sprintf("%vd", v120.300 | v200.400) eq '248.444'; +print "ok 33\n" if sprintf("%vd", v120.300 ^ v200.400) eq '176.188'; # my $a = v120.300; my $b = v200.400; @@ -94,3 +94,20 @@ my $a = v120.300; my $b = v200.400; $a |= $b; print "ok 35\n" if sprintf("%vd", $a) eq '248.444'; +# +# UTF8 ~ behaviour +for (0x100...0xFFF) { + $a = ~(chr $_); + print "not" if $a ne chr(~$_) or length($a) != 1 or ~$a ne chr($_); +} +print "ok 36\n"; + +for my $i (0xEEE...0xF00) { + for my $j (0x0..0x120) { + $a = ~(chr ($i) . chr $j); + print "not" if $a ne chr(~$i).chr(~$j) + or length($a) != 2 + or ~$a ne chr($i).chr($j); + } +} +print "ok 37\n"; diff --git a/utf8.h b/utf8.h index 32173ea..7407335 100644 --- a/utf8.h +++ b/utf8.h @@ -35,6 +35,24 @@ END_EXTERN_C #define UTF8SKIP(s) PL_utf8skip[*(U8*)s] +#ifdef HAS_QUAD +#define UTF8LEN(uv) ( (uv) < 0x80 ? 1 : \ + (uv) < 0x800 ? 2 : \ + (uv) < 0x10000 ? 3 : \ + (uv) < 0x200000 ? 4 : \ + (uv) < 0x4000000 ? 5 : \ + (uv) < 0x80000000 ? 6 : \ + (uv) < 0x1000000000LL ? 7 : 13 ) +#else +/* No, I'm not even going to *TRY* putting #ifdef inside a #define */ +#define UTF8LEN(uv) ( (uv) < 0x80 ? 1 : \ + (uv) < 0x800 ? 2 : \ + (uv) < 0x10000 ? 3 : \ + (uv) < 0x200000 ? 4 : \ + (uv) < 0x4000000 ? 5 : \ + (uv) < 0x80000000 ? 6 : 7 ) +#endif + /* * Note: we try to be careful never to call the isXXX_utf8() functions * unless we're pretty sure we've seen the beginning of a UTF-8 character