XS-assisted SWASHGET (esp. for t/uni/class.t speedup)
SADAHIRO Tomoyuki [Wed, 23 Nov 2005 17:57:34 +0000 (02:57 +0900)]
Message-Id: <20051123175603.FFD5.BQW10602@nifty.com>

And :
Message-Id: <20051123202935.4D9D.BQW10602@nifty.com>

with some nits to use U8 instead of char more consistently

p4raw-id: //depot/perl@26199

lib/utf8_heavy.pl
t/op/pat.t
universal.c
utf8.c

index b6fdeb9..229ed97 100644 (file)
@@ -267,146 +267,11 @@ sub SWASHNEW {
 }
 
 # NOTE: utf8.c:swash_init() assumes entries are never modified once generated.
-
 sub SWASHGET {
     # See utf8.c:Perl_swash_fetch for problems with this interface.
-    my ($self, $start, $len) = @_;
-    local $^D = 0 if $^D;
-    my $type = $self->{TYPE};
-    my $bits = $self->{BITS};
-    my $none = $self->{NONE};
-    print STDERR "SWASHGET @_ [$type/$bits/$none]\n" if DEBUG;
-    my $end = $start + $len;
-    my $swatch = "";
-    my $key;
-    vec($swatch, $len - 1, $bits) = 0; # Extend to correct length.
-    if ($none) {
-       for $key (0 .. $len - 1) { vec($swatch, $key, $bits) = $none }
-    }
-
-    for ($self->{LIST}) {
-       pos $_ = 0;
-       if ($bits > 1) {
-         LINE:
-           while (/^([0-9a-fA-F]+)(?:[ \t]([0-9a-fA-F]+)?)?(?:[ \t]([0-9a-fA-F]+))?/mg) {
-               chomp;
-               my ($a, $b, $c) = ($1, $2, $3);
-               croak "$type: illegal mapping '$_'"
-                   if $type =~ /^To/ &&
-                      !(defined $a && defined $c);
-               my $min = hex $a;
-               my $max = defined $b ? hex $b : $min;
-               my $val = defined $c ? hex $c : 0;
-               next if $max < $start;
-               print "$min $max $val\n" if DEBUG;
-               if ($none) {
-                   if ($min < $start) {
-                       $val += $start - $min if $val < $none;
-                       $min = $start;
-                   }
-                   for ($key = $min; $key <= $max; $key++) {
-                       last LINE if $key >= $end;
-                       print STDERR "$key => $val\n" if DEBUG;
-                       vec($swatch, $key - $start, $bits) = $val;
-                       ++$val if $val < $none;
-                   }
-               }
-               else {
-                   if ($min < $start) {
-                       $val += $start - $min;
-                       $min = $start;
-                   }
-                   for ($key = $min; $key <= $max; $key++, $val++) {
-                       last LINE if $key >= $end;
-                       print STDERR "$key => $val\n" if DEBUG;
-                       vec($swatch, $key - $start, $bits) = $val;
-                   }
-               }
-           }
-       }
-       else {
-         LINE:
-           while (/^([0-9a-fA-F]+)(?:[ \t]+([0-9a-fA-F]+))?/mg) {
-               chomp;
-               my $min = hex $1;
-               my $max = defined $2 ? hex $2 : $min;
-               next if $max < $start;
-               if ($min < $start) {
-                   $min = $start;
-               }
-               for ($key = $min; $key <= $max; $key++) {
-                   last LINE if $key >= $end;
-                   print STDERR "$key => 1\n" if DEBUG;
-                   vec($swatch, $key - $start, 1) = 1;
-               }
-           }
-       }
-    }
-    for my $x ($self->{EXTRAS}) {
-       pos $x = 0;
-       while ($x =~ /^([-+!&])(.*)/mg) {
-           my $char = $1;
-           my $name = $2;
-           print STDERR "INDIRECT $1 $2\n" if DEBUG;
-           my $otherbits = $self->{$name}->{BITS};
-           croak("SWASHGET size mismatch") if $bits < $otherbits;
-           my $other = $self->{$name}->SWASHGET($start, $len);
-           if ($char eq '+') {
-               if ($bits == 1 and $otherbits == 1) {
-                   $swatch |= $other;
-               }
-               else {
-                   for ($key = 0; $key < $len; $key++) {
-                       vec($swatch, $key, $bits) = vec($other, $key, $otherbits);
-                   }
-               }
-           }
-           elsif ($char eq '!') {
-               if ($bits == 1 and $otherbits == 1) {
-                   $swatch |= ~$other;
-               }
-               else {
-                   for ($key = 0; $key < $len; $key++) {
-                       if (!vec($other, $key, $otherbits)) {
-                           vec($swatch, $key, $bits) = 1;
-                       }
-                   }
-               }
-           }
-           elsif ($char eq '-') {
-               if ($bits == 1 and $otherbits == 1) {
-                   $swatch &= ~$other;
-               }
-               else {
-                   for ($key = 0; $key < $len; $key++) {
-                       if (vec($other, $key, $otherbits)) {
-                           vec($swatch, $key, $bits) = 0;
-                       }
-                   }
-               }
-           }
-           elsif ($char eq '&') {
-               if ($bits == 1 and $otherbits == 1) {
-                   $swatch &= $other;
-               }
-               else {
-                   for ($key = 0; $key < $len; $key++) {
-                       if (!vec($other, $key, $otherbits)) {
-                           vec($swatch, $key, $bits) = 0;
-                       }
-                   }
-               }
-           }
-       }
-    }
-    if (DEBUG) {
-       print STDERR "CELLS ";
-       for ($key = 0; $key < $len; $key++) {
-           print STDERR vec($swatch, $key, $bits), " ";
-       }
-       print STDERR "\n";
-    }
-    $swatch;
+    # See universal.c for XS utf8::SWASHGET_heavy.
+    # USAGE: $swatch = utf8::SWASHGET_heavy($self, $start, $len, DEBUG);
+    return utf8::SWASHGET_heavy($_[0], $_[1], $_[2], DEBUG);
 }
 
 1;
index 69d7305..4ab37ad 100755 (executable)
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..1187\n";
+print "1..1191\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -3006,6 +3006,15 @@ END
 print "\x{0712}" =~ /\p{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
 print "\x{072F}" =~ /\P{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
 
+print "# user-defined character properties may lack \\n at the end\n";
+sub InGreekSmall   { return "03B1\t03C9" }
+sub InGreekCapital { return "0391\t03A9\n-03A2" }
+
+ok("\x{03C0}" =~ /\p{InGreekSmall}/,   "Small pi");
+ok("\x{03C2}" =~ /\p{InGreekSmall}/,   "Final sigma");
+ok("\x{03A0}" =~ /\p{InGreekCapital}/, "Capital PI");
+ok("\x{03A2}" =~ /\P{InGreekCapital}/, "Reserved");
+
 {
     print "# Change #18179\n";
     # previously failed with "panic: end_shift
@@ -3402,5 +3411,5 @@ ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
        "# TODO assigning to original string should not corrupt match vars");
 }
 
-# last test 1187
+# last test 1191
 
index 10dddb5..4d44aa7 100644 (file)
@@ -199,6 +199,7 @@ XS(XS_Regexp_DESTROY);
 XS(XS_Internals_hash_seed);
 XS(XS_Internals_rehash_seed);
 XS(XS_Internals_HvREHASH);
+XS(XS_utf8_SWASHGET_heavy);
 
 void
 Perl_boot_core_UNIVERSAL(pTHX)
@@ -247,6 +248,7 @@ Perl_boot_core_UNIVERSAL(pTHX)
     newXSproto("Internals::hash_seed",XS_Internals_hash_seed, file, "");
     newXSproto("Internals::rehash_seed",XS_Internals_rehash_seed, file, "");
     newXSproto("Internals::HvREHASH", XS_Internals_HvREHASH, file, "\\%");
+    newXS("utf8::SWASHGET_heavy", XS_utf8_SWASHGET_heavy, file);
 }
 
 
@@ -949,6 +951,417 @@ XS(XS_Internals_HvREHASH) /* Subject to change  */
     Perl_croak(aTHX_ "Internals::HvREHASH $hashref");
 }
 
+XS(XS_utf8_SWASHGET_heavy)
+{
+    dXSARGS;
+    if (items != 4) {
+       Perl_croak(aTHX_
+           "Usage: utf8::SWASHGET_heavy($self, $start, $len, DEBUG)");
+    }
+    {
+       SV* self    = ST(0);
+       const I32 i_start = (I32)SvIV(ST(1));
+       const I32 i_len   = (I32)SvIV(ST(2));
+       const I32 debug   = (I32)SvIV(ST(3));
+       U32 start = (U32)i_start;
+       U32 len   = (U32)i_len;
+
+       HV *hv;
+       SV **listsvp, **typesvp, **bitssvp, **nonesvp, **extssvp, *swatch;
+       U8 *l, *lend, *x, *xend, *s, *nextline;
+       STRLEN lcur, xcur, scur;
+       U8* typestr;
+       int typeto;
+       U32 bits, none, end, octets;
+
+       if (SvROK(self) && SvTYPE(SvRV(self))==SVt_PVHV)
+           hv = (HV*)SvRV(self);
+       else
+           Perl_croak(aTHX_ "hv is not a hash reference");
+
+       if (i_start < 0)
+           Perl_croak(aTHX_ "SWASHGET negative start");
+       if (i_len < 0)
+           Perl_croak(aTHX_ "SWASHGET negative len");
+
+       listsvp = hv_fetch(hv, "LIST", 4, FALSE);
+       typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
+       bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
+       nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
+       extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
+       typestr = SvPV_nolen(*typesvp);
+       typeto  = typestr[0] == 'T' && typestr[1] == 'o';
+       bits    = (U32)SvUV(*bitssvp);
+       none    = (U32)SvUV(*nonesvp);
+       end     = start + len;
+       octets  = bits >> 3; /* if bits == 1, then octets == 0 */
+
+       if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
+           Perl_croak(aTHX_ "SWASHGET unknown bits %"UVuf, (UV)bits);
+       }
+       if (debug) {
+           char* selfstr = SvPV_nolen(self);
+           PerlIO_printf(Perl_error_log, "SWASHGET ");
+           PerlIO_printf(Perl_error_log, "%s %"UVuf" %"UVuf" ",
+                                         selfstr, (UV)start, (UV)len);
+           PerlIO_printf(Perl_error_log, "[%s/%"UVuf"/%"UVuf"]\n",
+                                         typestr, (UV)bits, (UV)none);
+       }
+
+       /* initialize $swatch */
+       swatch = newSVpvn("",0);
+       scur   = octets ? (len * octets) : (len + 7) / 8;
+       SvGROW(swatch, scur + 1);
+       s = (U8*)SvPVX(swatch);
+       if (octets && none) {
+           const U8* e = s + scur;
+           while (s < e) {
+               if (bits == 8)
+                   *s++ = (U8)(none & 0xff);
+               else if (bits == 16) {
+                   *s++ = (U8)((none >>  8) & 0xff);
+                   *s++ = (U8)( none        & 0xff);
+               }
+               else if (bits == 32) {
+                   *s++ = (U8)((none >> 24) & 0xff);
+                   *s++ = (U8)((none >> 16) & 0xff);
+                   *s++ = (U8)((none >>  8) & 0xff);
+                   *s++ = (U8)( none        & 0xff);
+               }
+           }
+           *s = '\0';
+       }
+       else {
+           (void)memzero((U8*)s, scur + 1);
+       }
+       SvCUR_set(swatch, scur);
+       s = (U8*)SvPVX(swatch);
+
+       /* read $self->{LIST} */
+       l = (U8*)SvPV(*listsvp, lcur);
+       lend = l + lcur;
+       while (l < lend) {
+           U32 min, max, val, key;
+           STRLEN numlen;
+           I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+
+           nextline = (U8*)memchr(l, '\n', lend - l);
+
+           numlen = lend - l;
+           min = (U32)grok_hex(l, &numlen, &flags, NULL);
+           if (numlen)
+               l += numlen;
+           else if (nextline) {
+               l = nextline + 1; /* 1 is length of "\n" */
+               continue;
+           }
+           else {
+               l = lend; /* to the end of LIST, at which no \n */
+               break;
+           }
+
+           if (isBLANK(*l)) {
+               ++l;
+               flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+               numlen = lend - l;
+               max = (U32)grok_hex(l, &numlen, &flags, NULL);
+               if (numlen)
+                   l += numlen;
+               else
+                   max = min;
+
+               if (octets) {
+                   if (isBLANK(*l)) {
+                       ++l;
+                       flags = PERL_SCAN_SILENT_ILLDIGIT |
+                               PERL_SCAN_DISALLOW_PREFIX;
+                       numlen = lend - l;
+                       val = (U32)grok_hex(l, &numlen, &flags, NULL);
+                       if (numlen)
+                           l += numlen;
+                       else
+                           val = 0;
+                   }
+                   else {
+                       val = 0;
+                       if (typeto) {
+                           Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+                                            typestr, l);
+                       }
+                   }
+               }
+           }
+           else {
+               max = min;
+               if (octets) {
+                   val = 0;
+                   if (typeto) {
+                       Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+                                        typestr, l);
+                   }
+               }
+           }
+
+           if (nextline)
+               l = nextline + 1;
+           else
+               l = lend;
+
+           if (max < start)
+               continue;
+
+           if (octets) {
+               if (debug) {
+                   PerlIO_printf(Perl_error_log,
+                       "%"UVuf" %"UVuf" %"UVuf"\n",
+                       (UV)min, (UV)max, (UV)val);
+               }
+               if (min < start) {
+                   if (!none || val < none) {
+                       val += start - min;
+                   }
+                   min = start;
+               }
+               for (key = min; key <= max; key++) {
+                   U32 offset;
+                   if (key >= end)
+                       goto go_out_list;
+                   if (debug) {
+                       PerlIO_printf(Perl_error_log,
+                               "%"UVuf" => %"UVuf"\n",
+                               (UV)key, (UV)val);
+                   }
+
+               /* offset must be non-negative (start <= min <= key < end) */
+                   offset = (key - start) * octets;
+                   if (bits == 8)
+                       s[offset] = (U8)(val & 0xff);
+                   else if (bits == 16) {
+                       s[offset    ] = (U8)((val >>  8) & 0xff);
+                       s[offset + 1] = (U8)( val        & 0xff);
+                   }
+                   else if (bits == 32) {
+                       s[offset    ] = (U8)((val >> 24) & 0xff);
+                       s[offset + 1] = (U8)((val >> 16) & 0xff);
+                       s[offset + 2] = (U8)((val >>  8) & 0xff);
+                       s[offset + 3] = (U8)( val        & 0xff);
+                   }
+
+                   if (!none || val < none)
+                       ++val;
+               }
+           }
+           else {
+               if (min < start)
+                   min = start;
+               for (key = min; key <= max; key++) {
+                   U32 offset = key - start;
+                   if (key >= end)
+                       goto go_out_list;
+                   if (debug) {
+                       PerlIO_printf(Perl_error_log,
+                               "%"UVuf" => 1\n", (UV)key);
+                   }
+                   s[offset >> 3] |= 1 << (offset & 7);
+               }
+           }
+       }
+    go_out_list:
+
+       /* read $self->{EXTRAS} */
+       x = (U8*)SvPV(*extssvp, xcur);
+       xend = x + xcur;
+       while (x < xend) {
+           STRLEN namelen;
+           U8 *namestr;
+           SV** othersvp;
+           U32 otherbits;
+
+           U8 opc = *x++;
+           if (opc == '\n')
+               continue;
+
+           nextline = (U8*)memchr(x, '\n', xend - x);
+
+           if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
+               if (nextline) {
+                   x = nextline + 1;
+                   continue;
+               }
+               else {
+                   x = xend;
+                   break;
+               }
+           }
+
+           namestr = x;
+
+           if (nextline) {
+               namelen = nextline - namestr;
+               x = nextline + 1;
+           }
+           else {
+               namelen = xend - namestr;
+               x = xend;
+           }
+
+           if (debug) {
+               U8* tmpstr;
+               Newx(tmpstr, namelen + 1, U8);
+               Move(namestr, tmpstr, namelen, U8);
+               tmpstr[namelen] = '\0';
+               PerlIO_printf(Perl_error_log,
+                       "INDIRECT %c %s\n", opc, tmpstr);
+               Safefree(tmpstr);
+           }
+
+           {
+               HV* otherhv;
+               SV **otherbitssvp;
+
+               othersvp = hv_fetch(hv, namestr, namelen, FALSE);
+               if (*othersvp && SvROK(*othersvp) &&
+                                SvTYPE(SvRV(*othersvp))==SVt_PVHV)
+                   otherhv = (HV*)SvRV(*othersvp);
+               else
+                   Perl_croak(aTHX_ "otherhv is not a hash reference");
+
+               otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
+               otherbits = (U32)SvUV(*otherbitssvp);
+               if (bits < otherbits)
+                   Perl_croak(aTHX_ "SWASHGET size mismatch");
+           }
+
+           {
+               dSP;
+               ENTER;
+               SAVETMPS;
+               PUSHMARK(SP);
+               EXTEND(SP,3);
+               PUSHs(*othersvp);
+               PUSHs(sv_2mortal(newSViv(start)));
+               PUSHs(sv_2mortal(newSViv(len)));
+               PUTBACK;
+               if (call_method("SWASHGET", G_SCALAR)) {
+                   U8 *s, *o;
+                   STRLEN slen, olen;
+                   SV* tmpsv = *PL_stack_sp--;
+                   o = (U8*)SvPV(tmpsv, olen);
+
+                   if (!olen)
+                       Perl_croak(aTHX_ "SWASHGET didn't return valid swatch");
+                   s = SvPV(swatch, slen);
+                   if (bits == 1 && otherbits == 1) {
+                       if (slen != olen)
+                           Perl_croak(aTHX_ "SWASHGET length mismatch");
+
+                       switch (opc) {
+                       case '+':
+                           while (slen--)
+                               *s++ |= *o++;
+                           break;
+                       case '!':
+                           while (slen--)
+                               *s++ |= ~*o++;
+                           break;
+                       case '-':
+                           while (slen--)
+                               *s++ &= ~*o++;
+                           break;
+                       case '&':
+                           while (slen--)
+                               *s++ &= *o++;
+                           break;
+                       default:
+                           break;
+                       }
+                   }
+                   else {
+                       U32 otheroctets = otherbits / 8;
+                       U32 offset = 0;
+                       U8* send = s + slen;
+
+                       while (s < send) {
+                           U32 val = 0;
+
+                           if (otherbits == 1) {
+                               val = (o[offset >> 3] >> (offset & 7)) & 1;
+                               ++offset;
+                           }
+                           else {
+                               U32 vlen = otheroctets;
+                               val = *o++;
+                               while (--vlen) {
+                                   val <<= 8;
+                                   val |= *o++;
+                               }
+                           }
+
+                           if      (opc == '+' && val)
+                               val = 1;
+                           else if (opc == '!' && !val)
+                               val = 1;
+                           else if (opc == '-' && val)
+                               val = 0;
+                           else if (opc == '&' && !val)
+                               val = 0;
+                           else {
+                               s += octets;
+                               continue;
+                           }
+
+                           if (bits == 8)
+                               *s++ = (U8)( val & 0xff);
+                           else if (bits == 16) {
+                               *s++ = (U8)((val >>  8) & 0xff);
+                               *s++ = (U8)( val        & 0xff);
+                           }
+                           else if (bits == 32) {
+                               *s++ = (U8)((val >> 24) & 0xff);
+                               *s++ = (U8)((val >> 16) & 0xff);
+                               *s++ = (U8)((val >>  8) & 0xff);
+                               *s++ = (U8)( val        & 0xff);
+                           }
+                       }
+                   }
+               }
+               FREETMPS;
+               LEAVE;
+           }
+       }
+
+       if (debug) {
+           U8* s = (U8*)SvPVX(swatch);
+           PerlIO_printf(Perl_error_log, "CELLS ");
+           if (bits == 1) {
+               U32 key;
+               for (key = 0; key < len; key++) {
+                   int val = (s[key >> 3] >> (key & 7)) & 1;
+                   PerlIO_printf(Perl_error_log, val ? "1 " : "0 ");
+               }
+           }
+           else {
+               U8* send = s + len * octets;
+               while (s < send) {
+                   U32 vlen = octets;
+                   U32 val = *s++;
+                   while (--vlen) {
+                       val <<= 8;
+                       val |= *s++;
+                   }
+                   PerlIO_printf(Perl_error_log, "%"UVuf" ", (UV)val);
+               }
+           }
+           PerlIO_printf(Perl_error_log, "\n");
+       }
+
+       ST(0) = swatch;
+       sv_2mortal(ST(0));
+    }
+    XSRETURN(1);
+}
+
+
 /*
  * Local variables:
  * c-indentation-style: bsd
diff --git a/utf8.c b/utf8.c
index 88855bb..586fc74 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1696,8 +1696,8 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
            SV *errsv_save;
            ENTER;
            SAVETMPS;
-           save_re_context();
-           PUSHSTACKi(PERLSI_MAGIC);
+       /*  save_re_context();  */
+       /*  PUSHSTACKi(PERLSI_MAGIC);  */
            PUSHMARK(SP);
            EXTEND(SP,3);
            PUSHs((SV*)sv);
@@ -1714,7 +1714,7 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
            if (!SvTRUE(ERRSV))
                sv_setsv(ERRSV, errsv_save);
            SvREFCNT_dec(errsv_save);
-           POPSTACK;
+       /*  POPSTACK; */
            FREETMPS;
            LEAVE;
            if (IN_PERL_COMPILETIME)