Re: XS-assisted SWASHGET (esp. for t/uni/class.t speedup)
SADAHIRO Tomoyuki [Sun, 27 Nov 2005 17:02:02 +0000 (02:02 +0900)]
Message-Id: <20051127170016.A786.BQW10602@nifty.com>

p4raw-id: //depot/perl@26229

embed.fnc
embed.h
lib/utf8_heavy.pl
pod/perlapi.pod
pod/perlintern.pod
proto.h
universal.c
utf8.c
utf8.h
utfebcdic.h

index f3385fc..ba2994d 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1385,7 +1385,8 @@ sn        |NV|mulexp10    |NV value|I32 exponent
 
 #if defined(PERL_IN_UTF8_C) || defined(PERL_DECL_PROT)
 s      |STRLEN |is_utf8_char_slow|NN const U8 *s|const STRLEN len
-spR    |bool   |is_utf8_common |NN const U8 *const p|NN SV **swash|NN const char * const swashname
+sR     |bool   |is_utf8_common |NN const U8 *const p|NN SV **swash|NN const char * const swashname
+sR     |SV*    |swash_get      |NN SV* swash|UV start|UV span
 #endif
 
 START_EXTERN_C
diff --git a/embed.h b/embed.h
index d534f17..3812e2d 100644 (file)
--- a/embed.h
+++ b/embed.h
 #ifdef PERL_CORE
 #define is_utf8_char_slow      S_is_utf8_char_slow
 #define is_utf8_common         S_is_utf8_common
+#define swash_get              S_swash_get
 #endif
 #endif
 #define sv_setsv_flags         Perl_sv_setsv_flags
 #ifdef PERL_CORE
 #define is_utf8_char_slow(a,b) S_is_utf8_char_slow(aTHX_ a,b)
 #define is_utf8_common(a,b,c)  S_is_utf8_common(aTHX_ a,b,c)
+#define swash_get(a,b,c)       S_swash_get(aTHX_ a,b,c)
 #endif
 #endif
 #define sv_setsv_flags(a,b,c)  Perl_sv_setsv_flags(aTHX_ a,b,c)
index 229ed97..e5fd6e3 100644 (file)
@@ -266,12 +266,6 @@ sub SWASHNEW {
     return $SWASH;
 }
 
-# NOTE: utf8.c:swash_init() assumes entries are never modified once generated.
-sub SWASHGET {
-    # See utf8.c:Perl_swash_fetch for problems with this interface.
-    # See universal.c for XS utf8::SWASHGET_heavy.
-    # USAGE: $swatch = utf8::SWASHGET_heavy($self, $start, $len, DEBUG);
-    return utf8::SWASHGET_heavy($_[0], $_[1], $_[2], DEBUG);
-}
+# Now SWASHGET is recasted into a C function S_swash_get (see utf8.c).
 
 1;
index 497ad9f..be9249b 100644 (file)
@@ -6008,7 +6008,7 @@ of the result.
 The "swashp" is a pointer to the swash to use.
 
 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
-and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
+and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
 but not always, a multicharacter mapping), is tried first.
 
 The "special" is a string like "utf8::ToSpecLower", which means the
index ee9de89..4e6119d 100644 (file)
@@ -897,38 +897,6 @@ Found in file pp.h
 
 =over 8
 
-=item find_uninit_var
-X<find_uninit_var>
-
-Find the name of the undefined variable (if any) that caused the operator o
-to issue a "Use of uninitialized value" warning.
-If match is true, only return a name if it's value matches uninit_sv.
-So roughly speaking, if a unary operator (such as OP_COS) generates a
-warning, then following the direct child of the op may yield an
-OP_PADSV or OP_GV that gives the name of the undefined variable. On the
-other hand, with OP_ADD there are two branches to follow, so we only print
-the variable name if we get an exact match.
-
-The name is returned as a mortal SV.
-
-Assumes that PL_op is the op that originally triggered the error, and that
-PL_comppad/PL_curpad points to the currently executing pad.
-
-       SV*     find_uninit_var(OP* obase, SV* uninit_sv, bool top)
-
-=for hackers
-Found in file sv.c
-
-=item report_uninit
-X<report_uninit>
-
-Print appropriate "Use of uninitialized variable" warning
-
-       void    report_uninit(SV* uninit_sv)
-
-=for hackers
-Found in file sv.c
-
 =item sv_add_arena
 X<sv_add_arena>
 
@@ -976,6 +944,45 @@ Found in file sv.c
 
 =back
 
+=head1 Unicode Support
+
+=over 8
+
+=item find_uninit_var
+X<find_uninit_var>
+
+Find the name of the undefined variable (if any) that caused the operator o
+to issue a "Use of uninitialized value" warning.
+If match is true, only return a name if it's value matches uninit_sv.
+So roughly speaking, if a unary operator (such as OP_COS) generates a
+warning, then following the direct child of the op may yield an
+OP_PADSV or OP_GV that gives the name of the undefined variable. On the
+other hand, with OP_ADD there are two branches to follow, so we only print
+the variable name if we get an exact match.
+
+The name is returned as a mortal SV.
+
+Assumes that PL_op is the op that originally triggered the error, and that
+PL_comppad/PL_curpad points to the currently executing pad.
+
+       SV*     find_uninit_var(OP* obase, SV* uninit_sv, bool top)
+
+=for hackers
+Found in file sv.c
+
+=item report_uninit
+X<report_uninit>
+
+Print appropriate "Use of uninitialized variable" warning
+
+       void    report_uninit(SV* uninit_sv)
+
+=for hackers
+Found in file sv.c
+
+
+=back
+
 =head1 AUTHORS
 
 The autodocumentation system was originally added to the Perl core by
diff --git a/proto.h b/proto.h
index 4ad3b66..71be05a 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -3806,6 +3806,10 @@ STATIC bool      S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, const char * c
                        __attribute__nonnull__(pTHX_2)
                        __attribute__nonnull__(pTHX_3);
 
+STATIC SV*     S_swash_get(pTHX_ SV* swash, UV start, UV span)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+
 #endif
 
 START_EXTERN_C
index b3a742b..10dddb5 100644 (file)
@@ -199,7 +199,6 @@ XS(XS_Regexp_DESTROY);
 XS(XS_Internals_hash_seed);
 XS(XS_Internals_rehash_seed);
 XS(XS_Internals_HvREHASH);
-XS(XS_utf8_SWASHGET_heavy);
 
 void
 Perl_boot_core_UNIVERSAL(pTHX)
@@ -248,7 +247,6 @@ Perl_boot_core_UNIVERSAL(pTHX)
     newXSproto("Internals::hash_seed",XS_Internals_hash_seed, file, "");
     newXSproto("Internals::rehash_seed",XS_Internals_rehash_seed, file, "");
     newXSproto("Internals::HvREHASH", XS_Internals_HvREHASH, file, "\\%");
-    newXS("utf8::SWASHGET_heavy", XS_utf8_SWASHGET_heavy, file);
 }
 
 
@@ -951,417 +949,6 @@ XS(XS_Internals_HvREHASH) /* Subject to change  */
     Perl_croak(aTHX_ "Internals::HvREHASH $hashref");
 }
 
-XS(XS_utf8_SWASHGET_heavy)
-{
-    dXSARGS;
-    if (items != 4) {
-       Perl_croak(aTHX_
-           "Usage: utf8::SWASHGET_heavy($self, $start, $len, DEBUG)");
-    }
-    {
-       SV* self    = ST(0);
-       const I32 i_start = (I32)SvIV(ST(1));
-       const I32 i_len   = (I32)SvIV(ST(2));
-       const I32 debug   = (I32)SvIV(ST(3));
-       U32 start = (U32)i_start;
-       U32 len   = (U32)i_len;
-
-       HV *hv;
-       SV **listsvp, **typesvp, **bitssvp, **nonesvp, **extssvp, *swatch;
-       U8 *l, *lend, *x, *xend, *s, *nextline;
-       STRLEN lcur, xcur, scur;
-       U8* typestr;
-       int typeto;
-       U32 bits, none, end, octets;
-
-       if (SvROK(self) && SvTYPE(SvRV(self))==SVt_PVHV)
-           hv = (HV*)SvRV(self);
-       else
-           Perl_croak(aTHX_ "hv is not a hash reference");
-
-       if (i_start < 0)
-           Perl_croak(aTHX_ "SWASHGET negative start");
-       if (i_len < 0)
-           Perl_croak(aTHX_ "SWASHGET negative len");
-
-       listsvp = hv_fetch(hv, "LIST", 4, FALSE);
-       typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
-       bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
-       nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
-       extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
-       typestr = (U8*)SvPV_nolen(*typesvp);
-       typeto  = typestr[0] == 'T' && typestr[1] == 'o';
-       bits    = (U32)SvUV(*bitssvp);
-       none    = (U32)SvUV(*nonesvp);
-       end     = start + len;
-       octets  = bits >> 3; /* if bits == 1, then octets == 0 */
-
-       if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
-           Perl_croak(aTHX_ "SWASHGET unknown bits %"UVuf, (UV)bits);
-       }
-       if (debug) {
-           char* selfstr = SvPV_nolen(self);
-           PerlIO_printf(Perl_error_log, "SWASHGET ");
-           PerlIO_printf(Perl_error_log, "%s %"UVuf" %"UVuf" ",
-                                         selfstr, (UV)start, (UV)len);
-           PerlIO_printf(Perl_error_log, "[%s/%"UVuf"/%"UVuf"]\n",
-                                         typestr, (UV)bits, (UV)none);
-       }
-
-       /* initialize $swatch */
-       swatch = newSVpvn("",0);
-       scur   = octets ? (len * octets) : (len + 7) / 8;
-       SvGROW(swatch, scur + 1);
-       s = (U8*)SvPVX(swatch);
-       if (octets && none) {
-           const U8* e = s + scur;
-           while (s < e) {
-               if (bits == 8)
-                   *s++ = (U8)(none & 0xff);
-               else if (bits == 16) {
-                   *s++ = (U8)((none >>  8) & 0xff);
-                   *s++ = (U8)( none        & 0xff);
-               }
-               else if (bits == 32) {
-                   *s++ = (U8)((none >> 24) & 0xff);
-                   *s++ = (U8)((none >> 16) & 0xff);
-                   *s++ = (U8)((none >>  8) & 0xff);
-                   *s++ = (U8)( none        & 0xff);
-               }
-           }
-           *s = '\0';
-       }
-       else {
-           (void)memzero((U8*)s, scur + 1);
-       }
-       SvCUR_set(swatch, scur);
-       s = (U8*)SvPVX(swatch);
-
-       /* read $self->{LIST} */
-       l = (U8*)SvPV(*listsvp, lcur);
-       lend = l + lcur;
-       while (l < lend) {
-           U32 min, max, val, key;
-           STRLEN numlen;
-           I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-
-           nextline = (U8*)memchr(l, '\n', lend - l);
-
-           numlen = lend - l;
-           min = (U32)grok_hex((char *)l, &numlen, &flags, NULL);
-           if (numlen)
-               l += numlen;
-           else if (nextline) {
-               l = nextline + 1; /* 1 is length of "\n" */
-               continue;
-           }
-           else {
-               l = lend; /* to the end of LIST, at which no \n */
-               break;
-           }
-
-           if (isBLANK(*l)) {
-               ++l;
-               flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-               numlen = lend - l;
-               max = (U32)grok_hex((char *)l, &numlen, &flags, NULL);
-               if (numlen)
-                   l += numlen;
-               else
-                   max = min;
-
-               if (octets) {
-                   if (isBLANK(*l)) {
-                       ++l;
-                       flags = PERL_SCAN_SILENT_ILLDIGIT |
-                               PERL_SCAN_DISALLOW_PREFIX;
-                       numlen = lend - l;
-                       val = (U32)grok_hex((char *)l, &numlen, &flags, NULL);
-                       if (numlen)
-                           l += numlen;
-                       else
-                           val = 0;
-                   }
-                   else {
-                       val = 0;
-                       if (typeto) {
-                           Perl_croak(aTHX_ "%s: illegal mapping '%s'",
-                                            typestr, l);
-                       }
-                   }
-               }
-           }
-           else {
-               max = min;
-               if (octets) {
-                   val = 0;
-                   if (typeto) {
-                       Perl_croak(aTHX_ "%s: illegal mapping '%s'",
-                                        typestr, l);
-                   }
-               }
-           }
-
-           if (nextline)
-               l = nextline + 1;
-           else
-               l = lend;
-
-           if (max < start)
-               continue;
-
-           if (octets) {
-               if (debug) {
-                   PerlIO_printf(Perl_error_log,
-                       "%"UVuf" %"UVuf" %"UVuf"\n",
-                       (UV)min, (UV)max, (UV)val);
-               }
-               if (min < start) {
-                   if (!none || val < none) {
-                       val += start - min;
-                   }
-                   min = start;
-               }
-               for (key = min; key <= max; key++) {
-                   U32 offset;
-                   if (key >= end)
-                       goto go_out_list;
-                   if (debug) {
-                       PerlIO_printf(Perl_error_log,
-                               "%"UVuf" => %"UVuf"\n",
-                               (UV)key, (UV)val);
-                   }
-
-               /* offset must be non-negative (start <= min <= key < end) */
-                   offset = (key - start) * octets;
-                   if (bits == 8)
-                       s[offset] = (U8)(val & 0xff);
-                   else if (bits == 16) {
-                       s[offset    ] = (U8)((val >>  8) & 0xff);
-                       s[offset + 1] = (U8)( val        & 0xff);
-                   }
-                   else if (bits == 32) {
-                       s[offset    ] = (U8)((val >> 24) & 0xff);
-                       s[offset + 1] = (U8)((val >> 16) & 0xff);
-                       s[offset + 2] = (U8)((val >>  8) & 0xff);
-                       s[offset + 3] = (U8)( val        & 0xff);
-                   }
-
-                   if (!none || val < none)
-                       ++val;
-               }
-           }
-           else {
-               if (min < start)
-                   min = start;
-               for (key = min; key <= max; key++) {
-                   U32 offset = key - start;
-                   if (key >= end)
-                       goto go_out_list;
-                   if (debug) {
-                       PerlIO_printf(Perl_error_log,
-                               "%"UVuf" => 1\n", (UV)key);
-                   }
-                   s[offset >> 3] |= 1 << (offset & 7);
-               }
-           }
-       }
-    go_out_list:
-
-       /* read $self->{EXTRAS} */
-       x = (U8*)SvPV(*extssvp, xcur);
-       xend = x + xcur;
-       while (x < xend) {
-           STRLEN namelen;
-           U8 *namestr;
-           SV** othersvp;
-           U32 otherbits;
-
-           U8 opc = *x++;
-           if (opc == '\n')
-               continue;
-
-           nextline = (U8*)memchr(x, '\n', xend - x);
-
-           if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
-               if (nextline) {
-                   x = nextline + 1;
-                   continue;
-               }
-               else {
-                   x = xend;
-                   break;
-               }
-           }
-
-           namestr = x;
-
-           if (nextline) {
-               namelen = nextline - namestr;
-               x = nextline + 1;
-           }
-           else {
-               namelen = xend - namestr;
-               x = xend;
-           }
-
-           if (debug) {
-               U8* tmpstr;
-               Newx(tmpstr, namelen + 1, U8);
-               Move(namestr, tmpstr, namelen, U8);
-               tmpstr[namelen] = '\0';
-               PerlIO_printf(Perl_error_log,
-                       "INDIRECT %c %s\n", opc, tmpstr);
-               Safefree(tmpstr);
-           }
-
-           {
-               HV* otherhv;
-               SV **otherbitssvp;
-
-               othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
-               if (*othersvp && SvROK(*othersvp) &&
-                                SvTYPE(SvRV(*othersvp))==SVt_PVHV)
-                   otherhv = (HV*)SvRV(*othersvp);
-               else
-                   Perl_croak(aTHX_ "otherhv is not a hash reference");
-
-               otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
-               otherbits = (U32)SvUV(*otherbitssvp);
-               if (bits < otherbits)
-                   Perl_croak(aTHX_ "SWASHGET size mismatch");
-           }
-
-           {
-               dSP;
-               ENTER;
-               SAVETMPS;
-               PUSHMARK(SP);
-               EXTEND(SP,3);
-               PUSHs(*othersvp);
-               PUSHs(sv_2mortal(newSViv(start)));
-               PUSHs(sv_2mortal(newSViv(len)));
-               PUTBACK;
-               if (call_method("SWASHGET", G_SCALAR)) {
-                   U8 *s, *o;
-                   STRLEN slen, olen;
-                   SV* tmpsv = *PL_stack_sp--;
-                   o = (U8*)SvPV(tmpsv, olen);
-
-                   if (!olen)
-                       Perl_croak(aTHX_ "SWASHGET didn't return valid swatch");
-                   s = (U8*)SvPV(swatch, slen);
-                   if (bits == 1 && otherbits == 1) {
-                       if (slen != olen)
-                           Perl_croak(aTHX_ "SWASHGET length mismatch");
-
-                       switch (opc) {
-                       case '+':
-                           while (slen--)
-                               *s++ |= *o++;
-                           break;
-                       case '!':
-                           while (slen--)
-                               *s++ |= ~*o++;
-                           break;
-                       case '-':
-                           while (slen--)
-                               *s++ &= ~*o++;
-                           break;
-                       case '&':
-                           while (slen--)
-                               *s++ &= *o++;
-                           break;
-                       default:
-                           break;
-                       }
-                   }
-                   else {
-                       U32 otheroctets = otherbits / 8;
-                       U32 offset = 0;
-                       U8* send = s + slen;
-
-                       while (s < send) {
-                           U32 val = 0;
-
-                           if (otherbits == 1) {
-                               val = (o[offset >> 3] >> (offset & 7)) & 1;
-                               ++offset;
-                           }
-                           else {
-                               U32 vlen = otheroctets;
-                               val = *o++;
-                               while (--vlen) {
-                                   val <<= 8;
-                                   val |= *o++;
-                               }
-                           }
-
-                           if      (opc == '+' && val)
-                               val = 1;
-                           else if (opc == '!' && !val)
-                               val = 1;
-                           else if (opc == '-' && val)
-                               val = 0;
-                           else if (opc == '&' && !val)
-                               val = 0;
-                           else {
-                               s += octets;
-                               continue;
-                           }
-
-                           if (bits == 8)
-                               *s++ = (U8)( val & 0xff);
-                           else if (bits == 16) {
-                               *s++ = (U8)((val >>  8) & 0xff);
-                               *s++ = (U8)( val        & 0xff);
-                           }
-                           else if (bits == 32) {
-                               *s++ = (U8)((val >> 24) & 0xff);
-                               *s++ = (U8)((val >> 16) & 0xff);
-                               *s++ = (U8)((val >>  8) & 0xff);
-                               *s++ = (U8)( val        & 0xff);
-                           }
-                       }
-                   }
-               }
-               FREETMPS;
-               LEAVE;
-           }
-       }
-
-       if (debug) {
-           U8* s = (U8*)SvPVX(swatch);
-           PerlIO_printf(Perl_error_log, "CELLS ");
-           if (bits == 1) {
-               U32 key;
-               for (key = 0; key < len; key++) {
-                   int val = (s[key >> 3] >> (key & 7)) & 1;
-                   PerlIO_printf(Perl_error_log, val ? "1 " : "0 ");
-               }
-           }
-           else {
-               U8* send = s + len * octets;
-               while (s < send) {
-                   U32 vlen = octets;
-                   U32 val = *s++;
-                   while (--vlen) {
-                       val <<= 8;
-                       val |= *s++;
-                   }
-                   PerlIO_printf(Perl_error_log, "%"UVuf" ", (UV)val);
-               }
-           }
-           PerlIO_printf(Perl_error_log, "\n");
-       }
-
-       ST(0) = swatch;
-       sv_2mortal(ST(0));
-    }
-    XSRETURN(1);
-}
-
-
 /*
  * Local variables:
  * c-indentation-style: bsd
diff --git a/utf8.c b/utf8.c
index 690e4c2..813a64f 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1621,6 +1621,7 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
  * the lower-level routine, and it is similarly broken for returning
  * multiple values.  --jhi */
+/* Now SWASHGET is recasted into S_swash_get in this file. */
 UV
 Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
 {
@@ -1632,14 +1633,14 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
     STRLEN needents;
     const U8 *tmps = NULL;
     U32 bit;
-    SV *retval;
+    SV *swatch;
     U8 tmputf8[2];
     UV c = NATIVE_TO_ASCII(*ptr);
 
     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
-        tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
-        tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
-        ptr = tmputf8;
+       tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
+       tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
+       ptr = tmputf8;
     }
     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
      * then the "swatch" is a vec() for al the chars which start
@@ -1649,20 +1650,18 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
     klen = UTF8SKIP(ptr) - 1;
     off  = ptr[klen];
 
-    if (klen == 0)
-     {
+    if (klen == 0) {
       /* If char in invariant then swatch is for all the invariant chars
        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
        */
-      needents = UTF_CONTINUATION_MARK;
-      off      = NATIVE_TO_UTF(ptr[klen]);
-     }
-    else
-     {
+       needents = UTF_CONTINUATION_MARK;
+       off      = NATIVE_TO_UTF(ptr[klen]);
+    }
+    else {
       /* If char is encoded then swatch is for the prefix */
-      needents = (1 << UTF_ACCUMULATION_SHIFT);
-      off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
-     }
+       needents = (1 << UTF_ACCUMULATION_SHIFT);
+       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+    }
 
     /*
      * This single-entry cache saves about 1/3 of the utf8 overhead in test
@@ -1684,46 +1683,28 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
        /* Try our second-level swatch cache, kept in a hash. */
        SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
 
-       /* If not cached, generate it via utf8::SWASHGET */
-       if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
-           dSP;
+       /* If not cached, generate it via swash_get */
+       if (!svp || !SvPOK(*svp)
+                || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
            /* We use utf8n_to_uvuni() as we want an index into
               Unicode tables, not a native character number.
             */
            const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
                                           ckWARN(WARN_UTF8) ?
                                           0 : UTF8_ALLOW_ANY);
-           SV *errsv_save;
-           ENTER;
-           SAVETMPS;
-       /*  save_re_context(); */ /* Now SWASHGET doesn't use regex */
-           PUSHSTACKi(PERLSI_MAGIC);
-           PUSHMARK(SP);
-           EXTEND(SP,3);
-           PUSHs((SV*)sv);
-           /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
-           PUSHs(sv_2mortal(newSViv((klen) ?
-                                    (code_point & ~(needents - 1)) : 0)));
-           PUSHs(sv_2mortal(newSViv(needents)));
-           PUTBACK;
-           errsv_save = newSVsv(ERRSV);
-           if (call_method("SWASHGET", G_SCALAR))
-               retval = newSVsv(*PL_stack_sp--);
-           else
-               retval = &PL_sv_undef;
-           if (!SvTRUE(ERRSV))
-               sv_setsv(ERRSV, errsv_save);
-           SvREFCNT_dec(errsv_save);
-           POPSTACK;
-           FREETMPS;
-           LEAVE;
+           swatch = swash_get(sv,
+                   /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
+                               (klen) ? (code_point & ~(needents - 1)) : 0,
+                               needents);
+
            if (IN_PERL_COMPILETIME)
                PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
 
-           svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
+           svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
 
-           if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
-               Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
+           if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
+                    || (slen << 3) < needents)
+               Perl_croak(aTHX_ "The swatch does not have proper length");
        }
 
        PL_last_swash_hv = hv;
@@ -1753,6 +1734,319 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
     return 0;
 }
 
+/* Note:
+ * Returns a swatch (a bit vector string) for a code point sequence
+ * that starts from the value C<start> and comprises the number C<span>.
+ * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
+ * Should be used via swash_fetch, which will cache the swatch in C<swash>.
+ */
+STATIC SV*
+S_swash_get(pTHX_ SV* swash, UV start, UV span)
+{
+    SV *swatch;
+    U8 *l, *lend, *x, *xend, *s, *nl;
+    STRLEN lcur, xcur, scur;
+
+    HV* const hv = (HV*)SvRV(swash);
+    SV** listsvp = hv_fetch(hv, "LIST", 4, FALSE);
+    SV** typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
+    SV** bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
+    SV** nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
+    SV** extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
+    U8*  typestr = (U8*)SvPV_nolen(*typesvp);
+    int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
+    STRLEN bits  = SvUV(*bitssvp);
+    STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+    UV     none  = SvUV(*nonesvp);
+    UV     end   = start + span;
+
+    if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
+       Perl_croak(aTHX_ "swash_get: unknown bits %"UVuf, (UV) bits);
+    }
+
+    /* create and initialize $swatch */
+    swatch = newSVpvn("",0);
+    scur   = octets ? (span * octets) : (span + 7) / 8;
+    SvGROW(swatch, scur + 1);
+    s = (U8*)SvPVX(swatch);
+    if (octets && none) {
+       const U8* e = s + scur;
+       while (s < e) {
+           if (bits == 8)
+               *s++ = (U8)(none & 0xff);
+           else if (bits == 16) {
+               *s++ = (U8)((none >>  8) & 0xff);
+               *s++ = (U8)( none        & 0xff);
+           }
+           else if (bits == 32) {
+               *s++ = (U8)((none >> 24) & 0xff);
+               *s++ = (U8)((none >> 16) & 0xff);
+               *s++ = (U8)((none >>  8) & 0xff);
+               *s++ = (U8)( none        & 0xff);
+           }
+       }
+       *s = '\0';
+    }
+    else {
+       (void)memzero((U8*)s, scur + 1);
+    }
+    SvCUR_set(swatch, scur);
+    s = (U8*)SvPVX(swatch);
+
+    /* read $swash->{LIST} */
+    l = (U8*)SvPV(*listsvp, lcur);
+    lend = l + lcur;
+    while (l < lend) {
+       UV min, max, val, key;
+       STRLEN numlen;
+       I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+
+       nl = (U8*)memchr(l, '\n', lend - l);
+
+       numlen = lend - l;
+       min = grok_hex((char *)l, &numlen, &flags, NULL);
+       if (numlen)
+           l += numlen;
+       else if (nl) {
+           l = nl + 1; /* 1 is length of "\n" */
+           continue;
+       }
+       else {
+           l = lend; /* to LIST's end at which \n is not found */
+           break;
+       }
+
+       if (isBLANK(*l)) {
+           ++l;
+           flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+           numlen = lend - l;
+           max = grok_hex((char *)l, &numlen, &flags, NULL);
+           if (numlen)
+               l += numlen;
+           else
+               max = min;
+
+           if (octets) {
+               if (isBLANK(*l)) {
+                   ++l;
+                   flags = PERL_SCAN_SILENT_ILLDIGIT |
+                           PERL_SCAN_DISALLOW_PREFIX;
+                   numlen = lend - l;
+                   val = grok_hex((char *)l, &numlen, &flags, NULL);
+                   if (numlen)
+                       l += numlen;
+                   else
+                       val = 0;
+               }
+               else {
+                   val = 0;
+                   if (typeto) {
+                       Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+                                        typestr, l);
+                   }
+               }
+           }
+       }
+       else {
+           max = min;
+           if (octets) {
+               val = 0;
+               if (typeto) {
+                   Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
+               }
+           }
+       }
+
+       if (nl)
+           l = nl + 1;
+       else
+           l = lend;
+
+       if (max < start)
+           continue;
+
+       if (octets) {
+           if (min < start) {
+               if (!none || val < none) {
+                   val += start - min;
+               }
+               min = start;
+           }
+           for (key = min; key <= max; key++) {
+               STRLEN offset;
+               if (key >= end)
+                   goto go_out_list;
+               /* offset must be non-negative (start <= min <= key < end) */
+               offset = octets * (key - start);
+               if (bits == 8)
+                   s[offset] = (U8)(val & 0xff);
+               else if (bits == 16) {
+                   s[offset    ] = (U8)((val >>  8) & 0xff);
+                   s[offset + 1] = (U8)( val        & 0xff);
+               }
+               else if (bits == 32) {
+                   s[offset    ] = (U8)((val >> 24) & 0xff);
+                   s[offset + 1] = (U8)((val >> 16) & 0xff);
+                   s[offset + 2] = (U8)((val >>  8) & 0xff);
+                   s[offset + 3] = (U8)( val        & 0xff);
+               }
+
+               if (!none || val < none)
+                   ++val;
+           }
+       }
+       else {
+           if (min < start)
+               min = start;
+           for (key = min; key <= max; key++) {
+               STRLEN offset = (STRLEN)(key - start);
+               if (key >= end)
+                   goto go_out_list;
+               s[offset >> 3] |= 1 << (offset & 7);
+           }
+       }
+    } /* while */
+  go_out_list:
+
+    /* read $swash->{EXTRAS} */
+    x = (U8*)SvPV(*extssvp, xcur);
+    xend = x + xcur;
+    while (x < xend) {
+       STRLEN namelen;
+       U8 *namestr;
+       SV** othersvp;
+       HV* otherhv;
+       STRLEN otherbits;
+       SV **otherbitssvp, *other;
+       U8 *s, *o;
+       STRLEN slen, olen;
+
+       U8 opc = *x++;
+       if (opc == '\n')
+           continue;
+
+       nl = (U8*)memchr(x, '\n', xend - x);
+
+       if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
+           if (nl) {
+               x = nl + 1; /* 1 is length of "\n" */
+               continue;
+           }
+           else {
+               x = xend; /* to EXTRAS' end at which \n is not found */
+               break;
+           }
+       }
+
+       namestr = x;
+       if (nl) {
+           namelen = nl - namestr;
+           x = nl + 1;
+       }
+       else {
+           namelen = xend - namestr;
+           x = xend;
+       }
+
+       othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
+       if (*othersvp && SvROK(*othersvp) &&
+                        SvTYPE(SvRV(*othersvp))==SVt_PVHV)
+           otherhv = (HV*)SvRV(*othersvp);
+       else
+           Perl_croak(aTHX_ "otherhv is not a hash reference");
+
+       otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
+       otherbits = (STRLEN)SvUV(*otherbitssvp);
+       if (bits < otherbits)
+           Perl_croak(aTHX_ "swash_get: swatch size mismatch");
+
+       /* The "other" swatch must be destroyed after. */
+       other = swash_get(*othersvp, start, span);
+       o = (U8*)SvPV(other, olen);
+
+       if (!olen)
+           Perl_croak(aTHX_ "swash_get didn't return valid swatch for other");
+
+       s = (U8*)SvPV(swatch, slen);
+       if (bits == 1 && otherbits == 1) {
+           if (slen != olen)
+               Perl_croak(aTHX_ "swash_get: swatch length mismatch");
+
+           switch (opc) {
+           case '+':
+               while (slen--)
+                   *s++ |= *o++;
+               break;
+           case '!':
+               while (slen--)
+                   *s++ |= ~*o++;
+               break;
+           case '-':
+               while (slen--)
+                   *s++ &= ~*o++;
+               break;
+           case '&':
+               while (slen--)
+                   *s++ &= *o++;
+               break;
+           default:
+               break;
+           }
+       }
+       else { /* bits >= 8 */
+              /* XXX: but weirdly otherval is treated as boolean */
+           STRLEN otheroctets = otherbits >> 3;
+           STRLEN offset = 0;
+           U8* send = s + slen;
+
+           while (s < send) {
+               UV otherval = 0;
+
+               if (otherbits == 1) {
+                   otherval = (o[offset >> 3] >> (offset & 7)) & 1;
+                   ++offset;
+               }
+               else {
+                   STRLEN vlen = otheroctets;
+                   otherval = *o++;
+                   while (--vlen) {
+                       otherval <<= 8;
+                       otherval |= *o++;
+                   }
+               }
+
+               if      (opc == '+' && otherval)
+                   otherval = 1;
+               else if (opc == '!' && !otherval)
+                   otherval = 1;
+               else if (opc == '-' && otherval)
+                   otherval = 0;
+               else if (opc == '&' && !otherval)
+                   otherval = 0;
+               else {
+                   s += octets; /* not modify orig swatch */
+                   continue;
+               }
+
+               if (bits == 8)
+                   *s++ = (U8)( otherval & 0xff);
+               else if (bits == 16) {
+                   *s++ = (U8)((otherval >>  8) & 0xff);
+                   *s++ = (U8)( otherval        & 0xff);
+               }
+               else if (bits == 32) {
+                   *s++ = (U8)((otherval >> 24) & 0xff);
+                   *s++ = (U8)((otherval >> 16) & 0xff);
+                   *s++ = (U8)((otherval >>  8) & 0xff);
+                   *s++ = (U8)( otherval        & 0xff);
+               }
+           }
+       }
+       sv_free(other); /* through with it! */
+    } /* while */
+    return swatch;
+}
+
 /*
 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
 
diff --git a/utf8.h b/utf8.h
index 837f7fb..d1989ec 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -145,7 +145,7 @@ encoded character.
  * Note: we try to be careful never to call the isXXX_utf8() functions
  * unless we're pretty sure we've seen the beginning of a UTF-8 character
  * (that is, the two high bits are set).  Otherwise we risk loading in the
- * heavy-duty SWASHINIT and SWASHGET routines unnecessarily.
+ * heavy-duty swash_init and swash_fetch routines unnecessarily.
  */
 #define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
                                ? isIDFIRST(*(p)) \
index 7d03608..bdc1359 100644 (file)
@@ -357,8 +357,8 @@ END_EXTERN_C
 /*
  * Note: we should try and be careful never to call the isXXX_utf8() functions
  * unless we're pretty sure we've seen the beginning of a UTF-EBCDIC character
- * Otherwise we risk loading in the heavy-duty SWASHINIT and SWASHGET routines
- * unnecessarily.
+ * Otherwise we risk loading in the heavy-duty swash_init and swash_fetch
+ * routines unnecessarily.
  */
 
 #define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || UTF8_IS_INVARIANT(*p))) \