From: SADAHIRO Tomoyuki Date: Wed, 23 Nov 2005 17:57:34 +0000 (+0900) Subject: XS-assisted SWASHGET (esp. for t/uni/class.t speedup) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4a818d86735b88cd762faade9872a9c2e89ab057;p=p5sagit%2Fp5-mst-13.2.git XS-assisted SWASHGET (esp. for t/uni/class.t speedup) Message-Id: <20051123175603.FFD5.BQW10602@nifty.com> And : Message-Id: <20051123202935.4D9D.BQW10602@nifty.com> with some nits to use U8 instead of char more consistently p4raw-id: //depot/perl@26199 --- diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index b6fdeb9..229ed97 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -267,146 +267,11 @@ sub SWASHNEW { } # NOTE: utf8.c:swash_init() assumes entries are never modified once generated. - sub SWASHGET { # See utf8.c:Perl_swash_fetch for problems with this interface. - my ($self, $start, $len) = @_; - local $^D = 0 if $^D; - my $type = $self->{TYPE}; - my $bits = $self->{BITS}; - my $none = $self->{NONE}; - print STDERR "SWASHGET @_ [$type/$bits/$none]\n" if DEBUG; - my $end = $start + $len; - my $swatch = ""; - my $key; - vec($swatch, $len - 1, $bits) = 0; # Extend to correct length. - if ($none) { - for $key (0 .. $len - 1) { vec($swatch, $key, $bits) = $none } - } - - for ($self->{LIST}) { - pos $_ = 0; - if ($bits > 1) { - LINE: - while (/^([0-9a-fA-F]+)(?:[ \t]([0-9a-fA-F]+)?)?(?:[ \t]([0-9a-fA-F]+))?/mg) { - chomp; - my ($a, $b, $c) = ($1, $2, $3); - croak "$type: illegal mapping '$_'" - if $type =~ /^To/ && - !(defined $a && defined $c); - my $min = hex $a; - my $max = defined $b ? hex $b : $min; - my $val = defined $c ? hex $c : 0; - next if $max < $start; - print "$min $max $val\n" if DEBUG; - if ($none) { - if ($min < $start) { - $val += $start - $min if $val < $none; - $min = $start; - } - for ($key = $min; $key <= $max; $key++) { - last LINE if $key >= $end; - print STDERR "$key => $val\n" if DEBUG; - vec($swatch, $key - $start, $bits) = $val; - ++$val if $val < $none; - } - } - else { - if ($min < $start) { - $val += $start - $min; - $min = $start; - } - for ($key = $min; $key <= $max; $key++, $val++) { - last LINE if $key >= $end; - print STDERR "$key => $val\n" if DEBUG; - vec($swatch, $key - $start, $bits) = $val; - } - } - } - } - else { - LINE: - while (/^([0-9a-fA-F]+)(?:[ \t]+([0-9a-fA-F]+))?/mg) { - chomp; - my $min = hex $1; - my $max = defined $2 ? hex $2 : $min; - next if $max < $start; - if ($min < $start) { - $min = $start; - } - for ($key = $min; $key <= $max; $key++) { - last LINE if $key >= $end; - print STDERR "$key => 1\n" if DEBUG; - vec($swatch, $key - $start, 1) = 1; - } - } - } - } - for my $x ($self->{EXTRAS}) { - pos $x = 0; - while ($x =~ /^([-+!&])(.*)/mg) { - my $char = $1; - my $name = $2; - print STDERR "INDIRECT $1 $2\n" if DEBUG; - my $otherbits = $self->{$name}->{BITS}; - croak("SWASHGET size mismatch") if $bits < $otherbits; - my $other = $self->{$name}->SWASHGET($start, $len); - if ($char eq '+') { - if ($bits == 1 and $otherbits == 1) { - $swatch |= $other; - } - else { - for ($key = 0; $key < $len; $key++) { - vec($swatch, $key, $bits) = vec($other, $key, $otherbits); - } - } - } - elsif ($char eq '!') { - if ($bits == 1 and $otherbits == 1) { - $swatch |= ~$other; - } - else { - for ($key = 0; $key < $len; $key++) { - if (!vec($other, $key, $otherbits)) { - vec($swatch, $key, $bits) = 1; - } - } - } - } - elsif ($char eq '-') { - if ($bits == 1 and $otherbits == 1) { - $swatch &= ~$other; - } - else { - for ($key = 0; $key < $len; $key++) { - if (vec($other, $key, $otherbits)) { - vec($swatch, $key, $bits) = 0; - } - } - } - } - elsif ($char eq '&') { - if ($bits == 1 and $otherbits == 1) { - $swatch &= $other; - } - else { - for ($key = 0; $key < $len; $key++) { - if (!vec($other, $key, $otherbits)) { - vec($swatch, $key, $bits) = 0; - } - } - } - } - } - } - if (DEBUG) { - print STDERR "CELLS "; - for ($key = 0; $key < $len; $key++) { - print STDERR vec($swatch, $key, $bits), " "; - } - print STDERR "\n"; - } - $swatch; + # See universal.c for XS utf8::SWASHGET_heavy. + # USAGE: $swatch = utf8::SWASHGET_heavy($self, $start, $len, DEBUG); + return utf8::SWASHGET_heavy($_[0], $_[1], $_[2], DEBUG); } 1; diff --git a/t/op/pat.t b/t/op/pat.t index 69d7305..4ab37ad 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..1187\n"; +print "1..1191\n"; BEGIN { chdir 't' if -d 't'; @@ -3006,6 +3006,15 @@ END print "\x{0712}" =~ /\p{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++; print "\x{072F}" =~ /\P{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "# user-defined character properties may lack \\n at the end\n"; +sub InGreekSmall { return "03B1\t03C9" } +sub InGreekCapital { return "0391\t03A9\n-03A2" } + +ok("\x{03C0}" =~ /\p{InGreekSmall}/, "Small pi"); +ok("\x{03C2}" =~ /\p{InGreekSmall}/, "Final sigma"); +ok("\x{03A0}" =~ /\p{InGreekCapital}/, "Capital PI"); +ok("\x{03A2}" =~ /\P{InGreekCapital}/, "Reserved"); + { print "# Change #18179\n"; # previously failed with "panic: end_shift @@ -3402,5 +3411,5 @@ ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) "# TODO assigning to original string should not corrupt match vars"); } -# last test 1187 +# last test 1191 diff --git a/universal.c b/universal.c index 10dddb5..4d44aa7 100644 --- a/universal.c +++ b/universal.c @@ -199,6 +199,7 @@ XS(XS_Regexp_DESTROY); XS(XS_Internals_hash_seed); XS(XS_Internals_rehash_seed); XS(XS_Internals_HvREHASH); +XS(XS_utf8_SWASHGET_heavy); void Perl_boot_core_UNIVERSAL(pTHX) @@ -247,6 +248,7 @@ Perl_boot_core_UNIVERSAL(pTHX) newXSproto("Internals::hash_seed",XS_Internals_hash_seed, file, ""); newXSproto("Internals::rehash_seed",XS_Internals_rehash_seed, file, ""); newXSproto("Internals::HvREHASH", XS_Internals_HvREHASH, file, "\\%"); + newXS("utf8::SWASHGET_heavy", XS_utf8_SWASHGET_heavy, file); } @@ -949,6 +951,417 @@ XS(XS_Internals_HvREHASH) /* Subject to change */ Perl_croak(aTHX_ "Internals::HvREHASH $hashref"); } +XS(XS_utf8_SWASHGET_heavy) +{ + dXSARGS; + if (items != 4) { + Perl_croak(aTHX_ + "Usage: utf8::SWASHGET_heavy($self, $start, $len, DEBUG)"); + } + { + SV* self = ST(0); + const I32 i_start = (I32)SvIV(ST(1)); + const I32 i_len = (I32)SvIV(ST(2)); + const I32 debug = (I32)SvIV(ST(3)); + U32 start = (U32)i_start; + U32 len = (U32)i_len; + + HV *hv; + SV **listsvp, **typesvp, **bitssvp, **nonesvp, **extssvp, *swatch; + U8 *l, *lend, *x, *xend, *s, *nextline; + STRLEN lcur, xcur, scur; + U8* typestr; + int typeto; + U32 bits, none, end, octets; + + if (SvROK(self) && SvTYPE(SvRV(self))==SVt_PVHV) + hv = (HV*)SvRV(self); + else + Perl_croak(aTHX_ "hv is not a hash reference"); + + if (i_start < 0) + Perl_croak(aTHX_ "SWASHGET negative start"); + if (i_len < 0) + Perl_croak(aTHX_ "SWASHGET negative len"); + + listsvp = hv_fetch(hv, "LIST", 4, FALSE); + typesvp = hv_fetch(hv, "TYPE", 4, FALSE); + bitssvp = hv_fetch(hv, "BITS", 4, FALSE); + nonesvp = hv_fetch(hv, "NONE", 4, FALSE); + extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE); + typestr = SvPV_nolen(*typesvp); + typeto = typestr[0] == 'T' && typestr[1] == 'o'; + bits = (U32)SvUV(*bitssvp); + none = (U32)SvUV(*nonesvp); + end = start + len; + octets = bits >> 3; /* if bits == 1, then octets == 0 */ + + if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { + Perl_croak(aTHX_ "SWASHGET unknown bits %"UVuf, (UV)bits); + } + if (debug) { + char* selfstr = SvPV_nolen(self); + PerlIO_printf(Perl_error_log, "SWASHGET "); + PerlIO_printf(Perl_error_log, "%s %"UVuf" %"UVuf" ", + selfstr, (UV)start, (UV)len); + PerlIO_printf(Perl_error_log, "[%s/%"UVuf"/%"UVuf"]\n", + typestr, (UV)bits, (UV)none); + } + + /* initialize $swatch */ + swatch = newSVpvn("",0); + scur = octets ? (len * octets) : (len + 7) / 8; + SvGROW(swatch, scur + 1); + s = (U8*)SvPVX(swatch); + if (octets && none) { + const U8* e = s + scur; + while (s < e) { + if (bits == 8) + *s++ = (U8)(none & 0xff); + else if (bits == 16) { + *s++ = (U8)((none >> 8) & 0xff); + *s++ = (U8)( none & 0xff); + } + else if (bits == 32) { + *s++ = (U8)((none >> 24) & 0xff); + *s++ = (U8)((none >> 16) & 0xff); + *s++ = (U8)((none >> 8) & 0xff); + *s++ = (U8)( none & 0xff); + } + } + *s = '\0'; + } + else { + (void)memzero((U8*)s, scur + 1); + } + SvCUR_set(swatch, scur); + s = (U8*)SvPVX(swatch); + + /* read $self->{LIST} */ + l = (U8*)SvPV(*listsvp, lcur); + lend = l + lcur; + while (l < lend) { + U32 min, max, val, key; + STRLEN numlen; + I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; + + nextline = (U8*)memchr(l, '\n', lend - l); + + numlen = lend - l; + min = (U32)grok_hex(l, &numlen, &flags, NULL); + if (numlen) + l += numlen; + else if (nextline) { + l = nextline + 1; /* 1 is length of "\n" */ + continue; + } + else { + l = lend; /* to the end of LIST, at which no \n */ + break; + } + + if (isBLANK(*l)) { + ++l; + flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; + numlen = lend - l; + max = (U32)grok_hex(l, &numlen, &flags, NULL); + if (numlen) + l += numlen; + else + max = min; + + if (octets) { + if (isBLANK(*l)) { + ++l; + flags = PERL_SCAN_SILENT_ILLDIGIT | + PERL_SCAN_DISALLOW_PREFIX; + numlen = lend - l; + val = (U32)grok_hex(l, &numlen, &flags, NULL); + if (numlen) + l += numlen; + else + val = 0; + } + else { + val = 0; + if (typeto) { + Perl_croak(aTHX_ "%s: illegal mapping '%s'", + typestr, l); + } + } + } + } + else { + max = min; + if (octets) { + val = 0; + if (typeto) { + Perl_croak(aTHX_ "%s: illegal mapping '%s'", + typestr, l); + } + } + } + + if (nextline) + l = nextline + 1; + else + l = lend; + + if (max < start) + continue; + + if (octets) { + if (debug) { + PerlIO_printf(Perl_error_log, + "%"UVuf" %"UVuf" %"UVuf"\n", + (UV)min, (UV)max, (UV)val); + } + if (min < start) { + if (!none || val < none) { + val += start - min; + } + min = start; + } + for (key = min; key <= max; key++) { + U32 offset; + if (key >= end) + goto go_out_list; + if (debug) { + PerlIO_printf(Perl_error_log, + "%"UVuf" => %"UVuf"\n", + (UV)key, (UV)val); + } + + /* offset must be non-negative (start <= min <= key < end) */ + offset = (key - start) * octets; + if (bits == 8) + s[offset] = (U8)(val & 0xff); + else if (bits == 16) { + s[offset ] = (U8)((val >> 8) & 0xff); + s[offset + 1] = (U8)( val & 0xff); + } + else if (bits == 32) { + s[offset ] = (U8)((val >> 24) & 0xff); + s[offset + 1] = (U8)((val >> 16) & 0xff); + s[offset + 2] = (U8)((val >> 8) & 0xff); + s[offset + 3] = (U8)( val & 0xff); + } + + if (!none || val < none) + ++val; + } + } + else { + if (min < start) + min = start; + for (key = min; key <= max; key++) { + U32 offset = key - start; + if (key >= end) + goto go_out_list; + if (debug) { + PerlIO_printf(Perl_error_log, + "%"UVuf" => 1\n", (UV)key); + } + s[offset >> 3] |= 1 << (offset & 7); + } + } + } + go_out_list: + + /* read $self->{EXTRAS} */ + x = (U8*)SvPV(*extssvp, xcur); + xend = x + xcur; + while (x < xend) { + STRLEN namelen; + U8 *namestr; + SV** othersvp; + U32 otherbits; + + U8 opc = *x++; + if (opc == '\n') + continue; + + nextline = (U8*)memchr(x, '\n', xend - x); + + if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { + if (nextline) { + x = nextline + 1; + continue; + } + else { + x = xend; + break; + } + } + + namestr = x; + + if (nextline) { + namelen = nextline - namestr; + x = nextline + 1; + } + else { + namelen = xend - namestr; + x = xend; + } + + if (debug) { + U8* tmpstr; + Newx(tmpstr, namelen + 1, U8); + Move(namestr, tmpstr, namelen, U8); + tmpstr[namelen] = '\0'; + PerlIO_printf(Perl_error_log, + "INDIRECT %c %s\n", opc, tmpstr); + Safefree(tmpstr); + } + + { + HV* otherhv; + SV **otherbitssvp; + + othersvp = hv_fetch(hv, namestr, namelen, FALSE); + if (*othersvp && SvROK(*othersvp) && + SvTYPE(SvRV(*othersvp))==SVt_PVHV) + otherhv = (HV*)SvRV(*othersvp); + else + Perl_croak(aTHX_ "otherhv is not a hash reference"); + + otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE); + otherbits = (U32)SvUV(*otherbitssvp); + if (bits < otherbits) + Perl_croak(aTHX_ "SWASHGET size mismatch"); + } + + { + dSP; + ENTER; + SAVETMPS; + PUSHMARK(SP); + EXTEND(SP,3); + PUSHs(*othersvp); + PUSHs(sv_2mortal(newSViv(start))); + PUSHs(sv_2mortal(newSViv(len))); + PUTBACK; + if (call_method("SWASHGET", G_SCALAR)) { + U8 *s, *o; + STRLEN slen, olen; + SV* tmpsv = *PL_stack_sp--; + o = (U8*)SvPV(tmpsv, olen); + + if (!olen) + Perl_croak(aTHX_ "SWASHGET didn't return valid swatch"); + s = SvPV(swatch, slen); + if (bits == 1 && otherbits == 1) { + if (slen != olen) + Perl_croak(aTHX_ "SWASHGET length mismatch"); + + switch (opc) { + case '+': + while (slen--) + *s++ |= *o++; + break; + case '!': + while (slen--) + *s++ |= ~*o++; + break; + case '-': + while (slen--) + *s++ &= ~*o++; + break; + case '&': + while (slen--) + *s++ &= *o++; + break; + default: + break; + } + } + else { + U32 otheroctets = otherbits / 8; + U32 offset = 0; + U8* send = s + slen; + + while (s < send) { + U32 val = 0; + + if (otherbits == 1) { + val = (o[offset >> 3] >> (offset & 7)) & 1; + ++offset; + } + else { + U32 vlen = otheroctets; + val = *o++; + while (--vlen) { + val <<= 8; + val |= *o++; + } + } + + if (opc == '+' && val) + val = 1; + else if (opc == '!' && !val) + val = 1; + else if (opc == '-' && val) + val = 0; + else if (opc == '&' && !val) + val = 0; + else { + s += octets; + continue; + } + + if (bits == 8) + *s++ = (U8)( val & 0xff); + else if (bits == 16) { + *s++ = (U8)((val >> 8) & 0xff); + *s++ = (U8)( val & 0xff); + } + else if (bits == 32) { + *s++ = (U8)((val >> 24) & 0xff); + *s++ = (U8)((val >> 16) & 0xff); + *s++ = (U8)((val >> 8) & 0xff); + *s++ = (U8)( val & 0xff); + } + } + } + } + FREETMPS; + LEAVE; + } + } + + if (debug) { + U8* s = (U8*)SvPVX(swatch); + PerlIO_printf(Perl_error_log, "CELLS "); + if (bits == 1) { + U32 key; + for (key = 0; key < len; key++) { + int val = (s[key >> 3] >> (key & 7)) & 1; + PerlIO_printf(Perl_error_log, val ? "1 " : "0 "); + } + } + else { + U8* send = s + len * octets; + while (s < send) { + U32 vlen = octets; + U32 val = *s++; + while (--vlen) { + val <<= 8; + val |= *s++; + } + PerlIO_printf(Perl_error_log, "%"UVuf" ", (UV)val); + } + } + PerlIO_printf(Perl_error_log, "\n"); + } + + ST(0) = swatch; + sv_2mortal(ST(0)); + } + XSRETURN(1); +} + + /* * Local variables: * c-indentation-style: bsd diff --git a/utf8.c b/utf8.c index 88855bb..586fc74 100644 --- a/utf8.c +++ b/utf8.c @@ -1696,8 +1696,8 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8) SV *errsv_save; ENTER; SAVETMPS; - save_re_context(); - PUSHSTACKi(PERLSI_MAGIC); + /* save_re_context(); */ + /* PUSHSTACKi(PERLSI_MAGIC); */ PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); @@ -1714,7 +1714,7 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8) if (!SvTRUE(ERRSV)) sv_setsv(ERRSV, errsv_save); SvREFCNT_dec(errsv_save); - POPSTACK; + /* POPSTACK; */ FREETMPS; LEAVE; if (IN_PERL_COMPILETIME)