From: SADAHIRO Tomoyuki <BQW10602@nifty.com>
Date: Wed, 23 Nov 2005 17:57:34 +0000 (+0900)
Subject: XS-assisted SWASHGET (esp. for t/uni/class.t speedup)
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4a818d86735b88cd762faade9872a9c2e89ab057;p=p5sagit%2Fp5-mst-13.2.git

XS-assisted SWASHGET (esp. for t/uni/class.t speedup)
Message-Id: <20051123175603.FFD5.BQW10602@nifty.com>

And :
Message-Id: <20051123202935.4D9D.BQW10602@nifty.com>

with some nits to use U8 instead of char more consistently

p4raw-id: //depot/perl@26199
---

diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index b6fdeb9..229ed97 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -267,146 +267,11 @@ sub SWASHNEW {
 }
 
 # NOTE: utf8.c:swash_init() assumes entries are never modified once generated.
-
 sub SWASHGET {
     # See utf8.c:Perl_swash_fetch for problems with this interface.
-    my ($self, $start, $len) = @_;
-    local $^D = 0 if $^D;
-    my $type = $self->{TYPE};
-    my $bits = $self->{BITS};
-    my $none = $self->{NONE};
-    print STDERR "SWASHGET @_ [$type/$bits/$none]\n" if DEBUG;
-    my $end = $start + $len;
-    my $swatch = "";
-    my $key;
-    vec($swatch, $len - 1, $bits) = 0;	# Extend to correct length.
-    if ($none) {
-	for $key (0 .. $len - 1) { vec($swatch, $key, $bits) = $none }
-    }
-
-    for ($self->{LIST}) {
-	pos $_ = 0;
-	if ($bits > 1) {
-	  LINE:
-	    while (/^([0-9a-fA-F]+)(?:[ \t]([0-9a-fA-F]+)?)?(?:[ \t]([0-9a-fA-F]+))?/mg) {
-		chomp;
-		my ($a, $b, $c) = ($1, $2, $3);
-		croak "$type: illegal mapping '$_'"
-		    if $type =~ /^To/ &&
-		       !(defined $a && defined $c);
-		my $min = hex $a;
-		my $max = defined $b ? hex $b : $min;
-		my $val = defined $c ? hex $c : 0;
-		next if $max < $start;
-		print "$min $max $val\n" if DEBUG;
-		if ($none) {
-		    if ($min < $start) {
-			$val += $start - $min if $val < $none;
-			$min = $start;
-		    }
-		    for ($key = $min; $key <= $max; $key++) {
-			last LINE if $key >= $end;
-			print STDERR "$key => $val\n" if DEBUG;
-			vec($swatch, $key - $start, $bits) = $val;
-			++$val if $val < $none;
-		    }
-		}
-		else {
-		    if ($min < $start) {
-			$val += $start - $min;
-			$min = $start;
-		    }
-		    for ($key = $min; $key <= $max; $key++, $val++) {
-			last LINE if $key >= $end;
-			print STDERR "$key => $val\n" if DEBUG;
-			vec($swatch, $key - $start, $bits) = $val;
-		    }
-		}
-	    }
-	}
-	else {
-	  LINE:
-	    while (/^([0-9a-fA-F]+)(?:[ \t]+([0-9a-fA-F]+))?/mg) {
-		chomp;
-		my $min = hex $1;
-		my $max = defined $2 ? hex $2 : $min;
-		next if $max < $start;
-		if ($min < $start) {
-		    $min = $start;
-		}
-		for ($key = $min; $key <= $max; $key++) {
-		    last LINE if $key >= $end;
-		    print STDERR "$key => 1\n" if DEBUG;
-		    vec($swatch, $key - $start, 1) = 1;
-		}
-	    }
-	}
-    }
-    for my $x ($self->{EXTRAS}) {
-	pos $x = 0;
-	while ($x =~ /^([-+!&])(.*)/mg) {
-	    my $char = $1;
-	    my $name = $2;
-	    print STDERR "INDIRECT $1 $2\n" if DEBUG;
-	    my $otherbits = $self->{$name}->{BITS};
-	    croak("SWASHGET size mismatch") if $bits < $otherbits;
-	    my $other = $self->{$name}->SWASHGET($start, $len);
-	    if ($char eq '+') {
-		if ($bits == 1 and $otherbits == 1) {
-		    $swatch |= $other;
-		}
-		else {
-		    for ($key = 0; $key < $len; $key++) {
-			vec($swatch, $key, $bits) = vec($other, $key, $otherbits);
-		    }
-		}
-	    }
-	    elsif ($char eq '!') {
-		if ($bits == 1 and $otherbits == 1) {
-		    $swatch |= ~$other;
-		}
-		else {
-		    for ($key = 0; $key < $len; $key++) {
-			if (!vec($other, $key, $otherbits)) {
-			    vec($swatch, $key, $bits) = 1;
-			}
-		    }
-		}
-	    }
-	    elsif ($char eq '-') {
-		if ($bits == 1 and $otherbits == 1) {
-		    $swatch &= ~$other;
-		}
-		else {
-		    for ($key = 0; $key < $len; $key++) {
-			if (vec($other, $key, $otherbits)) {
-			    vec($swatch, $key, $bits) = 0;
-			}
-		    }
-		}
-	    }
-	    elsif ($char eq '&') {
-		if ($bits == 1 and $otherbits == 1) {
-		    $swatch &= $other;
-		}
-		else {
-		    for ($key = 0; $key < $len; $key++) {
-			if (!vec($other, $key, $otherbits)) {
-			    vec($swatch, $key, $bits) = 0;
-			}
-		    }
-		}
-	    }
-	}
-    }
-    if (DEBUG) {
-	print STDERR "CELLS ";
-	for ($key = 0; $key < $len; $key++) {
-	    print STDERR vec($swatch, $key, $bits), " ";
-	}
-	print STDERR "\n";
-    }
-    $swatch;
+    # See universal.c for XS utf8::SWASHGET_heavy.
+    # USAGE: $swatch = utf8::SWASHGET_heavy($self, $start, $len, DEBUG);
+    return utf8::SWASHGET_heavy($_[0], $_[1], $_[2], DEBUG);
 }
 
 1;
diff --git a/t/op/pat.t b/t/op/pat.t
index 69d7305..4ab37ad 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..1187\n";
+print "1..1191\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -3006,6 +3006,15 @@ END
 print "\x{0712}" =~ /\p{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
 print "\x{072F}" =~ /\P{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
 
+print "# user-defined character properties may lack \\n at the end\n";
+sub InGreekSmall   { return "03B1\t03C9" }
+sub InGreekCapital { return "0391\t03A9\n-03A2" }
+
+ok("\x{03C0}" =~ /\p{InGreekSmall}/,   "Small pi");
+ok("\x{03C2}" =~ /\p{InGreekSmall}/,   "Final sigma");
+ok("\x{03A0}" =~ /\p{InGreekCapital}/, "Capital PI");
+ok("\x{03A2}" =~ /\P{InGreekCapital}/, "Reserved");
+
 {
     print "# Change #18179\n";
     # previously failed with "panic: end_shift
@@ -3402,5 +3411,5 @@ ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
        "# TODO assigning to original string should not corrupt match vars");
 }
 
-# last test 1187
+# last test 1191
 
diff --git a/universal.c b/universal.c
index 10dddb5..4d44aa7 100644
--- a/universal.c
+++ b/universal.c
@@ -199,6 +199,7 @@ XS(XS_Regexp_DESTROY);
 XS(XS_Internals_hash_seed);
 XS(XS_Internals_rehash_seed);
 XS(XS_Internals_HvREHASH);
+XS(XS_utf8_SWASHGET_heavy);
 
 void
 Perl_boot_core_UNIVERSAL(pTHX)
@@ -247,6 +248,7 @@ Perl_boot_core_UNIVERSAL(pTHX)
     newXSproto("Internals::hash_seed",XS_Internals_hash_seed, file, "");
     newXSproto("Internals::rehash_seed",XS_Internals_rehash_seed, file, "");
     newXSproto("Internals::HvREHASH", XS_Internals_HvREHASH, file, "\\%");
+    newXS("utf8::SWASHGET_heavy", XS_utf8_SWASHGET_heavy, file);
 }
 
 
@@ -949,6 +951,417 @@ XS(XS_Internals_HvREHASH)	/* Subject to change  */
     Perl_croak(aTHX_ "Internals::HvREHASH $hashref");
 }
 
+XS(XS_utf8_SWASHGET_heavy)
+{
+    dXSARGS;
+    if (items != 4) {
+	Perl_croak(aTHX_
+	    "Usage: utf8::SWASHGET_heavy($self, $start, $len, DEBUG)");
+    }
+    {
+	SV* self    = ST(0);
+	const I32 i_start = (I32)SvIV(ST(1));
+	const I32 i_len   = (I32)SvIV(ST(2));
+	const I32 debug   = (I32)SvIV(ST(3));
+	U32 start = (U32)i_start;
+	U32 len   = (U32)i_len;
+
+	HV *hv;
+	SV **listsvp, **typesvp, **bitssvp, **nonesvp, **extssvp, *swatch;
+	U8 *l, *lend, *x, *xend, *s, *nextline;
+	STRLEN lcur, xcur, scur;
+	U8* typestr;
+	int typeto;
+	U32 bits, none, end, octets;
+
+	if (SvROK(self) && SvTYPE(SvRV(self))==SVt_PVHV)
+	    hv = (HV*)SvRV(self);
+	else
+	    Perl_croak(aTHX_ "hv is not a hash reference");
+
+	if (i_start < 0)
+	    Perl_croak(aTHX_ "SWASHGET negative start");
+	if (i_len < 0)
+	    Perl_croak(aTHX_ "SWASHGET negative len");
+
+	listsvp = hv_fetch(hv, "LIST", 4, FALSE);
+	typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
+	bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
+	nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
+	extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
+	typestr = SvPV_nolen(*typesvp);
+	typeto  = typestr[0] == 'T' && typestr[1] == 'o';
+	bits    = (U32)SvUV(*bitssvp);
+	none    = (U32)SvUV(*nonesvp);
+	end     = start + len;
+	octets  = bits >> 3; /* if bits == 1, then octets == 0 */
+
+	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
+	    Perl_croak(aTHX_ "SWASHGET unknown bits %"UVuf, (UV)bits);
+	}
+	if (debug) {
+	    char* selfstr = SvPV_nolen(self);
+	    PerlIO_printf(Perl_error_log, "SWASHGET ");
+	    PerlIO_printf(Perl_error_log, "%s %"UVuf" %"UVuf" ",
+					  selfstr, (UV)start, (UV)len);
+	    PerlIO_printf(Perl_error_log, "[%s/%"UVuf"/%"UVuf"]\n",
+					  typestr, (UV)bits, (UV)none);
+	}
+
+	/* initialize $swatch */
+	swatch = newSVpvn("",0);
+	scur   = octets ? (len * octets) : (len + 7) / 8;
+	SvGROW(swatch, scur + 1);
+	s = (U8*)SvPVX(swatch);
+	if (octets && none) {
+	    const U8* e = s + scur;
+	    while (s < e) {
+		if (bits == 8)
+		    *s++ = (U8)(none & 0xff);
+		else if (bits == 16) {
+		    *s++ = (U8)((none >>  8) & 0xff);
+		    *s++ = (U8)( none        & 0xff);
+		}
+		else if (bits == 32) {
+		    *s++ = (U8)((none >> 24) & 0xff);
+		    *s++ = (U8)((none >> 16) & 0xff);
+		    *s++ = (U8)((none >>  8) & 0xff);
+		    *s++ = (U8)( none        & 0xff);
+		}
+	    }
+	    *s = '\0';
+	}
+	else {
+	    (void)memzero((U8*)s, scur + 1);
+	}
+	SvCUR_set(swatch, scur);
+	s = (U8*)SvPVX(swatch);
+
+	/* read $self->{LIST} */
+	l = (U8*)SvPV(*listsvp, lcur);
+	lend = l + lcur;
+	while (l < lend) {
+	    U32 min, max, val, key;
+	    STRLEN numlen;
+	    I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+
+	    nextline = (U8*)memchr(l, '\n', lend - l);
+
+	    numlen = lend - l;
+	    min = (U32)grok_hex(l, &numlen, &flags, NULL);
+	    if (numlen)
+		l += numlen;
+	    else if (nextline) {
+		l = nextline + 1; /* 1 is length of "\n" */
+		continue;
+	    }
+	    else {
+		l = lend; /* to the end of LIST, at which no \n */
+		break;
+	    }
+
+	    if (isBLANK(*l)) {
+		++l;
+		flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+		numlen = lend - l;
+		max = (U32)grok_hex(l, &numlen, &flags, NULL);
+		if (numlen)
+		    l += numlen;
+		else
+		    max = min;
+
+		if (octets) {
+		    if (isBLANK(*l)) {
+			++l;
+			flags = PERL_SCAN_SILENT_ILLDIGIT |
+				PERL_SCAN_DISALLOW_PREFIX;
+			numlen = lend - l;
+			val = (U32)grok_hex(l, &numlen, &flags, NULL);
+			if (numlen)
+			    l += numlen;
+			else
+			    val = 0;
+		    }
+		    else {
+			val = 0;
+			if (typeto) {
+			    Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+					     typestr, l);
+			}
+		    }
+		}
+	    }
+	    else {
+		max = min;
+		if (octets) {
+		    val = 0;
+		    if (typeto) {
+			Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+					 typestr, l);
+		    }
+		}
+	    }
+
+	    if (nextline)
+		l = nextline + 1;
+	    else
+		l = lend;
+
+	    if (max < start)
+		continue;
+
+	    if (octets) {
+		if (debug) {
+		    PerlIO_printf(Perl_error_log,
+			"%"UVuf" %"UVuf" %"UVuf"\n",
+			(UV)min, (UV)max, (UV)val);
+		}
+		if (min < start) {
+		    if (!none || val < none) {
+			val += start - min;
+		    }
+		    min = start;
+		}
+		for (key = min; key <= max; key++) {
+		    U32 offset;
+		    if (key >= end)
+			goto go_out_list;
+		    if (debug) {
+			PerlIO_printf(Perl_error_log,
+				"%"UVuf" => %"UVuf"\n",
+				(UV)key, (UV)val);
+		    }
+
+		/* offset must be non-negative (start <= min <= key < end) */
+		    offset = (key - start) * octets;
+		    if (bits == 8)
+			s[offset] = (U8)(val & 0xff);
+		    else if (bits == 16) {
+			s[offset    ] = (U8)((val >>  8) & 0xff);
+			s[offset + 1] = (U8)( val        & 0xff);
+		    }
+		    else if (bits == 32) {
+			s[offset    ] = (U8)((val >> 24) & 0xff);
+			s[offset + 1] = (U8)((val >> 16) & 0xff);
+			s[offset + 2] = (U8)((val >>  8) & 0xff);
+			s[offset + 3] = (U8)( val        & 0xff);
+		    }
+
+		    if (!none || val < none)
+			++val;
+		}
+	    }
+	    else {
+		if (min < start)
+		    min = start;
+		for (key = min; key <= max; key++) {
+		    U32 offset = key - start;
+		    if (key >= end)
+			goto go_out_list;
+		    if (debug) {
+			PerlIO_printf(Perl_error_log,
+				"%"UVuf" => 1\n", (UV)key);
+		    }
+		    s[offset >> 3] |= 1 << (offset & 7);
+		}
+	    }
+	}
+    go_out_list:
+
+	/* read $self->{EXTRAS} */
+	x = (U8*)SvPV(*extssvp, xcur);
+	xend = x + xcur;
+	while (x < xend) {
+	    STRLEN namelen;
+	    U8 *namestr;
+	    SV** othersvp;
+	    U32 otherbits;
+
+	    U8 opc = *x++;
+	    if (opc == '\n')
+		continue;
+
+	    nextline = (U8*)memchr(x, '\n', xend - x);
+
+	    if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
+		if (nextline) {
+		    x = nextline + 1;
+		    continue;
+		}
+		else {
+		    x = xend;
+		    break;
+		}
+	    }
+
+	    namestr = x;
+
+	    if (nextline) {
+		namelen = nextline - namestr;
+		x = nextline + 1;
+	    }
+	    else {
+		namelen = xend - namestr;
+		x = xend;
+	    }
+
+	    if (debug) {
+		U8* tmpstr;
+		Newx(tmpstr, namelen + 1, U8);
+		Move(namestr, tmpstr, namelen, U8);
+		tmpstr[namelen] = '\0';
+		PerlIO_printf(Perl_error_log,
+			"INDIRECT %c %s\n", opc, tmpstr);
+		Safefree(tmpstr);
+	    }
+
+	    {
+		HV* otherhv;
+		SV **otherbitssvp;
+
+		othersvp = hv_fetch(hv, namestr, namelen, FALSE);
+		if (*othersvp && SvROK(*othersvp) &&
+				 SvTYPE(SvRV(*othersvp))==SVt_PVHV)
+		    otherhv = (HV*)SvRV(*othersvp);
+		else
+		    Perl_croak(aTHX_ "otherhv is not a hash reference");
+
+		otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
+		otherbits = (U32)SvUV(*otherbitssvp);
+		if (bits < otherbits)
+		    Perl_croak(aTHX_ "SWASHGET size mismatch");
+	    }
+
+	    {
+		dSP;
+		ENTER;
+		SAVETMPS;
+		PUSHMARK(SP);
+		EXTEND(SP,3);
+		PUSHs(*othersvp);
+		PUSHs(sv_2mortal(newSViv(start)));
+		PUSHs(sv_2mortal(newSViv(len)));
+		PUTBACK;
+		if (call_method("SWASHGET", G_SCALAR)) {
+		    U8 *s, *o;
+		    STRLEN slen, olen;
+		    SV* tmpsv = *PL_stack_sp--;
+		    o = (U8*)SvPV(tmpsv, olen);
+
+		    if (!olen)
+			Perl_croak(aTHX_ "SWASHGET didn't return valid swatch");
+		    s = SvPV(swatch, slen);
+		    if (bits == 1 && otherbits == 1) {
+			if (slen != olen)
+			    Perl_croak(aTHX_ "SWASHGET length mismatch");
+
+			switch (opc) {
+			case '+':
+			    while (slen--)
+				*s++ |= *o++;
+			    break;
+			case '!':
+			    while (slen--)
+				*s++ |= ~*o++;
+			    break;
+			case '-':
+			    while (slen--)
+				*s++ &= ~*o++;
+			    break;
+			case '&':
+			    while (slen--)
+				*s++ &= *o++;
+			    break;
+			default:
+			    break;
+			}
+		    }
+		    else {
+			U32 otheroctets = otherbits / 8;
+			U32 offset = 0;
+			U8* send = s + slen;
+
+			while (s < send) {
+			    U32 val = 0;
+
+			    if (otherbits == 1) {
+				val = (o[offset >> 3] >> (offset & 7)) & 1;
+				++offset;
+			    }
+			    else {
+				U32 vlen = otheroctets;
+				val = *o++;
+				while (--vlen) {
+				    val <<= 8;
+				    val |= *o++;
+				}
+			    }
+
+			    if      (opc == '+' && val)
+				val = 1;
+			    else if (opc == '!' && !val)
+				val = 1;
+			    else if (opc == '-' && val)
+				val = 0;
+			    else if (opc == '&' && !val)
+				val = 0;
+			    else {
+				s += octets;
+				continue;
+			    }
+
+			    if (bits == 8)
+				*s++ = (U8)( val & 0xff);
+			    else if (bits == 16) {
+				*s++ = (U8)((val >>  8) & 0xff);
+				*s++ = (U8)( val        & 0xff);
+			    }
+			    else if (bits == 32) {
+				*s++ = (U8)((val >> 24) & 0xff);
+				*s++ = (U8)((val >> 16) & 0xff);
+				*s++ = (U8)((val >>  8) & 0xff);
+				*s++ = (U8)( val        & 0xff);
+			    }
+			}
+		    }
+		}
+		FREETMPS;
+		LEAVE;
+	    }
+	}
+
+	if (debug) {
+	    U8* s = (U8*)SvPVX(swatch);
+	    PerlIO_printf(Perl_error_log, "CELLS ");
+	    if (bits == 1) {
+		U32 key;
+		for (key = 0; key < len; key++) {
+		    int val = (s[key >> 3] >> (key & 7)) & 1;
+		    PerlIO_printf(Perl_error_log, val ? "1 " : "0 ");
+		}
+	    }
+	    else {
+		U8* send = s + len * octets;
+		while (s < send) {
+		    U32 vlen = octets;
+		    U32 val = *s++;
+		    while (--vlen) {
+			val <<= 8;
+			val |= *s++;
+		    }
+		    PerlIO_printf(Perl_error_log, "%"UVuf" ", (UV)val);
+		}
+	    }
+	    PerlIO_printf(Perl_error_log, "\n");
+	}
+
+	ST(0) = swatch;
+	sv_2mortal(ST(0));
+    }
+    XSRETURN(1);
+}
+
+
 /*
  * Local variables:
  * c-indentation-style: bsd
diff --git a/utf8.c b/utf8.c
index 88855bb..586fc74 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1696,8 +1696,8 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
 	    SV *errsv_save;
 	    ENTER;
 	    SAVETMPS;
-	    save_re_context();
-	    PUSHSTACKi(PERLSI_MAGIC);
+	/*  save_re_context();  */
+	/*  PUSHSTACKi(PERLSI_MAGIC);  */
 	    PUSHMARK(SP);
 	    EXTEND(SP,3);
 	    PUSHs((SV*)sv);
@@ -1714,7 +1714,7 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
 	    if (!SvTRUE(ERRSV))
 		sv_setsv(ERRSV, errsv_save);
 	    SvREFCNT_dec(errsv_save);
-	    POPSTACK;
+	/*  POPSTACK; */
 	    FREETMPS;
 	    LEAVE;
 	    if (IN_PERL_COMPILETIME)