From: Nicholas Clark <nick@ccl4.org>
Date: Thu, 23 Mar 2006 12:44:35 +0000 (+0000)
Subject: Pass the (byte) length of the entire string into
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ab455f6077c4f8e6e59a143e82bbdc3535ce31e5;p=p5sagit%2Fp5-mst-13.2.git

Pass the (byte) length of the entire string into
utf8_mg_pos_cache_update()
Start to use the cache to store two pairs of byte/utf-8 pairs.
Add the first third of the cache update code.

p4raw-id: //depot/perl@27582
---

diff --git a/embed.fnc b/embed.fnc
index 2f86df3..d5014c4 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1361,7 +1361,7 @@ s	|STRLEN	|sv_pos_u2b_cached|NN SV *sv|NN MAGIC **mgp \
 		|NN const U8 *const start|NN const U8 *const send \
 		|STRLEN uoffset|STRLEN uoffset0|STRLEN boffset0
 s	|void	|utf8_mg_pos_cache_update|NN SV *sv|NN MAGIC **mgp \
-		|STRLEN byte|STRLEN utf8
+		|STRLEN byte|STRLEN utf8|STRLEN blen
 s	|STRLEN	|sv_pos_b2u_forwards|NN const U8 *s|NN const U8 *const target
 s	|STRLEN	|sv_pos_b2u_midway|NN const U8 *s|NN const U8 *const target \
 		|NN const U8 *end|STRLEN endu
diff --git a/embed.h b/embed.h
index 4ff4ebb..89d4f93 100644
--- a/embed.h
+++ b/embed.h
@@ -3537,7 +3537,7 @@
 #define sv_pos_u2b_forwards(a,b,c)	S_sv_pos_u2b_forwards(aTHX_ a,b,c)
 #define sv_pos_u2b_midway(a,b,c,d)	S_sv_pos_u2b_midway(aTHX_ a,b,c,d)
 #define sv_pos_u2b_cached(a,b,c,d,e,f,g)	S_sv_pos_u2b_cached(aTHX_ a,b,c,d,e,f,g)
-#define utf8_mg_pos_cache_update(a,b,c,d)	S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d)
+#define utf8_mg_pos_cache_update(a,b,c,d,e)	S_utf8_mg_pos_cache_update(aTHX_ a,b,c,d,e)
 #define sv_pos_b2u_forwards(a,b)	S_sv_pos_b2u_forwards(aTHX_ a,b)
 #define sv_pos_b2u_midway(a,b,c,d)	S_sv_pos_b2u_midway(aTHX_ a,b,c,d)
 #define stringify_regexp(a,b,c)	S_stringify_regexp(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 568466a..fc754ea 100644
--- a/proto.h
+++ b/proto.h
@@ -3725,7 +3725,7 @@ STATIC STRLEN	S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const sta
 			__attribute__nonnull__(pTHX_3)
 			__attribute__nonnull__(pTHX_4);
 
-STATIC void	S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
+STATIC void	S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8, STRLEN blen)
 			__attribute__nonnull__(pTHX_1)
 			__attribute__nonnull__(pTHX_2);
 
diff --git a/sv.c b/sv.c
index 01fe6e5..bc0cde8 100644
--- a/sv.c
+++ b/sv.c
@@ -30,19 +30,16 @@
 #endif
 
 #ifdef PERL_UTF8_CACHE_ASSERT
-/* The cache element 0 is the Unicode offset;
- * the cache element 1 is the byte offset of the element 0;
- * the cache element 2 is the Unicode length of the substring;
- * the cache element 3 is the byte length of the substring;
- * The checking of the substring side would be good
- * but substr() has enough code paths to make my head spin;
- * if adding more checks watch out for the following tests:
+/* if adding more checks watch out for the following tests:
  *   t/op/index.t t/op/length.t t/op/pat.t t/op/substr.t
  *   lib/utf8.t lib/Unicode/Collate/t/index.t
  * --jhi
  */
 #define ASSERT_UTF8_CACHE(cache) \
-    STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); } } STMT_END
+    STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); \
+			      assert((cache)[2] <= (cache)[3]); \
+			      assert((cache)[3] <= (cache)[1]);} \
+			      } STMT_END
 #else
 #define ASSERT_UTF8_CACHE(cache) NOOP
 #endif
@@ -5405,6 +5402,10 @@ S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start,
 		/* An exact match. */
 		return cache[1];
 	    }
+	    if (cache[2] == uoffset) {
+		/* An exact match. */
+		return cache[3];
+	    }
 
 	    if (cache[0] < uoffset) {
 		/* The cache already knows part of the way.   */
@@ -5464,7 +5465,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start,
 	boffset = real_boffset;
     }
 
-    S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset);
+    S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset, send - start);
     return boffset;
 }
 
@@ -5524,7 +5525,8 @@ Handles magic and type coercion.
  */
 
 static void
-S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
+S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8,
+			   STRLEN blen)
 {
     STRLEN *cache;
     if (SvREADONLY(sv))
@@ -5567,10 +5569,62 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
 		       " real %"UVf" for %"SVf, (UV) utf8, (UV) realutf8, sv);
 	}
     }
-    cache[0] = utf8;
-    cache[1] = byte;
+
+    /* Cache is held with the later position first, to simplify the code
+       that deals with unbounded ends.  */
+       
+    ASSERT_UTF8_CACHE(cache);
+    if (cache[1] == 0) {
+	/* Cache is totally empty  */
+	cache[0] = utf8;
+	cache[1] = byte;
+    } else if (cache[3] == 0) {
+	if (byte > cache[1]) {
+	    /* New one is larger, so goes first.  */
+	    cache[2] = cache[0];
+	    cache[3] = cache[1];
+	    cache[0] = utf8;
+	    cache[1] = byte;
+	} else {
+	    cache[2] = utf8;
+	    cache[3] = byte;
+	}
+    } else {
+#define THREEWAY_SQUARE(a,b,c,d) \
+	    ((float)((d) - (c))) * ((float)((d) - (c))) \
+	    + ((float)((c) - (b))) * ((float)((c) - (b))) \
+	       + ((float)((b) - (a))) * ((float)((b) - (a)))
+
+	/* Cache has 2 slots in use, and we know three potential pairs.
+	   Keep the two that give the lowest RMS distance. Do the
+	   calcualation in bytes simply because we always know the byte
+	   length.  squareroot has the same ordering as the positive value,
+	   so don't bother with the actual square root.  */
+	const float existing = THREEWAY_SQUARE(0, cache[3], cache[1], blen);
+	if (byte > cache[1]) {
+	    /* New position is after the existing pair of pairs.  */
+	    const float keep_earlier
+		= THREEWAY_SQUARE(0, cache[3], byte, blen);
+	    const float keep_later
+		= THREEWAY_SQUARE(0, cache[1], byte, blen);
+
+	    if (keep_later < keep_earlier) {
+		if (keep_later < existing) {
+		    cache[2] = cache[0];
+		    cache[3] = cache[1];
+		    cache[0] = utf8;
+		    cache[1] = byte;
+		}
+	    }
+	    else {
+		if (keep_earlier < existing) {
+		    cache[0] = utf8;
+		    cache[1] = byte;
+		}
+	    }
+	}
+    }
     ASSERT_UTF8_CACHE(cache);
-    /* Drop the stale "length" cache */
 }
 
 /* If we don't know the character offset of the end of a region, our only
@@ -5626,15 +5680,16 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
     const U8* s;
     const STRLEN byte = *offsetp;
     STRLEN len;
+    STRLEN blen;
     MAGIC* mg = NULL;
     const U8* send;
 
     if (!sv)
 	return;
 
-    s = (const U8*)SvPV_const(sv, len);
+    s = (const U8*)SvPV_const(sv, blen);
 
-    if (len < byte)
+    if (blen < byte)
 	Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
 
     send = s + byte;
@@ -5648,6 +5703,11 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
 		*offsetp = cache[0];
 		return;
 	    }
+	    if (cache[3] == byte) {
+		/* An exact match. */
+		*offsetp = cache[2];
+		return;
+	    }
 
 	    if (cache[1] < byte) {
 		/* We already know part of the way. */
@@ -5655,7 +5715,7 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
 		    /* Actually, we know the end too.  */
 		    len = cache[0]
 			+ S_sv_pos_b2u_midway(aTHX_ s + cache[1], send,
-					      s + len, mg->mg_len - cache[0]);
+					      s + blen, mg->mg_len - cache[0]);
 		} else {
 		    len = cache[0]
 			+ S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send);
@@ -5681,7 +5741,7 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
 		}
 	    }
 	} else if (mg->mg_len != -1) {
-	    len = S_sv_pos_b2u_midway(aTHX_ s, send, s + len, mg->mg_len);
+	    len = S_sv_pos_b2u_midway(aTHX_ s, send, s + blen, mg->mg_len);
 	} else {
 	    len = S_sv_pos_b2u_forwards(aTHX_ s, send);
 	}
@@ -5691,7 +5751,7 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
     }
     *offsetp = len;
 
-    S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len);
+    S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len, blen);
 }
 
 /*