#endif
#ifdef PERL_UTF8_CACHE_ASSERT
-/* The cache element 0 is the Unicode offset;
- * the cache element 1 is the byte offset of the element 0;
- * the cache element 2 is the Unicode length of the substring;
- * the cache element 3 is the byte length of the substring;
- * The checking of the substring side would be good
- * but substr() has enough code paths to make my head spin;
- * if adding more checks watch out for the following tests:
+/* if adding more checks watch out for the following tests:
* t/op/index.t t/op/length.t t/op/pat.t t/op/substr.t
* lib/utf8.t lib/Unicode/Collate/t/index.t
* --jhi
*/
#define ASSERT_UTF8_CACHE(cache) \
- STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); } } STMT_END
+ STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); \
+ assert((cache)[2] <= (cache)[3]); \
+ assert((cache)[3] <= (cache)[1]);} \
+ } STMT_END
#else
#define ASSERT_UTF8_CACHE(cache) NOOP
#endif
certainly cast into the IV range at IV_MAX, whereas the correct
answer is the UV IV_MAX +1. Hence < ensures that dodgy boundary
cases go to UV */
+#if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
+ if (Perl_isnan(SvNVX(sv))) {
+ SvUV_set(sv, 0);
+ SvIsUV_on(sv);
+ return FALSE;
+ }
+#endif
if (SvNVX(sv) < (NV)IV_MAX + 0.5) {
SvIV_set(sv, I_V(SvNVX(sv)));
if (SvNVX(sv) == (NV) SvIVX(sv)
/*
* The length is cached in PERL_UTF8_magic, in the mg_len field. Also the
- * mg_ptr is used, by sv_pos_u2b(), see the comments of S_utf8_mg_pos_init().
- * (Note that the mg_len is not the length of the mg_ptr field.)
+ * mg_ptr is used, by sv_pos_u2b() and sv_pos_b2u() - see the comments below.
+ * (Note that the mg_len is not the length of the mg_ptr field.
+ * This allows the cache to store the character length of the string without
+ * needing to malloc() extra storage to attach to the mg_ptr.)
*
*/
}
}
-/* S_utf8_mg_pos_init() is used to initialize the mg_ptr field of
- * a PERL_UTF8_magic. The mg_ptr is used to store the mapping
- * between UTF-8 and byte offsets. There are two (substr offset and substr
- * length, the i offset, PERL_MAGIC_UTF8_CACHESIZE) times two (UTF-8 offset
- * and byte offset) cache positions.
- *
- * The mg_len field is used by sv_len_utf8(), see its comments.
- * Note that the mg_len is not the length of the mg_ptr field.
- *
- */
-STATIC bool
-S_utf8_mg_pos_init(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i,
- I32 offsetp, const U8 *s, const U8 *start)
-{
- bool found = FALSE;
-
- if (SvMAGICAL(sv) && !SvREADONLY(sv)) {
- if (!*mgp) {
- *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, (MGVTBL*)&PL_vtbl_utf8, 0, 0);
- (*mgp)->mg_len = -1;
- }
- assert(*mgp);
-
- if ((*mgp)->mg_ptr)
- *cachep = (STRLEN *) (*mgp)->mg_ptr;
- else {
- Newxz(*cachep, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN);
- (*mgp)->mg_ptr = (char *) *cachep;
- }
- assert(*cachep);
-
- (*cachep)[i] = offsetp;
- (*cachep)[i+1] = s - start;
- found = TRUE;
- }
-
- return found;
-}
-
-/*
- * S_utf8_mg_pos() is used to query and update mg_ptr field of
- * a PERL_UTF8_magic. The mg_ptr is used to store the mapping
- * between UTF-8 and byte offsets. See also the comments of
- * S_utf8_mg_pos_init().
- *
- */
-STATIC bool
-S_utf8_mg_pos(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, I32 uoff, const U8 **sp, const U8 *start, const U8 *send)
-{
- bool found = FALSE;
-
- if (SvMAGICAL(sv) && !SvREADONLY(sv)) {
- if (!*mgp)
- *mgp = mg_find(sv, PERL_MAGIC_utf8);
- if (*mgp && (*mgp)->mg_ptr) {
- *cachep = (STRLEN *) (*mgp)->mg_ptr;
- ASSERT_UTF8_CACHE(*cachep);
- if ((*cachep)[i] == (STRLEN)uoff) /* An exact match. */
- found = TRUE;
- else { /* We will skip to the right spot. */
- STRLEN forw = 0;
- STRLEN backw = 0;
- const U8* p = NULL;
-
- /* The assumption is that going backward is half
- * the speed of going forward (that's where the
- * 2 * backw in the below comes from). (The real
- * figure of course depends on the UTF-8 data.) */
-
- if ((*cachep)[i] > (STRLEN)uoff) {
- forw = uoff;
- backw = (*cachep)[i] - (STRLEN)uoff;
-
- if (forw < 2 * backw)
- p = start;
- else
- p = start + (*cachep)[i+1];
- }
- /* Try this only for the substr offset (i == 0),
- * not for the substr length (i == 2). */
- else if (i == 0) { /* (*cachep)[i] < uoff */
- const STRLEN ulen = sv_len_utf8(sv);
-
- if ((STRLEN)uoff < ulen) {
- forw = (STRLEN)uoff - (*cachep)[i];
- backw = ulen - (STRLEN)uoff;
-
- if (forw < 2 * backw)
- p = start + (*cachep)[i+1];
- else
- p = send;
- }
-
- /* If the string is not long enough for uoff,
- * we could extend it, but not at this low a level. */
- }
-
- if (p) {
- if (forw < 2 * backw) {
- while (forw--)
- p += UTF8SKIP(p);
- }
- else {
- while (backw--) {
- p--;
- while (UTF8_IS_CONTINUATION(*p))
- p--;
- }
- }
-
- /* Update the cache. */
- (*cachep)[i] = (STRLEN)uoff;
- (*cachep)[i+1] = p - start;
-
- /* Drop the stale "length" cache */
- if (i == 0) {
- (*cachep)[2] = 0;
- (*cachep)[3] = 0;
- }
-
- found = TRUE;
- }
- }
- if (found) { /* Setup the return values. */
- *offsetp = (*cachep)[i+1];
- *sp = start + *offsetp;
- if (*sp >= send) {
- *sp = send;
- *offsetp = send - start;
- }
- else if (*sp < start) {
- *sp = start;
- *offsetp = 0;
- }
- }
- }
-#ifdef PERL_UTF8_CACHE_ASSERT
- if (found) {
- const U8 *s = start;
- I32 n = uoff;
-
- while (n-- && s < send)
- s += UTF8SKIP(s);
-
- if (i == 0) {
- assert(*offsetp == s - start);
- assert((*cachep)[0] == (STRLEN)uoff);
- assert((*cachep)[1] == *offsetp);
- }
- ASSERT_UTF8_CACHE(*cachep);
- }
-#endif
- }
-
- return found;
-}
-
-/*
-=for apidoc sv_pos_u2b
-
-Converts the value pointed to by offsetp from a count of UTF-8 chars from
-the start of the string, to a count of the equivalent number of bytes; if
-lenp is non-zero, it does the same to lenp, but this time starting from
-the offset, rather than from the start of the string. Handles magic and
-type coercion.
-
-=cut
-*/
-
-/*
- * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
- * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and
- * byte offsets. See also the comments of S_utf8_mg_pos().
- *
- */
-
-static void
-S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8);
-
+/* Walk forwards to find the byte corresponding to the passed in UTF-8
+ offset. */
static STRLEN
S_sv_pos_u2b_forwards(pTHX_ const U8 *const start, const U8 *const send,
STRLEN uoffset)
return s - start;
}
-
+/* Given the length of the string in both bytes and UTF-8 characters, decide
+ whether to walk forwards or backwards to find the byte corresponding to
+ the passed in UTF-8 offset. */
static STRLEN
S_sv_pos_u2b_midway(pTHX_ const U8 *const start, const U8 *send,
STRLEN uoffset, STRLEN uend)
{
STRLEN backw = uend - uoffset;
if (uoffset < 2 * backw) {
- /* The assumption is that going fowards is twice the speed of going
+ /* The assumption is that going forwards is twice the speed of going
forward (that's where the 2 * backw comes from).
(The real figure of course depends on the UTF-8 data.) */
return S_sv_pos_u2b_forwards(aTHX_ start, send, uoffset);
return send - start;
}
+/* For the string representation of the given scalar, find the byte
+ corresponding to the passed in UTF-8 offset. uoffset0 and boffset0
+ give another position in the string, *before* the sought offset, which
+ (which is always true, as 0, 0 is a valid pair of positions), which should
+ help reduce the amount of linear searching.
+ If *mgp is non-NULL, it should point to the UTF-8 cache magic, which
+ will be used to reduce the amount of linear searching. The cache will be
+ created if necessary, and the found value offered to it for update. */
static STRLEN
S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start,
const U8 *const send, STRLEN uoffset,
if (SvMAGICAL(sv) && !SvREADONLY(sv) && PL_utf8cache
&& (*mgp || (*mgp = mg_find(sv, PERL_MAGIC_utf8)))) {
- if ((*mgp)->mg_len != -1) {
+ if ((*mgp)->mg_ptr) {
+ STRLEN *cache = (STRLEN *) (*mgp)->mg_ptr;
+ if (cache[0] == uoffset) {
+ /* An exact match. */
+ return cache[1];
+ }
+ if (cache[2] == uoffset) {
+ /* An exact match. */
+ return cache[3];
+ }
+
+ if (cache[0] < uoffset) {
+ /* The cache already knows part of the way. */
+ if (cache[0] > uoffset0) {
+ /* The cache knows more than the passed in pair */
+ uoffset0 = cache[0];
+ boffset0 = cache[1];
+ }
+ if ((*mgp)->mg_len != -1) {
+ /* And we know the end too. */
+ boffset = boffset0
+ + S_sv_pos_u2b_midway(aTHX_ start + boffset0, send,
+ uoffset - uoffset0,
+ (*mgp)->mg_len - uoffset0);
+ } else {
+ boffset = boffset0
+ + S_sv_pos_u2b_forwards(aTHX_ start + boffset0,
+ send, uoffset - uoffset0);
+ }
+ }
+ else if (cache[2] < uoffset) {
+ /* We're between the two cache entries. */
+ if (cache[2] > uoffset0) {
+ /* and the cache knows more than the passed in pair */
+ uoffset0 = cache[2];
+ boffset0 = cache[3];
+ }
+
+ boffset = boffset0
+ + S_sv_pos_u2b_midway(aTHX_ start + boffset0,
+ start + cache[1],
+ uoffset - uoffset0,
+ cache[0] - uoffset0);
+ } else {
+ boffset = boffset0
+ + S_sv_pos_u2b_midway(aTHX_ start + boffset0,
+ start + cache[3],
+ uoffset - uoffset0,
+ cache[2] - uoffset0);
+ }
+ found = TRUE;
+ }
+ else if ((*mgp)->mg_len != -1) {
/* If we can take advantage of a passed in offset, do so. */
/* In fact, offset0 is either 0, or less than offset, so don't
need to worry about the other possibility. */
boffset = real_boffset;
}
- S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset);
+ S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset, send - start);
return boffset;
}
+
+/*
+=for apidoc sv_pos_u2b
+
+Converts the value pointed to by offsetp from a count of UTF-8 chars from
+the start of the string, to a count of the equivalent number of bytes; if
+lenp is non-zero, it does the same to lenp, but this time starting from
+the offset, rather than from the start of the string. Handles magic and
+type coercion.
+
+=cut
+*/
+
+/*
+ * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
+ * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and
+ * byte offsets. See also the comments of S_utf8_mg_pos_cache_update().
+ *
+ */
+
void
Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp)
{
return;
}
-/*
-=for apidoc sv_pos_b2u
-
-Converts the value pointed to by offsetp from a count of bytes from the
-start of the string, to a count of the equivalent number of UTF-8 chars.
-Handles magic and type coercion.
-
-=cut
+/* Create and update the UTF8 magic offset cache, with the proffered utf8/
+ byte length pairing. The (byte) length of the total SV is passed in too,
+ as blen, because for some (more esoteric) SVs, the call to SvPV_const()
+ may not have updated SvCUR, so we can't rely on reading it directly.
+
+ The proffered utf8/byte length pairing isn't used if the cache already has
+ two pairs, and swapping either for the proffered pair would increase the
+ RMS of the intervals between known byte offsets.
+
+ The cache itself consists of 4 STRLEN values
+ 0: larger UTF-8 offset
+ 1: corresponding byte offset
+ 2: smaller UTF-8 offset
+ 3: corresponding byte offset
+
+ Unused cache pairs have the value 0, 0.
+ Keeping the cache "backwards" means that the invariant of
+ cache[0] >= cache[2] is maintained even with empty slots, which means that
+ the code that uses it doesn't need to worry if only 1 entry has actually
+ been set to non-zero. It also makes the "position beyond the end of the
+ cache" logic much simpler, as the first slot is always the one to start
+ from.
*/
-
-/*
- * sv_pos_b2u() uses, like sv_pos_u2b(), the mg_ptr of the potential
- * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and
- * byte offsets. See also the comments of S_utf8_mg_pos().
- *
- */
-
-
-static STRLEN
-S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target);
-
static void
-S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
+S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8,
+ STRLEN blen)
{
STRLEN *cache;
if (SvREADONLY(sv))
" real %"UVf" for %"SVf, (UV) utf8, (UV) realutf8, sv);
}
}
- cache[0] = utf8;
- cache[1] = byte;
+
+ /* Cache is held with the later position first, to simplify the code
+ that deals with unbounded ends. */
+
+ ASSERT_UTF8_CACHE(cache);
+ if (cache[1] == 0) {
+ /* Cache is totally empty */
+ cache[0] = utf8;
+ cache[1] = byte;
+ } else if (cache[3] == 0) {
+ if (byte > cache[1]) {
+ /* New one is larger, so goes first. */
+ cache[2] = cache[0];
+ cache[3] = cache[1];
+ cache[0] = utf8;
+ cache[1] = byte;
+ } else {
+ cache[2] = utf8;
+ cache[3] = byte;
+ }
+ } else {
+#define THREEWAY_SQUARE(a,b,c,d) \
+ ((float)((d) - (c))) * ((float)((d) - (c))) \
+ + ((float)((c) - (b))) * ((float)((c) - (b))) \
+ + ((float)((b) - (a))) * ((float)((b) - (a)))
+
+ /* Cache has 2 slots in use, and we know three potential pairs.
+ Keep the two that give the lowest RMS distance. Do the
+ calcualation in bytes simply because we always know the byte
+ length. squareroot has the same ordering as the positive value,
+ so don't bother with the actual square root. */
+ const float existing = THREEWAY_SQUARE(0, cache[3], cache[1], blen);
+ if (byte > cache[1]) {
+ /* New position is after the existing pair of pairs. */
+ const float keep_earlier
+ = THREEWAY_SQUARE(0, cache[3], byte, blen);
+ const float keep_later
+ = THREEWAY_SQUARE(0, cache[1], byte, blen);
+
+ if (keep_later < keep_earlier) {
+ if (keep_later < existing) {
+ cache[2] = cache[0];
+ cache[3] = cache[1];
+ cache[0] = utf8;
+ cache[1] = byte;
+ }
+ }
+ else {
+ if (keep_earlier < existing) {
+ cache[0] = utf8;
+ cache[1] = byte;
+ }
+ }
+ }
+ else if (byte > cache[3]) {
+ /* New position is between the existing pair of pairs. */
+ const float keep_earlier
+ = THREEWAY_SQUARE(0, cache[3], byte, blen);
+ const float keep_later
+ = THREEWAY_SQUARE(0, byte, cache[1], blen);
+
+ if (keep_later < keep_earlier) {
+ if (keep_later < existing) {
+ cache[2] = utf8;
+ cache[3] = byte;
+ }
+ }
+ else {
+ if (keep_earlier < existing) {
+ cache[0] = utf8;
+ cache[1] = byte;
+ }
+ }
+ }
+ else {
+ /* New position is before the existing pair of pairs. */
+ const float keep_earlier
+ = THREEWAY_SQUARE(0, byte, cache[3], blen);
+ const float keep_later
+ = THREEWAY_SQUARE(0, byte, cache[1], blen);
+
+ if (keep_later < keep_earlier) {
+ if (keep_later < existing) {
+ cache[2] = utf8;
+ cache[3] = byte;
+ }
+ }
+ else {
+ if (keep_earlier < existing) {
+ cache[0] = cache[2];
+ cache[1] = cache[3];
+ cache[2] = utf8;
+ cache[3] = byte;
+ }
+ }
+ }
+ }
ASSERT_UTF8_CACHE(cache);
- /* Drop the stale "length" cache */
- cache[2] = 0;
- cache[3] = 0;
}
/* If we don't know the character offset of the end of a region, our only
}
/* We already know all of the way, now we may be able to walk back. The same
- assumption is made as in S_utf8_mg_pos(), namely that walking backward is
- twice slower than walking forward. */
+ assumption is made as in S_sv_pos_u2b_midway(), namely that walking
+ backward is half the speed of walking forward. */
static STRLEN
S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end,
STRLEN endu)
return endu;
}
+/*
+=for apidoc sv_pos_b2u
+
+Converts the value pointed to by offsetp from a count of bytes from the
+start of the string, to a count of the equivalent number of UTF-8 chars.
+Handles magic and type coercion.
+
+=cut
+*/
+
+/*
+ * sv_pos_b2u() uses, like sv_pos_u2b(), the mg_ptr of the potential
+ * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and
+ * byte offsets.
+ *
+ */
void
Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
{
const U8* s;
const STRLEN byte = *offsetp;
STRLEN len;
+ STRLEN blen;
MAGIC* mg = NULL;
const U8* send;
+ bool found = FALSE;
if (!sv)
return;
- s = (const U8*)SvPV_const(sv, len);
+ s = (const U8*)SvPV_const(sv, blen);
- if (len < byte)
+ if (blen < byte)
Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
send = s + byte;
if (cache[1] == byte) {
/* An exact match. */
*offsetp = cache[0];
-
return;
}
- else if (cache[1] < byte) {
+ if (cache[3] == byte) {
+ /* An exact match. */
+ *offsetp = cache[2];
+ return;
+ }
+
+ if (cache[1] < byte) {
/* We already know part of the way. */
if (mg->mg_len != -1) {
/* Actually, we know the end too. */
len = cache[0]
+ S_sv_pos_b2u_midway(aTHX_ s + cache[1], send,
- s + len, mg->mg_len - cache[0]);
+ s + blen, mg->mg_len - cache[0]);
} else {
len = cache[0]
+ S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send);
}
}
- else { /* cache[1] > byte */
- len = S_sv_pos_b2u_midway(aTHX_ s, send, s + cache[1],
- cache[0]);
+ else if (cache[3] < byte) {
+ /* We're between the two cached pairs, so we do the calculation
+ offset by the byte/utf-8 positions for the earlier pair,
+ then add the utf-8 characters from the string start to
+ there. */
+ len = S_sv_pos_b2u_midway(aTHX_ s + cache[3], send,
+ s + cache[1], cache[0] - cache[2])
+ + cache[2];
}
- ASSERT_UTF8_CACHE(cache);
- if (PL_utf8cache < 0) {
- const STRLEN reallen = S_sv_pos_b2u_forwards(aTHX_ s, send);
-
- if (len != reallen) {
- /* Need to turn the assertions off otherwise we may recurse
- infinitely while printing error messages. */
- SAVEI8(PL_utf8cache);
- PL_utf8cache = 0;
- Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVf
- " real %"UVf" for %"SVf,
- (UV) len, (UV) reallen, sv);
- }
+ else { /* cache[3] > byte */
+ len = S_sv_pos_b2u_midway(aTHX_ s, send, s + cache[3],
+ cache[2]);
+
}
+ ASSERT_UTF8_CACHE(cache);
+ found = TRUE;
} else if (mg->mg_len != -1) {
- len = S_sv_pos_b2u_midway(aTHX_ s, send, s + len, mg->mg_len);
- } else {
- len = S_sv_pos_b2u_forwards(aTHX_ s, send);
+ len = S_sv_pos_b2u_midway(aTHX_ s, send, s + blen, mg->mg_len);
+ found = TRUE;
}
}
- else {
- len = S_sv_pos_b2u_forwards(aTHX_ s, send);
+ if (!found || PL_utf8cache < 0) {
+ const STRLEN real_len = S_sv_pos_b2u_forwards(aTHX_ s, send);
+
+ if (found && PL_utf8cache < 0) {
+ if (len != real_len) {
+ /* Need to turn the assertions off otherwise we may recurse
+ infinitely while printing error messages. */
+ SAVEI8(PL_utf8cache);
+ PL_utf8cache = 0;
+ Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVf
+ " real %"UVf" for %"SVf,
+ (UV) len, (UV) real_len, sv);
+ }
+ }
+ len = real_len;
}
*offsetp = len;
- S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len);
+ S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len, blen);
}
/*