X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=sv.c;h=a19939ed845b5c30563b114e77a849569e09b78a;hb=2977d345acaba2d25f549b813e6f840a0d225b16;hp=69232be8c170ebada9434f256442074b4c3e0cec;hpb=c336ad0b0ffc3d19ec24d23d8bc7a30158ac1ae5;p=p5sagit%2Fp5-mst-13.2.git diff --git a/sv.c b/sv.c index 69232be..a19939e 100644 --- a/sv.c +++ b/sv.c @@ -30,19 +30,16 @@ #endif #ifdef PERL_UTF8_CACHE_ASSERT -/* The cache element 0 is the Unicode offset; - * the cache element 1 is the byte offset of the element 0; - * the cache element 2 is the Unicode length of the substring; - * the cache element 3 is the byte length of the substring; - * The checking of the substring side would be good - * but substr() has enough code paths to make my head spin; - * if adding more checks watch out for the following tests: +/* if adding more checks watch out for the following tests: * t/op/index.t t/op/length.t t/op/pat.t t/op/substr.t * lib/utf8.t lib/Unicode/Collate/t/index.t * --jhi */ #define ASSERT_UTF8_CACHE(cache) \ - STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); } } STMT_END + STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); \ + assert((cache)[2] <= (cache)[3]); \ + assert((cache)[3] <= (cache)[1]);} \ + } STMT_END #else #define ASSERT_UTF8_CACHE(cache) NOOP #endif @@ -1895,6 +1892,13 @@ S_sv_2iuv_common(pTHX_ SV *sv) { certainly cast into the IV range at IV_MAX, whereas the correct answer is the UV IV_MAX +1. Hence < ensures that dodgy boundary cases go to UV */ +#if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) + if (Perl_isnan(SvNVX(sv))) { + SvUV_set(sv, 0); + SvIsUV_on(sv); + return FALSE; + } +#endif if (SvNVX(sv) < (NV)IV_MAX + 0.5) { SvIV_set(sv, I_V(SvNVX(sv))); if (SvNVX(sv) == (NV) SvIVX(sv) @@ -5270,8 +5274,10 @@ UTF-8 bytes as a single character. Handles magic and type coercion. /* * The length is cached in PERL_UTF8_magic, in the mg_len field. Also the - * mg_ptr is used, by sv_pos_u2b(), see the comments of S_utf8_mg_pos_init(). - * (Note that the mg_len is not the length of the mg_ptr field.) + * mg_ptr is used, by sv_pos_u2b() and sv_pos_b2u() - see the comments below. + * (Note that the mg_len is not the length of the mg_ptr field. + * This allows the cache to store the character length of the string without + * needing to malloc() extra storage to attach to the mg_ptr.) * */ @@ -5325,182 +5331,8 @@ Perl_sv_len_utf8(pTHX_ register SV *sv) } } -/* S_utf8_mg_pos_init() is used to initialize the mg_ptr field of - * a PERL_UTF8_magic. The mg_ptr is used to store the mapping - * between UTF-8 and byte offsets. There are two (substr offset and substr - * length, the i offset, PERL_MAGIC_UTF8_CACHESIZE) times two (UTF-8 offset - * and byte offset) cache positions. - * - * The mg_len field is used by sv_len_utf8(), see its comments. - * Note that the mg_len is not the length of the mg_ptr field. - * - */ -STATIC bool -S_utf8_mg_pos_init(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, - I32 offsetp, const U8 *s, const U8 *start) -{ - bool found = FALSE; - - if (SvMAGICAL(sv) && !SvREADONLY(sv)) { - if (!*mgp) { - *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, (MGVTBL*)&PL_vtbl_utf8, 0, 0); - (*mgp)->mg_len = -1; - } - assert(*mgp); - - if ((*mgp)->mg_ptr) - *cachep = (STRLEN *) (*mgp)->mg_ptr; - else { - Newxz(*cachep, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN); - (*mgp)->mg_ptr = (char *) *cachep; - } - assert(*cachep); - - (*cachep)[i] = offsetp; - (*cachep)[i+1] = s - start; - found = TRUE; - } - - return found; -} - -/* - * S_utf8_mg_pos() is used to query and update mg_ptr field of - * a PERL_UTF8_magic. The mg_ptr is used to store the mapping - * between UTF-8 and byte offsets. See also the comments of - * S_utf8_mg_pos_init(). - * - */ -STATIC bool -S_utf8_mg_pos(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, I32 uoff, const U8 **sp, const U8 *start, const U8 *send) -{ - bool found = FALSE; - - if (SvMAGICAL(sv) && !SvREADONLY(sv)) { - if (!*mgp) - *mgp = mg_find(sv, PERL_MAGIC_utf8); - if (*mgp && (*mgp)->mg_ptr) { - *cachep = (STRLEN *) (*mgp)->mg_ptr; - ASSERT_UTF8_CACHE(*cachep); - if ((*cachep)[i] == (STRLEN)uoff) /* An exact match. */ - found = TRUE; - else { /* We will skip to the right spot. */ - STRLEN forw = 0; - STRLEN backw = 0; - const U8* p = NULL; - - /* The assumption is that going backward is half - * the speed of going forward (that's where the - * 2 * backw in the below comes from). (The real - * figure of course depends on the UTF-8 data.) */ - - if ((*cachep)[i] > (STRLEN)uoff) { - forw = uoff; - backw = (*cachep)[i] - (STRLEN)uoff; - - if (forw < 2 * backw) - p = start; - else - p = start + (*cachep)[i+1]; - } - /* Try this only for the substr offset (i == 0), - * not for the substr length (i == 2). */ - else if (i == 0) { /* (*cachep)[i] < uoff */ - const STRLEN ulen = sv_len_utf8(sv); - - if ((STRLEN)uoff < ulen) { - forw = (STRLEN)uoff - (*cachep)[i]; - backw = ulen - (STRLEN)uoff; - - if (forw < 2 * backw) - p = start + (*cachep)[i+1]; - else - p = send; - } - - /* If the string is not long enough for uoff, - * we could extend it, but not at this low a level. */ - } - - if (p) { - if (forw < 2 * backw) { - while (forw--) - p += UTF8SKIP(p); - } - else { - while (backw--) { - p--; - while (UTF8_IS_CONTINUATION(*p)) - p--; - } - } - - /* Update the cache. */ - (*cachep)[i] = (STRLEN)uoff; - (*cachep)[i+1] = p - start; - - /* Drop the stale "length" cache */ - if (i == 0) { - (*cachep)[2] = 0; - (*cachep)[3] = 0; - } - - found = TRUE; - } - } - if (found) { /* Setup the return values. */ - *offsetp = (*cachep)[i+1]; - *sp = start + *offsetp; - if (*sp >= send) { - *sp = send; - *offsetp = send - start; - } - else if (*sp < start) { - *sp = start; - *offsetp = 0; - } - } - } -#ifdef PERL_UTF8_CACHE_ASSERT - if (found) { - const U8 *s = start; - I32 n = uoff; - - while (n-- && s < send) - s += UTF8SKIP(s); - - if (i == 0) { - assert(*offsetp == s - start); - assert((*cachep)[0] == (STRLEN)uoff); - assert((*cachep)[1] == *offsetp); - } - ASSERT_UTF8_CACHE(*cachep); - } -#endif - } - - return found; -} - -/* -=for apidoc sv_pos_u2b - -Converts the value pointed to by offsetp from a count of UTF-8 chars from -the start of the string, to a count of the equivalent number of bytes; if -lenp is non-zero, it does the same to lenp, but this time starting from -the offset, rather than from the start of the string. Handles magic and -type coercion. - -=cut -*/ - -/* - * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential - * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and - * byte offsets. See also the comments of S_utf8_mg_pos(). - * - */ - +/* Walk forwards to find the byte corresponding to the passed in UTF-8 + offset. */ static STRLEN S_sv_pos_u2b_forwards(pTHX_ const U8 *const start, const U8 *const send, STRLEN uoffset) @@ -5517,14 +5349,16 @@ S_sv_pos_u2b_forwards(pTHX_ const U8 *const start, const U8 *const send, return s - start; } - +/* Given the length of the string in both bytes and UTF-8 characters, decide + whether to walk forwards or backwards to find the byte corresponding to + the passed in UTF-8 offset. */ static STRLEN S_sv_pos_u2b_midway(pTHX_ const U8 *const start, const U8 *send, STRLEN uoffset, STRLEN uend) { STRLEN backw = uend - uoffset; if (uoffset < 2 * backw) { - /* The assumption is that going fowards is twice the speed of going + /* The assumption is that going forwards is twice the speed of going forward (that's where the 2 * backw comes from). (The real figure of course depends on the UTF-8 data.) */ return S_sv_pos_u2b_forwards(aTHX_ start, send, uoffset); @@ -5538,6 +5372,14 @@ S_sv_pos_u2b_midway(pTHX_ const U8 *const start, const U8 *send, return send - start; } +/* For the string representation of the given scalar, find the byte + corresponding to the passed in UTF-8 offset. uoffset0 and boffset0 + give another position in the string, *before* the sought offset, which + (which is always true, as 0, 0 is a valid pair of positions), which should + help reduce the amount of linear searching. + If *mgp is non-NULL, it should point to the UTF-8 cache magic, which + will be used to reduce the amount of linear searching. The cache will be + created if necessary, and the found value offered to it for update. */ static STRLEN S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start, const U8 *const send, STRLEN uoffset, @@ -5545,25 +5387,79 @@ S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start, STRLEN boffset; bool found = FALSE; + assert (uoffset >= uoffset0); + if (SvMAGICAL(sv) && !SvREADONLY(sv) && PL_utf8cache - && (*mgp = mg_find(sv, PERL_MAGIC_utf8))) { - if ((*mgp)->mg_len != -1) { - boffset = S_sv_pos_u2b_midway(aTHX_ start, send, uoffset, - (*mgp)->mg_len); + && (*mgp || (*mgp = mg_find(sv, PERL_MAGIC_utf8)))) { + if ((*mgp)->mg_ptr) { + STRLEN *cache = (STRLEN *) (*mgp)->mg_ptr; + if (cache[0] == uoffset) { + /* An exact match. */ + return cache[1]; + } + if (cache[2] == uoffset) { + /* An exact match. */ + return cache[3]; + } + + if (cache[0] < uoffset) { + /* The cache already knows part of the way. */ + if (cache[0] > uoffset0) { + /* The cache knows more than the passed in pair */ + uoffset0 = cache[0]; + boffset0 = cache[1]; + } + if ((*mgp)->mg_len != -1) { + /* And we know the end too. */ + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, send, + uoffset - uoffset0, + (*mgp)->mg_len - uoffset0); + } else { + boffset = boffset0 + + S_sv_pos_u2b_forwards(aTHX_ start + boffset0, + send, uoffset - uoffset0); + } + } + else if (cache[2] < uoffset) { + /* We're between the two cache entries. */ + if (cache[2] > uoffset0) { + /* and the cache knows more than the passed in pair */ + uoffset0 = cache[2]; + boffset0 = cache[3]; + } + + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, + start + cache[1], + uoffset - uoffset0, + cache[0] - uoffset0); + } else { + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, + start + cache[3], + uoffset - uoffset0, + cache[2] - uoffset0); + } + found = TRUE; + } + else if ((*mgp)->mg_len != -1) { + /* If we can take advantage of a passed in offset, do so. */ + /* In fact, offset0 is either 0, or less than offset, so don't + need to worry about the other possibility. */ + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, send, + uoffset - uoffset0, + (*mgp)->mg_len - uoffset0); found = TRUE; } } if (!found || PL_utf8cache < 0) { - STRLEN real_boffset; - if (uoffset >= uoffset0) { - real_boffset - = boffset0 + S_sv_pos_u2b_forwards(aTHX_ start + boffset0, - send, uoffset - uoffset0); - } - else { - real_boffset = S_sv_pos_u2b_forwards(aTHX_ start, send, uoffset); - } + const STRLEN real_boffset + = boffset0 + S_sv_pos_u2b_forwards(aTHX_ start + boffset0, + send, uoffset - uoffset0); + if (found && PL_utf8cache < 0) { if (real_boffset != boffset) { /* Need to turn the assertions off otherwise we may recurse @@ -5577,9 +5473,31 @@ S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start, } boffset = real_boffset; } + + S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset, send - start); return boffset; } + +/* +=for apidoc sv_pos_u2b + +Converts the value pointed to by offsetp from a count of UTF-8 chars from +the start of the string, to a count of the equivalent number of bytes; if +lenp is non-zero, it does the same to lenp, but this time starting from +the offset, rather than from the start of the string. Handles magic and +type coercion. + +=cut +*/ + +/* + * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential + * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and + * byte offsets. See also the comments of S_utf8_mg_pos_cache_update(). + * + */ + void Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp) { @@ -5593,7 +5511,7 @@ Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp) if (len) { STRLEN uoffset = (STRLEN) *offsetp; const U8 * const send = start + len; - MAGIC *mg; + MAGIC *mg = NULL; STRLEN boffset = S_sv_pos_u2b_cached(aTHX_ sv, &mg, start, send, uoffset, 0, 0); @@ -5618,29 +5536,32 @@ Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp) return; } -/* -=for apidoc sv_pos_b2u - -Converts the value pointed to by offsetp from a count of bytes from the -start of the string, to a count of the equivalent number of UTF-8 chars. -Handles magic and type coercion. - -=cut +/* Create and update the UTF8 magic offset cache, with the proffered utf8/ + byte length pairing. The (byte) length of the total SV is passed in too, + as blen, because for some (more esoteric) SVs, the call to SvPV_const() + may not have updated SvCUR, so we can't rely on reading it directly. + + The proffered utf8/byte length pairing isn't used if the cache already has + two pairs, and swapping either for the proffered pair would increase the + RMS of the intervals between known byte offsets. + + The cache itself consists of 4 STRLEN values + 0: larger UTF-8 offset + 1: corresponding byte offset + 2: smaller UTF-8 offset + 3: corresponding byte offset + + Unused cache pairs have the value 0, 0. + Keeping the cache "backwards" means that the invariant of + cache[0] >= cache[2] is maintained even with empty slots, which means that + the code that uses it doesn't need to worry if only 1 entry has actually + been set to non-zero. It also makes the "position beyond the end of the + cache" logic much simpler, as the first slot is always the one to start + from. */ - -/* - * sv_pos_b2u() uses, like sv_pos_u2b(), the mg_ptr of the potential - * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and - * byte offsets. See also the comments of S_utf8_mg_pos(). - * - */ - - -static STRLEN -S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target); - static void -S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8) +S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8, + STRLEN blen) { STRLEN *cache; if (SvREADONLY(sv)) @@ -5661,8 +5582,18 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8) if (PL_utf8cache < 0) { const U8 *start = (const U8 *) SvPVX_const(sv); - const STRLEN realutf8 - = S_sv_pos_b2u_forwards(aTHX_ start, start + byte); + const U8 *const end = start + byte; + STRLEN realutf8 = 0; + + while (start < end) { + start += UTF8SKIP(start); + realutf8++; + } + + /* Can't use S_sv_pos_b2u_forwards as it will scream warnings on + surrogates. FIXME - is it inconsistent that b2u warns, but u2b + doesn't? I don't know whether this difference was introduced with + the caching code in 5.8.1. */ if (realutf8 != utf8) { /* Need to turn the assertions off otherwise we may recurse @@ -5673,11 +5604,104 @@ S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8) " real %"UVf" for %"SVf, (UV) utf8, (UV) realutf8, sv); } } - cache[0] = utf8; - cache[1] = byte; - /* Drop the stale "length" cache */ - cache[2] = 0; - cache[3] = 0; + + /* Cache is held with the later position first, to simplify the code + that deals with unbounded ends. */ + + ASSERT_UTF8_CACHE(cache); + if (cache[1] == 0) { + /* Cache is totally empty */ + cache[0] = utf8; + cache[1] = byte; + } else if (cache[3] == 0) { + if (byte > cache[1]) { + /* New one is larger, so goes first. */ + cache[2] = cache[0]; + cache[3] = cache[1]; + cache[0] = utf8; + cache[1] = byte; + } else { + cache[2] = utf8; + cache[3] = byte; + } + } else { +#define THREEWAY_SQUARE(a,b,c,d) \ + ((float)((d) - (c))) * ((float)((d) - (c))) \ + + ((float)((c) - (b))) * ((float)((c) - (b))) \ + + ((float)((b) - (a))) * ((float)((b) - (a))) + + /* Cache has 2 slots in use, and we know three potential pairs. + Keep the two that give the lowest RMS distance. Do the + calcualation in bytes simply because we always know the byte + length. squareroot has the same ordering as the positive value, + so don't bother with the actual square root. */ + const float existing = THREEWAY_SQUARE(0, cache[3], cache[1], blen); + if (byte > cache[1]) { + /* New position is after the existing pair of pairs. */ + const float keep_earlier + = THREEWAY_SQUARE(0, cache[3], byte, blen); + const float keep_later + = THREEWAY_SQUARE(0, cache[1], byte, blen); + + if (keep_later < keep_earlier) { + if (keep_later < existing) { + cache[2] = cache[0]; + cache[3] = cache[1]; + cache[0] = utf8; + cache[1] = byte; + } + } + else { + if (keep_earlier < existing) { + cache[0] = utf8; + cache[1] = byte; + } + } + } + else if (byte > cache[3]) { + /* New position is between the existing pair of pairs. */ + const float keep_earlier + = THREEWAY_SQUARE(0, cache[3], byte, blen); + const float keep_later + = THREEWAY_SQUARE(0, byte, cache[1], blen); + + if (keep_later < keep_earlier) { + if (keep_later < existing) { + cache[2] = utf8; + cache[3] = byte; + } + } + else { + if (keep_earlier < existing) { + cache[0] = utf8; + cache[1] = byte; + } + } + } + else { + /* New position is before the existing pair of pairs. */ + const float keep_earlier + = THREEWAY_SQUARE(0, byte, cache[3], blen); + const float keep_later + = THREEWAY_SQUARE(0, byte, cache[1], blen); + + if (keep_later < keep_earlier) { + if (keep_later < existing) { + cache[2] = utf8; + cache[3] = byte; + } + } + else { + if (keep_earlier < existing) { + cache[0] = cache[2]; + cache[1] = cache[3]; + cache[2] = utf8; + cache[3] = byte; + } + } + } + } + ASSERT_UTF8_CACHE(cache); } /* If we don't know the character offset of the end of a region, our only @@ -5704,8 +5728,8 @@ S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target) } /* We already know all of the way, now we may be able to walk back. The same - assumption is made as in S_utf8_mg_pos(), namely that walking backward is - twice slower than walking forward. */ + assumption is made as in S_sv_pos_u2b_midway(), namely that walking + backward is half the speed of walking forward. */ static STRLEN S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end, STRLEN endu) @@ -5727,21 +5751,39 @@ S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end, return endu; } +/* +=for apidoc sv_pos_b2u + +Converts the value pointed to by offsetp from a count of bytes from the +start of the string, to a count of the equivalent number of UTF-8 chars. +Handles magic and type coercion. + +=cut +*/ + +/* + * sv_pos_b2u() uses, like sv_pos_u2b(), the mg_ptr of the potential + * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and + * byte offsets. + * + */ void Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp) { const U8* s; const STRLEN byte = *offsetp; STRLEN len; + STRLEN blen; MAGIC* mg = NULL; const U8* send; + bool found = FALSE; if (!sv) return; - s = (const U8*)SvPV_const(sv, len); + s = (const U8*)SvPV_const(sv, blen); - if (len < byte) + if (blen < byte) Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset"); send = s + byte; @@ -5753,52 +5795,67 @@ Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp) if (cache[1] == byte) { /* An exact match. */ *offsetp = cache[0]; - return; } - else if (cache[1] < byte) { + if (cache[3] == byte) { + /* An exact match. */ + *offsetp = cache[2]; + return; + } + + if (cache[1] < byte) { /* We already know part of the way. */ if (mg->mg_len != -1) { /* Actually, we know the end too. */ len = cache[0] + S_sv_pos_b2u_midway(aTHX_ s + cache[1], send, - s + len, mg->mg_len - cache[0]); + s + blen, mg->mg_len - cache[0]); } else { len = cache[0] + S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send); } } - else { /* cache[1] > byte */ - len = S_sv_pos_b2u_midway(aTHX_ s, send, s + cache[1], - cache[0]); + else if (cache[3] < byte) { + /* We're between the two cached pairs, so we do the calculation + offset by the byte/utf-8 positions for the earlier pair, + then add the utf-8 characters from the string start to + there. */ + len = S_sv_pos_b2u_midway(aTHX_ s + cache[3], send, + s + cache[1], cache[0] - cache[2]) + + cache[2]; } - ASSERT_UTF8_CACHE(cache); - if (PL_utf8cache < 0) { - const STRLEN reallen = S_sv_pos_b2u_forwards(aTHX_ s, send); - - if (len != reallen) { - /* Need to turn the assertions off otherwise we may recurse - infinitely while printing error messages. */ - SAVEI8(PL_utf8cache); - PL_utf8cache = 0; - Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVf - " real %"UVf" for %"SVf, - (UV) len, (UV) reallen, sv); - } + else { /* cache[3] > byte */ + len = S_sv_pos_b2u_midway(aTHX_ s, send, s + cache[3], + cache[2]); + } + ASSERT_UTF8_CACHE(cache); + found = TRUE; } else if (mg->mg_len != -1) { - len = S_sv_pos_b2u_midway(aTHX_ s, send, s + len, mg->mg_len); - } else { - len = S_sv_pos_b2u_forwards(aTHX_ s, send); + len = S_sv_pos_b2u_midway(aTHX_ s, send, s + blen, mg->mg_len); + found = TRUE; } } - else { - len = S_sv_pos_b2u_forwards(aTHX_ s, send); + if (!found || PL_utf8cache < 0) { + const STRLEN real_len = S_sv_pos_b2u_forwards(aTHX_ s, send); + + if (found && PL_utf8cache < 0) { + if (len != real_len) { + /* Need to turn the assertions off otherwise we may recurse + infinitely while printing error messages. */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVf + " real %"UVf" for %"SVf, + (UV) len, (UV) real_len, sv); + } + } + len = real_len; } *offsetp = len; - S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len); + S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len, blen); } /*