X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=sv.c;h=a19939ed845b5c30563b114e77a849569e09b78a;hb=2977d345acaba2d25f549b813e6f840a0d225b16;hp=b4bc5a78aeb2c51e0740f79baed551db6e536250;hpb=f9ba3d20d5b73b0f2fcbb19d0c0c0ff2222c412a;p=p5sagit%2Fp5-mst-13.2.git diff --git a/sv.c b/sv.c index b4bc5a7..a19939e 100644 --- a/sv.c +++ b/sv.c @@ -30,19 +30,16 @@ #endif #ifdef PERL_UTF8_CACHE_ASSERT -/* The cache element 0 is the Unicode offset; - * the cache element 1 is the byte offset of the element 0; - * the cache element 2 is the Unicode length of the substring; - * the cache element 3 is the byte length of the substring; - * The checking of the substring side would be good - * but substr() has enough code paths to make my head spin; - * if adding more checks watch out for the following tests: +/* if adding more checks watch out for the following tests: * t/op/index.t t/op/length.t t/op/pat.t t/op/substr.t * lib/utf8.t lib/Unicode/Collate/t/index.t * --jhi */ #define ASSERT_UTF8_CACHE(cache) \ - STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); } } STMT_END + STMT_START { if (cache) { assert((cache)[0] <= (cache)[1]); \ + assert((cache)[2] <= (cache)[3]); \ + assert((cache)[3] <= (cache)[1]);} \ + } STMT_END #else #define ASSERT_UTF8_CACHE(cache) NOOP #endif @@ -587,18 +584,6 @@ struct arena_set { struct arena_desc set[ARENAS_PER_SET]; }; -#if !ARENASETS - -static void -S_free_arena(pTHX_ void **root) { - while (root) { - void ** const next = *(void **)root; - Safefree(root); - root = next; - } -} -#endif - /* =for apidoc sv_free_arenas @@ -627,7 +612,6 @@ Perl_sv_free_arenas(pTHX) Safefree(sva); } -#if ARENASETS { struct arena_set *next, *aroot = (struct arena_set*) PL_body_arenas; @@ -641,9 +625,6 @@ Perl_sv_free_arenas(pTHX) Safefree(aroot); } } -#else - S_free_arena(aTHX_ (void**) PL_body_arenas); -#endif PL_body_arenas = 0; for (i=0; inext = PL_body_arenas; - PL_body_arenas = arp; - return arp; - -#else struct arena_desc* adesc; struct arena_set *newroot, **aroot = (struct arena_set**) &PL_body_arenas; int curr; @@ -737,7 +706,6 @@ Perl_get_arena(pTHX_ int arena_size) curr, adesc->arena, arena_size)); return adesc->arena; -#endif } @@ -1081,7 +1049,7 @@ S_more_bodies (pTHX_ svtype sv_type) #ifdef DEBUGGING if (!done_sanity_check) { - int i = SVt_LAST; + unsigned int i = SVt_LAST; done_sanity_check = TRUE; @@ -1094,17 +1062,11 @@ S_more_bodies (pTHX_ svtype sv_type) end = start + bdp->arena_size - body_size; -#if !ARENASETS - /* The initial slot is used to link the arenas together, so it isn't to be - linked into the list of ready-to-use bodies. */ - start += body_size; -#else /* computed count doesnt reflect the 1st slot reservation */ DEBUG_m(PerlIO_printf(Perl_debug_log, "arena %p end %p arena-size %d type %d size %d ct %d\n", start, end, bdp->arena_size, sv_type, body_size, bdp->arena_size / body_size)); -#endif *root = (void *)start; @@ -1444,6 +1406,10 @@ Perl_sv_grow(pTHX_ register SV *sv, register STRLEN newlen) { register char *s; + if (PL_madskills && newlen >= 0x100000) { + PerlIO_printf(Perl_debug_log, + "Allocation too large: %"UVxf"\n", (UV)newlen); + } #ifdef HAS_64K_LIMIT if (newlen >= 0x10000) { PerlIO_printf(Perl_debug_log, @@ -1926,6 +1892,13 @@ S_sv_2iuv_common(pTHX_ SV *sv) { certainly cast into the IV range at IV_MAX, whereas the correct answer is the UV IV_MAX +1. Hence < ensures that dodgy boundary cases go to UV */ +#if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) + if (Perl_isnan(SvNVX(sv))) { + SvUV_set(sv, 0); + SvIsUV_on(sv); + return FALSE; + } +#endif if (SvNVX(sv) < (NV)IV_MAX + 0.5) { SvIV_set(sv, I_V(SvNVX(sv))); if (SvNVX(sv) == (NV) SvIVX(sv) @@ -2137,7 +2110,7 @@ S_sv_2iuv_common(pTHX_ SV *sv) { } else { if (isGV_with_GP(sv)) { - return PTR2IV(glob_2inpuv((GV *)sv, NULL, TRUE)); + return (bool)PTR2IV(glob_2inpuv((GV *)sv, NULL, TRUE)); } if (!(SvFLAGS(sv) & SVs_PADTMP)) { @@ -3216,8 +3189,7 @@ S_glob_assign_glob(pTHX_ SV *dstr, SV *sstr, const int dtype) GvSTASH(dstr) = GvSTASH(sstr); if (GvSTASH(dstr)) Perl_sv_add_backref(aTHX_ (SV*)GvSTASH(dstr), dstr); - GvNAME(dstr) = savepvn(name, len); - GvNAMELEN(dstr) = len; + gv_name_set((GV *)dstr, name, len, GV_ADD); SvFAKE_on(dstr); /* can coerce to non-glob */ } @@ -3461,10 +3433,7 @@ Perl_sv_setsv_flags(pTHX_ SV *dstr, register SV *sstr, I32 flags) if (dtype < SVt_PVNV) sv_upgrade(dstr, SVt_PVNV); break; - case SVt_PVAV: - case SVt_PVHV: - case SVt_PVCV: - case SVt_PVIO: + default: { const char * const type = sv_reftype(sstr,0); if (PL_op) @@ -3481,7 +3450,9 @@ Perl_sv_setsv_flags(pTHX_ SV *dstr, register SV *sstr, I32 flags) } /*FALLTHROUGH*/ - default: + case SVt_PVMG: + case SVt_PVLV: + case SVt_PVBM: if (SvGMAGICAL(sstr) && (flags & SV_GMAGIC)) { mg_get(sstr); if ((int)SvTYPE(sstr) != stype) { @@ -3685,7 +3656,8 @@ Perl_sv_setsv_flags(pTHX_ SV *dstr, register SV *sstr, I32 flags) if (sflags & SVf_IVisUV) SvIsUV_on(dstr); } - SvFLAGS(dstr) |= sflags & (SVf_IOK|SVp_IOK|SVf_NOK|SVp_NOK|SVf_UTF8); + SvFLAGS(dstr) |= sflags & (SVf_IOK|SVp_IOK|SVf_NOK|SVp_NOK|SVf_UTF8 + |SVf_AMAGIC); { const MAGIC * const smg = SvVOK(sstr); if (smg) { @@ -3697,7 +3669,8 @@ Perl_sv_setsv_flags(pTHX_ SV *dstr, register SV *sstr, I32 flags) } else if (sflags & (SVp_IOK|SVp_NOK)) { (void)SvOK_off(dstr); - SvFLAGS(dstr) |= sflags & (SVf_IOK|SVp_IOK|SVf_IVisUV|SVf_NOK|SVp_NOK); + SvFLAGS(dstr) |= sflags & (SVf_IOK|SVp_IOK|SVf_IVisUV|SVf_NOK|SVp_NOK + |SVf_AMAGIC); if (sflags & SVp_IOK) { /* XXXX Do we want to set IsUV for IV(ROK)? Be extra safe... */ SvIV_set(dstr, SvIVX(sstr)); @@ -3717,6 +3690,7 @@ Perl_sv_setsv_flags(pTHX_ SV *dstr, register SV *sstr, I32 flags) SvFAKE_off(sstr); gv_efullname3(dstr, (GV *)sstr, "*"); SvFLAGS(sstr) |= wasfake; + SvFLAGS(dstr) |= sflags & SVf_AMAGIC; } else (void)SvOK_off(dstr); @@ -4606,7 +4580,7 @@ Perl_sv_unmagic(pTHX_ SV *sv, int type) Safefree(mg->mg_ptr); else if (mg->mg_len == HEf_SVKEY) SvREFCNT_dec((SV*)mg->mg_ptr); - else if (mg->mg_type == PERL_MAGIC_utf8 && mg->mg_ptr) + else if (mg->mg_type == PERL_MAGIC_utf8) Safefree(mg->mg_ptr); } if (mg->mg_flags & MGf_REFCOUNTED) @@ -5113,7 +5087,9 @@ Perl_sv_clear(pTHX_ register SV *sv) goto freescalar; case SVt_PVGV: gp_free((GV*)sv); - Safefree(GvNAME(sv)); + if (GvNAME_HEK(sv)) { + unshare_hek(GvNAME_HEK(sv)); + } /* If we're in a stash, we don't own a reference to it. However it does have a back reference to us, which needs to be cleared. */ if (GvSTASH(sv)) @@ -5298,8 +5274,10 @@ UTF-8 bytes as a single character. Handles magic and type coercion. /* * The length is cached in PERL_UTF8_magic, in the mg_len field. Also the - * mg_ptr is used, by sv_pos_u2b(), see the comments of S_utf8_mg_pos_init(). - * (Note that the mg_len is not the length of the mg_ptr field.) + * mg_ptr is used, by sv_pos_u2b() and sv_pos_b2u() - see the comments below. + * (Note that the mg_len is not the length of the mg_ptr field. + * This allows the cache to store the character length of the string without + * needing to malloc() extra storage to attach to the mg_ptr.) * */ @@ -5313,185 +5291,194 @@ Perl_sv_len_utf8(pTHX_ register SV *sv) return mg_length(sv); else { - STRLEN len, ulen; + STRLEN len; const U8 *s = (U8*)SvPV_const(sv, len); - MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : 0; - if (mg && mg->mg_len != -1 && (mg->mg_len > 0 || len == 0)) { - ulen = mg->mg_len; -#ifdef PERL_UTF8_CACHE_ASSERT - assert(ulen == Perl_utf8_length(aTHX_ s, s + len)); -#endif - } - else { - ulen = Perl_utf8_length(aTHX_ s, s + len); - if (!mg && !SvREADONLY(sv)) { - sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0); - mg = mg_find(sv, PERL_MAGIC_utf8); - assert(mg); + if (PL_utf8cache) { + STRLEN ulen; + MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : 0; + + if (mg && mg->mg_len != -1) { + ulen = mg->mg_len; + if (PL_utf8cache < 0) { + const STRLEN real = Perl_utf8_length(aTHX_ s, s + len); + if (real != ulen) { + /* Need to turn the assertions off otherwise we may + recurse infinitely while printing error messages. + */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: sv_len_utf8 cache %"UVf + " real %"UVf" for %"SVf, + (UV) ulen, (UV) real, sv); + } + } } - if (mg) - mg->mg_len = ulen; + else { + ulen = Perl_utf8_length(aTHX_ s, s + len); + if (!SvREADONLY(sv)) { + if (!mg) { + mg = sv_magicext(sv, 0, PERL_MAGIC_utf8, + &PL_vtbl_utf8, 0, 0); + } + assert(mg); + mg->mg_len = ulen; + } + } + return ulen; } - return ulen; + return Perl_utf8_length(aTHX_ s, s + len); } } -/* S_utf8_mg_pos_init() is used to initialize the mg_ptr field of - * a PERL_UTF8_magic. The mg_ptr is used to store the mapping - * between UTF-8 and byte offsets. There are two (substr offset and substr - * length, the i offset, PERL_MAGIC_UTF8_CACHESIZE) times two (UTF-8 offset - * and byte offset) cache positions. - * - * The mg_len field is used by sv_len_utf8(), see its comments. - * Note that the mg_len is not the length of the mg_ptr field. - * - */ -STATIC bool -S_utf8_mg_pos_init(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, - I32 offsetp, const U8 *s, const U8 *start) +/* Walk forwards to find the byte corresponding to the passed in UTF-8 + offset. */ +static STRLEN +S_sv_pos_u2b_forwards(pTHX_ const U8 *const start, const U8 *const send, + STRLEN uoffset) { - bool found = FALSE; - - if (SvMAGICAL(sv) && !SvREADONLY(sv)) { - if (!*mgp) - *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, (MGVTBL*)&PL_vtbl_utf8, 0, 0); - assert(*mgp); + const U8 *s = start; - if ((*mgp)->mg_ptr) - *cachep = (STRLEN *) (*mgp)->mg_ptr; - else { - Newxz(*cachep, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN); - (*mgp)->mg_ptr = (char *) *cachep; - } - assert(*cachep); + while (s < send && uoffset--) + s += UTF8SKIP(s); + if (s > send) { + /* This is the existing behaviour. Possibly it should be a croak, as + it's actually a bounds error */ + s = send; + } + return s - start; +} - (*cachep)[i] = offsetp; - (*cachep)[i+1] = s - start; - found = TRUE; +/* Given the length of the string in both bytes and UTF-8 characters, decide + whether to walk forwards or backwards to find the byte corresponding to + the passed in UTF-8 offset. */ +static STRLEN +S_sv_pos_u2b_midway(pTHX_ const U8 *const start, const U8 *send, + STRLEN uoffset, STRLEN uend) +{ + STRLEN backw = uend - uoffset; + if (uoffset < 2 * backw) { + /* The assumption is that going forwards is twice the speed of going + forward (that's where the 2 * backw comes from). + (The real figure of course depends on the UTF-8 data.) */ + return S_sv_pos_u2b_forwards(aTHX_ start, send, uoffset); } - return found; + while (backw--) { + send--; + while (UTF8_IS_CONTINUATION(*send)) + send--; + } + return send - start; } -/* - * S_utf8_mg_pos() is used to query and update mg_ptr field of - * a PERL_UTF8_magic. The mg_ptr is used to store the mapping - * between UTF-8 and byte offsets. See also the comments of - * S_utf8_mg_pos_init(). - * - */ -STATIC bool -S_utf8_mg_pos(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, I32 uoff, const U8 **sp, const U8 *start, const U8 *send) -{ +/* For the string representation of the given scalar, find the byte + corresponding to the passed in UTF-8 offset. uoffset0 and boffset0 + give another position in the string, *before* the sought offset, which + (which is always true, as 0, 0 is a valid pair of positions), which should + help reduce the amount of linear searching. + If *mgp is non-NULL, it should point to the UTF-8 cache magic, which + will be used to reduce the amount of linear searching. The cache will be + created if necessary, and the found value offered to it for update. */ +static STRLEN +S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start, + const U8 *const send, STRLEN uoffset, + STRLEN uoffset0, STRLEN boffset0) { + STRLEN boffset; bool found = FALSE; - if (SvMAGICAL(sv) && !SvREADONLY(sv)) { - if (!*mgp) - *mgp = mg_find(sv, PERL_MAGIC_utf8); - if (*mgp && (*mgp)->mg_ptr) { - *cachep = (STRLEN *) (*mgp)->mg_ptr; - ASSERT_UTF8_CACHE(*cachep); - if ((*cachep)[i] == (STRLEN)uoff) /* An exact match. */ - found = TRUE; - else { /* We will skip to the right spot. */ - STRLEN forw = 0; - STRLEN backw = 0; - const U8* p = NULL; - - /* The assumption is that going backward is half - * the speed of going forward (that's where the - * 2 * backw in the below comes from). (The real - * figure of course depends on the UTF-8 data.) */ - - if ((*cachep)[i] > (STRLEN)uoff) { - forw = uoff; - backw = (*cachep)[i] - (STRLEN)uoff; - - if (forw < 2 * backw) - p = start; - else - p = start + (*cachep)[i+1]; - } - /* Try this only for the substr offset (i == 0), - * not for the substr length (i == 2). */ - else if (i == 0) { /* (*cachep)[i] < uoff */ - const STRLEN ulen = sv_len_utf8(sv); - - if ((STRLEN)uoff < ulen) { - forw = (STRLEN)uoff - (*cachep)[i]; - backw = ulen - (STRLEN)uoff; - - if (forw < 2 * backw) - p = start + (*cachep)[i+1]; - else - p = send; - } - - /* If the string is not long enough for uoff, - * we could extend it, but not at this low a level. */ - } - - if (p) { - if (forw < 2 * backw) { - while (forw--) - p += UTF8SKIP(p); - } - else { - while (backw--) { - p--; - while (UTF8_IS_CONTINUATION(*p)) - p--; - } - } - - /* Update the cache. */ - (*cachep)[i] = (STRLEN)uoff; - (*cachep)[i+1] = p - start; - - /* Drop the stale "length" cache */ - if (i == 0) { - (*cachep)[2] = 0; - (*cachep)[3] = 0; - } - - found = TRUE; - } - } - if (found) { /* Setup the return values. */ - *offsetp = (*cachep)[i+1]; - *sp = start + *offsetp; - if (*sp >= send) { - *sp = send; - *offsetp = send - start; - } - else if (*sp < start) { - *sp = start; - *offsetp = 0; - } + assert (uoffset >= uoffset0); + + if (SvMAGICAL(sv) && !SvREADONLY(sv) && PL_utf8cache + && (*mgp || (*mgp = mg_find(sv, PERL_MAGIC_utf8)))) { + if ((*mgp)->mg_ptr) { + STRLEN *cache = (STRLEN *) (*mgp)->mg_ptr; + if (cache[0] == uoffset) { + /* An exact match. */ + return cache[1]; + } + if (cache[2] == uoffset) { + /* An exact match. */ + return cache[3]; + } + + if (cache[0] < uoffset) { + /* The cache already knows part of the way. */ + if (cache[0] > uoffset0) { + /* The cache knows more than the passed in pair */ + uoffset0 = cache[0]; + boffset0 = cache[1]; + } + if ((*mgp)->mg_len != -1) { + /* And we know the end too. */ + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, send, + uoffset - uoffset0, + (*mgp)->mg_len - uoffset0); + } else { + boffset = boffset0 + + S_sv_pos_u2b_forwards(aTHX_ start + boffset0, + send, uoffset - uoffset0); + } + } + else if (cache[2] < uoffset) { + /* We're between the two cache entries. */ + if (cache[2] > uoffset0) { + /* and the cache knows more than the passed in pair */ + uoffset0 = cache[2]; + boffset0 = cache[3]; + } + + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, + start + cache[1], + uoffset - uoffset0, + cache[0] - uoffset0); + } else { + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, + start + cache[3], + uoffset - uoffset0, + cache[2] - uoffset0); } + found = TRUE; } -#ifdef PERL_UTF8_CACHE_ASSERT - if (found) { - U8 *s = start; - I32 n = uoff; + else if ((*mgp)->mg_len != -1) { + /* If we can take advantage of a passed in offset, do so. */ + /* In fact, offset0 is either 0, or less than offset, so don't + need to worry about the other possibility. */ + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, send, + uoffset - uoffset0, + (*mgp)->mg_len - uoffset0); + found = TRUE; + } + } - while (n-- && s < send) - s += UTF8SKIP(s); + if (!found || PL_utf8cache < 0) { + const STRLEN real_boffset + = boffset0 + S_sv_pos_u2b_forwards(aTHX_ start + boffset0, + send, uoffset - uoffset0); - if (i == 0) { - assert(*offsetp == s - start); - assert((*cachep)[0] == (STRLEN)uoff); - assert((*cachep)[1] == *offsetp); - } - ASSERT_UTF8_CACHE(*cachep); + if (found && PL_utf8cache < 0) { + if (real_boffset != boffset) { + /* Need to turn the assertions off otherwise we may recurse + infinitely while printing error messages. */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: sv_pos_u2b_cache cache %"UVf + " real %"UVf" for %"SVf, + (UV) boffset, (UV) real_boffset, sv); + } } -#endif + boffset = real_boffset; } - return found; + S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset, send - start); + return boffset; } + /* =for apidoc sv_pos_u2b @@ -5507,7 +5494,7 @@ type coercion. /* * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and - * byte offsets. See also the comments of S_utf8_mg_pos(). + * byte offsets. See also the comments of S_utf8_mg_pos_cache_update(). * */ @@ -5522,42 +5509,23 @@ Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp) start = (U8*)SvPV_const(sv, len); if (len) { - STRLEN boffset = 0; - STRLEN *cache = NULL; - const U8 *s = start; - I32 uoffset = *offsetp; - const U8 * const send = s + len; + STRLEN uoffset = (STRLEN) *offsetp; + const U8 * const send = start + len; MAGIC *mg = NULL; - bool found = utf8_mg_pos(sv, &mg, &cache, 0, offsetp, *offsetp, &s, start, send); - - if (!found && uoffset > 0) { - while (s < send && uoffset--) - s += UTF8SKIP(s); - if (s >= send) - s = send; - if (utf8_mg_pos_init(sv, &mg, &cache, 0, *offsetp, s, start)) - boffset = cache[1]; - *offsetp = s - start; - } - if (lenp) { - found = FALSE; - start = s; - if (utf8_mg_pos(sv, &mg, &cache, 2, lenp, *lenp, &s, start, send)) { - *lenp -= boffset; - found = TRUE; - } - if (!found && *lenp > 0) { - I32 ulen = *lenp; - if (ulen > 0) - while (s < send && ulen--) - s += UTF8SKIP(s); - if (s >= send) - s = send; - utf8_mg_pos_init(sv, &mg, &cache, 2, *lenp, s, start); - } - *lenp = s - start; - } - ASSERT_UTF8_CACHE(cache); + STRLEN boffset = S_sv_pos_u2b_cached(aTHX_ sv, &mg, start, send, + uoffset, 0, 0); + + *offsetp = (I32) boffset; + + if (lenp) { + /* Convert the relative offset to absolute. */ + STRLEN uoffset2 = uoffset + (STRLEN) *lenp; + STRLEN boffset2 + = S_sv_pos_u2b_cached(aTHX_ sv, &mg, start, send, uoffset2, + uoffset, boffset) - boffset; + + *lenp = boffset2; + } } else { *offsetp = 0; @@ -5568,6 +5536,221 @@ Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp) return; } +/* Create and update the UTF8 magic offset cache, with the proffered utf8/ + byte length pairing. The (byte) length of the total SV is passed in too, + as blen, because for some (more esoteric) SVs, the call to SvPV_const() + may not have updated SvCUR, so we can't rely on reading it directly. + + The proffered utf8/byte length pairing isn't used if the cache already has + two pairs, and swapping either for the proffered pair would increase the + RMS of the intervals between known byte offsets. + + The cache itself consists of 4 STRLEN values + 0: larger UTF-8 offset + 1: corresponding byte offset + 2: smaller UTF-8 offset + 3: corresponding byte offset + + Unused cache pairs have the value 0, 0. + Keeping the cache "backwards" means that the invariant of + cache[0] >= cache[2] is maintained even with empty slots, which means that + the code that uses it doesn't need to worry if only 1 entry has actually + been set to non-zero. It also makes the "position beyond the end of the + cache" logic much simpler, as the first slot is always the one to start + from. +*/ +static void +S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8, + STRLEN blen) +{ + STRLEN *cache; + if (SvREADONLY(sv)) + return; + + if (!*mgp) { + *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, (MGVTBL*)&PL_vtbl_utf8, 0, + 0); + (*mgp)->mg_len = -1; + } + assert(*mgp); + + if (!(cache = (STRLEN *)(*mgp)->mg_ptr)) { + Newxz(cache, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN); + (*mgp)->mg_ptr = (char *) cache; + } + assert(cache); + + if (PL_utf8cache < 0) { + const U8 *start = (const U8 *) SvPVX_const(sv); + const U8 *const end = start + byte; + STRLEN realutf8 = 0; + + while (start < end) { + start += UTF8SKIP(start); + realutf8++; + } + + /* Can't use S_sv_pos_b2u_forwards as it will scream warnings on + surrogates. FIXME - is it inconsistent that b2u warns, but u2b + doesn't? I don't know whether this difference was introduced with + the caching code in 5.8.1. */ + + if (realutf8 != utf8) { + /* Need to turn the assertions off otherwise we may recurse + infinitely while printing error messages. */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: utf8_mg_pos_cache_update cache %"UVf + " real %"UVf" for %"SVf, (UV) utf8, (UV) realutf8, sv); + } + } + + /* Cache is held with the later position first, to simplify the code + that deals with unbounded ends. */ + + ASSERT_UTF8_CACHE(cache); + if (cache[1] == 0) { + /* Cache is totally empty */ + cache[0] = utf8; + cache[1] = byte; + } else if (cache[3] == 0) { + if (byte > cache[1]) { + /* New one is larger, so goes first. */ + cache[2] = cache[0]; + cache[3] = cache[1]; + cache[0] = utf8; + cache[1] = byte; + } else { + cache[2] = utf8; + cache[3] = byte; + } + } else { +#define THREEWAY_SQUARE(a,b,c,d) \ + ((float)((d) - (c))) * ((float)((d) - (c))) \ + + ((float)((c) - (b))) * ((float)((c) - (b))) \ + + ((float)((b) - (a))) * ((float)((b) - (a))) + + /* Cache has 2 slots in use, and we know three potential pairs. + Keep the two that give the lowest RMS distance. Do the + calcualation in bytes simply because we always know the byte + length. squareroot has the same ordering as the positive value, + so don't bother with the actual square root. */ + const float existing = THREEWAY_SQUARE(0, cache[3], cache[1], blen); + if (byte > cache[1]) { + /* New position is after the existing pair of pairs. */ + const float keep_earlier + = THREEWAY_SQUARE(0, cache[3], byte, blen); + const float keep_later + = THREEWAY_SQUARE(0, cache[1], byte, blen); + + if (keep_later < keep_earlier) { + if (keep_later < existing) { + cache[2] = cache[0]; + cache[3] = cache[1]; + cache[0] = utf8; + cache[1] = byte; + } + } + else { + if (keep_earlier < existing) { + cache[0] = utf8; + cache[1] = byte; + } + } + } + else if (byte > cache[3]) { + /* New position is between the existing pair of pairs. */ + const float keep_earlier + = THREEWAY_SQUARE(0, cache[3], byte, blen); + const float keep_later + = THREEWAY_SQUARE(0, byte, cache[1], blen); + + if (keep_later < keep_earlier) { + if (keep_later < existing) { + cache[2] = utf8; + cache[3] = byte; + } + } + else { + if (keep_earlier < existing) { + cache[0] = utf8; + cache[1] = byte; + } + } + } + else { + /* New position is before the existing pair of pairs. */ + const float keep_earlier + = THREEWAY_SQUARE(0, byte, cache[3], blen); + const float keep_later + = THREEWAY_SQUARE(0, byte, cache[1], blen); + + if (keep_later < keep_earlier) { + if (keep_later < existing) { + cache[2] = utf8; + cache[3] = byte; + } + } + else { + if (keep_earlier < existing) { + cache[0] = cache[2]; + cache[1] = cache[3]; + cache[2] = utf8; + cache[3] = byte; + } + } + } + } + ASSERT_UTF8_CACHE(cache); +} + +/* If we don't know the character offset of the end of a region, our only + option is to walk forwards to the target byte offset. */ +static STRLEN +S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target) +{ + STRLEN len = 0; + while (s < target) { + STRLEN n = 1; + + /* Call utf8n_to_uvchr() to validate the sequence + * (unless a simple non-UTF character) */ + if (!UTF8_IS_INVARIANT(*s)) + utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0); + if (n > 0) { + s += n; + len++; + } + else + break; + } + return len; +} + +/* We already know all of the way, now we may be able to walk back. The same + assumption is made as in S_sv_pos_u2b_midway(), namely that walking + backward is half the speed of walking forward. */ +static STRLEN +S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end, + STRLEN endu) +{ + const STRLEN forw = target - s; + STRLEN backw = end - target; + + if (forw < 2 * backw) { + return S_sv_pos_b2u_forwards(aTHX_ s, target); + } + + while (end > target) { + end--; + while (UTF8_IS_CONTINUATION(*end)) { + end--; + } + endu--; + } + return endu; +} + /* =for apidoc sv_pos_b2u @@ -5581,121 +5764,98 @@ Handles magic and type coercion. /* * sv_pos_b2u() uses, like sv_pos_u2b(), the mg_ptr of the potential * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and - * byte offsets. See also the comments of S_utf8_mg_pos(). + * byte offsets. * */ - void Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp) { const U8* s; + const STRLEN byte = *offsetp; STRLEN len; + STRLEN blen; + MAGIC* mg = NULL; + const U8* send; + bool found = FALSE; if (!sv) return; - s = (const U8*)SvPV_const(sv, len); - if ((I32)len < *offsetp) - Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset"); - else { - const U8* send = s + *offsetp; - MAGIC* mg = NULL; - STRLEN *cache = NULL; - - len = 0; - - if (SvMAGICAL(sv) && !SvREADONLY(sv)) { - mg = mg_find(sv, PERL_MAGIC_utf8); - if (mg && mg->mg_ptr) { - cache = (STRLEN *) mg->mg_ptr; - if (cache[1] == (STRLEN)*offsetp) { - /* An exact match. */ - *offsetp = cache[0]; + s = (const U8*)SvPV_const(sv, blen); - return; - } - else if (cache[1] < (STRLEN)*offsetp) { - /* We already know part of the way. */ - len = cache[0]; - s += cache[1]; - /* Let the below loop do the rest. */ - } - else { /* cache[1] > *offsetp */ - /* We already know all of the way, now we may - * be able to walk back. The same assumption - * is made as in S_utf8_mg_pos(), namely that - * walking backward is twice slower than - * walking forward. */ - const STRLEN forw = *offsetp; - STRLEN backw = cache[1] - *offsetp; - - if (!(forw < 2 * backw)) { - const U8 *p = s + cache[1]; - STRLEN ubackw = 0; - - cache[1] -= backw; - - while (backw--) { - p--; - while (UTF8_IS_CONTINUATION(*p)) { - p--; - backw--; - } - ubackw++; - } + if (blen < byte) + Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset"); - cache[0] -= ubackw; - *offsetp = cache[0]; + send = s + byte; - /* Drop the stale "length" cache */ - cache[2] = 0; - cache[3] = 0; + if (SvMAGICAL(sv) && !SvREADONLY(sv) && PL_utf8cache + && (mg = mg_find(sv, PERL_MAGIC_utf8))) { + if (mg->mg_ptr) { + STRLEN *cache = (STRLEN *) mg->mg_ptr; + if (cache[1] == byte) { + /* An exact match. */ + *offsetp = cache[0]; + return; + } + if (cache[3] == byte) { + /* An exact match. */ + *offsetp = cache[2]; + return; + } - return; - } + if (cache[1] < byte) { + /* We already know part of the way. */ + if (mg->mg_len != -1) { + /* Actually, we know the end too. */ + len = cache[0] + + S_sv_pos_b2u_midway(aTHX_ s + cache[1], send, + s + blen, mg->mg_len - cache[0]); + } else { + len = cache[0] + + S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send); } } - ASSERT_UTF8_CACHE(cache); - } - - while (s < send) { - STRLEN n = 1; + else if (cache[3] < byte) { + /* We're between the two cached pairs, so we do the calculation + offset by the byte/utf-8 positions for the earlier pair, + then add the utf-8 characters from the string start to + there. */ + len = S_sv_pos_b2u_midway(aTHX_ s + cache[3], send, + s + cache[1], cache[0] - cache[2]) + + cache[2]; - /* Call utf8n_to_uvchr() to validate the sequence - * (unless a simple non-UTF character) */ - if (!UTF8_IS_INVARIANT(*s)) - utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0); - if (n > 0) { - s += n; - len++; } - else - break; - } + else { /* cache[3] > byte */ + len = S_sv_pos_b2u_midway(aTHX_ s, send, s + cache[3], + cache[2]); - if (!SvREADONLY(sv)) { - if (!mg) { - sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0); - mg = mg_find(sv, PERL_MAGIC_utf8); } - assert(mg); + ASSERT_UTF8_CACHE(cache); + found = TRUE; + } else if (mg->mg_len != -1) { + len = S_sv_pos_b2u_midway(aTHX_ s, send, s + blen, mg->mg_len); + found = TRUE; + } + } + if (!found || PL_utf8cache < 0) { + const STRLEN real_len = S_sv_pos_b2u_forwards(aTHX_ s, send); - if (!mg->mg_ptr) { - Newxz(cache, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN); - mg->mg_ptr = (char *) cache; + if (found && PL_utf8cache < 0) { + if (len != real_len) { + /* Need to turn the assertions off otherwise we may recurse + infinitely while printing error messages. */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVf + " real %"UVf" for %"SVf, + (UV) len, (UV) real_len, sv); } - assert(cache); - - cache[0] = len; - cache[1] = *offsetp; - /* Drop the stale "length" cache */ - cache[2] = 0; - cache[3] = 0; } - - *offsetp = len; + len = real_len; } - return; + *offsetp = len; + + S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len, blen); } /* @@ -5971,8 +6131,12 @@ Perl_sv_collxfrm(pTHX_ SV *sv, STRLEN *nxp) return xf + sizeof(PL_collation_ix); } if (! mg) { - sv_magic(sv, 0, PERL_MAGIC_collxfrm, 0, 0); - mg = mg_find(sv, PERL_MAGIC_collxfrm); +#ifdef PERL_OLD_COPY_ON_WRITE + if (SvIsCOW(sv)) + sv_force_normal_flags(sv, 0); +#endif + mg = sv_magicext(sv, 0, PERL_MAGIC_collxfrm, &PL_vtbl_collxfrm, + 0, 0); assert(mg); } mg->mg_ptr = xf; @@ -6982,10 +7146,10 @@ Perl_newRV_noinc(pTHX_ SV *tmpRef) */ SV * -Perl_newRV(pTHX_ SV *tmpRef) +Perl_newRV(pTHX_ SV *sv) { dVAR; - return newRV_noinc(SvREFCNT_inc_simple(tmpRef)); + return newRV_noinc(SvREFCNT_inc_simple_NN(sv)); } /* @@ -7717,7 +7881,9 @@ S_sv_unglob(pTHX_ SV *sv) GvSTASH(sv) = NULL; } GvMULTI_off(sv); - Safefree(GvNAME(sv)); + if (GvNAME_HEK(sv)) { + unshare_hek(GvNAME_HEK(sv)); + } SvSCREAM_off(sv); /* need to keep SvANY(sv) in the right arena */ @@ -9215,6 +9381,7 @@ ptr_table_* functions. #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) +#define sv_dup_inc_NN(s,t) SvREFCNT_inc_NN(sv_dup(s,t)) #define av_dup(s,t) (AV*)sv_dup((SV*)s,t) #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t)) #define hv_dup(s,t) (HV*)sv_dup((SV*)s,t) @@ -9837,7 +10004,9 @@ Perl_sv_dup(pTHX_ const SV *sstr, CLONE_PARAMS* param) LvTARG(dstr) = sv_dup_inc(LvTARG(dstr), param); break; case SVt_PVGV: - GvNAME(dstr) = SAVEPVN(GvNAME(dstr), GvNAMELEN(dstr)); + if (GvNAME_HEK(dstr)) + GvNAME_HEK(dstr) = hek_dup(GvNAME_HEK(dstr), param); + /* Don't call sv_add_backref here as it's going to be created as part of the magic cloning of the symbol table. */ GvSTASH(dstr) = hv_dup(GvSTASH(dstr), param); @@ -10757,8 +10926,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, const I32 len = av_len((AV*)proto_perl->Iregex_padav); SV* const * const regexen = AvARRAY((AV*)proto_perl->Iregex_padav); IV i; - av_push(PL_regex_padav, - sv_dup_inc(regexen[0],param)); + av_push(PL_regex_padav, sv_dup_inc_NN(regexen[0],param)); for(i = 1; i <= len; i++) { const SV * const regex = regexen[i]; SV * const sv = @@ -10935,9 +11103,26 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, i = (PL_lex_casemods < 12 ? 12 : PL_lex_casemods); PL_lex_casestack = SAVEPVN(proto_perl->Ilex_casestack,i); +#ifdef PERL_MAD + Copy(proto_perl->Inexttoke, PL_nexttoke, 5, NEXTTOKE); + PL_lasttoke = proto_perl->Ilasttoke; + PL_realtokenstart = proto_perl->Irealtokenstart; + PL_faketokens = proto_perl->Ifaketokens; + PL_thismad = proto_perl->Ithismad; + PL_thistoken = proto_perl->Ithistoken; + PL_thisopen = proto_perl->Ithisopen; + PL_thisstuff = proto_perl->Ithisstuff; + PL_thisclose = proto_perl->Ithisclose; + PL_thiswhite = proto_perl->Ithiswhite; + PL_nextwhite = proto_perl->Inextwhite; + PL_skipwhite = proto_perl->Iskipwhite; + PL_endwhite = proto_perl->Iendwhite; + PL_curforce = proto_perl->Icurforce; +#else Copy(proto_perl->Inextval, PL_nextval, 5, YYSTYPE); Copy(proto_perl->Inexttype, PL_nexttype, 5, I32); PL_nexttoke = proto_perl->Inexttoke; +#endif /* XXX This is probably masking the deeper issue of why * SvANY(proto_perl->Ilinestr) can be NULL at this point. For test case: