{ PERL_MAGIC_uvar_elem, "uvar_elem(v)" },
{ PERL_MAGIC_vec, "vec(v)" },
{ PERL_MAGIC_vstring, "vstring(V)" },
+ { PERL_MAGIC_utf8, "utf8(w)" },
{ PERL_MAGIC_substr, "substr(x)" },
{ PERL_MAGIC_defelem, "defelem(y)" },
{ PERL_MAGIC_ext, "ext(~)" },
else if (v == &PL_vtbl_amagic) s = "amagic";
else if (v == &PL_vtbl_amagicelem) s = "amagicelem";
else if (v == &PL_vtbl_backref) s = "backref";
+ else if (v == &PL_vtbl_utf8) s = "utf8";
if (s)
Perl_dump_indent(aTHX_ level, file, " MG_VIRTUAL = &PL_vtbl_%s\n", s);
else
if (mg->mg_ptr) {
Perl_dump_indent(aTHX_ level, file, " MG_PTR = 0x%"UVxf, PTR2UV(mg->mg_ptr));
if (mg->mg_len >= 0) {
- SV *sv = newSVpvn("", 0);
- PerlIO_printf(file, " %s", pv_display(sv, mg->mg_ptr, mg->mg_len, 0, pvlim));
- SvREFCNT_dec(sv);
+ if (mg->mg_type != PERL_MAGIC_utf8) {
+ SV *sv = newSVpvn("", 0);
+ PerlIO_printf(file, " %s", pv_display(sv, mg->mg_ptr, mg->mg_len, 0, pvlim));
+ SvREFCNT_dec(sv);
+ }
}
else if (mg->mg_len == HEf_SVKEY) {
PerlIO_puts(file, " => HEf_SVKEY\n");
PerlIO_puts(file, " ???? - please notify IZ");
PerlIO_putc(file, '\n');
}
+ if (mg->mg_type == PERL_MAGIC_utf8) {
+ STRLEN *cache = (STRLEN *) mg->mg_ptr;
+ if (cache) {
+ IV i;
+ for (i = 0; i < PERL_MAGIC_UTF8_CACHESIZE; i++)
+ Perl_dump_indent(aTHX_ level, file,
+ " %2"IVdf": %"UVuf" -> %"UVuf"\n",
+ i,
+ (UV)cache[i * 2],
+ (UV)cache[i * 2 + 1]);
+ }
+ }
}
}
p |int |magic_settaint |SV* sv|MAGIC* mg
p |int |magic_setuvar |SV* sv|MAGIC* mg
p |int |magic_setvec |SV* sv|MAGIC* mg
+p |int |magic_setutf8 |SV* sv|MAGIC* mg
p |int |magic_set_all_env|SV* sv|MAGIC* mg
p |U32 |magic_sizepack |SV* sv|MAGIC* mg
p |int |magic_wipepack |SV* sv|MAGIC* mg
# if defined(USE_ITHREADS)
s |SV* |gv_share |SV *sv|CLONE_PARAMS *param
# endif
+s |bool |utf8_mg_pos |SV *sv|MAGIC **mgp|STRLEN **cachep|I32 i|I32 *offsetp|I32 uoff|U8 **sp|U8 *start|U8 *send
+s |bool |utf8_mg_pos_init |SV *sv|MAGIC **mgp|STRLEN **cachep|I32 i|I32 *offsetp|U8 *s|U8 *start
#if defined(PERL_COPY_ON_WRITE)
sM |void |sv_release_COW |SV *sv|char *pvx|STRLEN cur|STRLEN len \
|U32 hash|SV *after
#define magic_settaint Perl_magic_settaint
#define magic_setuvar Perl_magic_setuvar
#define magic_setvec Perl_magic_setvec
+#define magic_setutf8 Perl_magic_setutf8
#define magic_set_all_env Perl_magic_set_all_env
#define magic_sizepack Perl_magic_sizepack
#define magic_wipepack Perl_magic_wipepack
# if defined(USE_ITHREADS)
#define gv_share S_gv_share
# endif
+#define utf8_mg_pos S_utf8_mg_pos
+#define utf8_mg_pos_init S_utf8_mg_pos_init
#if defined(PERL_COPY_ON_WRITE)
#define sv_release_COW S_sv_release_COW
#endif
#define magic_settaint(a,b) Perl_magic_settaint(aTHX_ a,b)
#define magic_setuvar(a,b) Perl_magic_setuvar(aTHX_ a,b)
#define magic_setvec(a,b) Perl_magic_setvec(aTHX_ a,b)
+#define magic_setutf8(a,b) Perl_magic_setutf8(aTHX_ a,b)
#define magic_set_all_env(a,b) Perl_magic_set_all_env(aTHX_ a,b)
#define magic_sizepack(a,b) Perl_magic_sizepack(aTHX_ a,b)
#define magic_wipepack(a,b) Perl_magic_wipepack(aTHX_ a,b)
# if defined(USE_ITHREADS)
#define gv_share(a,b) S_gv_share(aTHX_ a,b)
# endif
+#define utf8_mg_pos(a,b,c,d,e,f,g,h,i) S_utf8_mg_pos(aTHX_ a,b,c,d,e,f,g,h,i)
+#define utf8_mg_pos_init(a,b,c,d,e,f,g) S_utf8_mg_pos_init(aTHX_ a,b,c,d,e,f,g)
#if defined(PERL_COPY_ON_WRITE)
#define sv_release_COW(a,b,c,d,e,f) S_sv_release_COW(aTHX_ a,b,c,d,e,f)
#endif
}
#endif /* USE_LOCALE_COLLATE */
+/* Just clear the UTF-8 cache data. */
+int
+Perl_magic_setutf8(pTHX_ SV *sv, MAGIC *mg)
+{
+ Safefree(mg->mg_ptr); /* The mg_ptr holds the pos cache. */
+ mg->mg_ptr = 0;
+ mg->mg_len = -1; /* The mg_len holds the len cache. */
+ return 0;
+}
+
int
Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
{
#define PERL_MAGIC_uvar_elem 'u' /* Reserved for use by extensions */
#define PERL_MAGIC_vec 'v' /* vec() lvalue */
#define PERL_MAGIC_vstring 'V' /* SV was vstring literal */
+#define PERL_MAGIC_utf8 'w' /* Cached UTF-8 information */
#define PERL_MAGIC_substr 'x' /* substr() lvalue */
#define PERL_MAGIC_defelem 'y' /* Shadow "foreach" iterator variable /
smart parameter vivification */
want_vtbl_amagicelem,
want_vtbl_regdata,
want_vtbl_regdatum,
- want_vtbl_backref
+ want_vtbl_backref,
+ want_vtbl_utf8
};
/* Note: the lowest 8 bits are reserved for
EXT MGVTBL PL_vtbl_ovrld = {0, 0,
0, 0, MEMBER_TO_FPTR(Perl_magic_freeovrld)};
+EXT MGVTBL PL_vtbl_utf8 = {0,
+ MEMBER_TO_FPTR(Perl_magic_setutf8),
+ 0, 0, 0};
+
#else /* !DOINIT */
EXT MGVTBL PL_vtbl_sv;
EXT MGVTBL PL_vtbl_amagicelem;
EXT MGVTBL PL_vtbl_backref;
+EXT MGVTBL PL_vtbl_utf8;
#endif /* !DOINIT */
# define PIPESOCK_MODE
#endif
+#define PERL_MAGIC_UTF8_CACHESIZE 2
+
/* and finally... */
#define PERL_PATCHLEVEL_H_IMPLICIT
#include "patchlevel.h"
stored within the hash (as in the case of tied hashes). Otherwise it can
be dereferenced to get the original C<SV*>. Note that the caller is
responsible for suitably incrementing the reference count of C<val> before
-the call, and decrementing it if the function returned NULL.
+the call, and decrementing it if the function returned NULL. Effectively
+a successful hv_store takes ownership of one reference to C<val>. This is
+usually what you want; a newly created SV has a reference count of one, so
+if all your code does is create SVs then store them in a hash, hv_store
+will own the only reference to the new SV, and your code doesn't need to do
+anything further to tidy up. hv_store is not implemented as a call to
+hv_store_ent, and does not create a temporary SV for the key, so if your
+key data is not already in SV form then use hv_store in preference to
+hv_store_ent.
See L<perlguts/"Understanding the Magic of Tied Hashes and Arrays"> for more
information on how to use this function on tied hashes.
contents of the return value can be accessed using the C<He?> macros
described here. Note that the caller is responsible for suitably
incrementing the reference count of C<val> before the call, and
-decrementing it if the function returned NULL.
+decrementing it if the function returned NULL. Effectively a successful
+hv_store_ent takes ownership of one reference to C<val>. This is
+usually what you want; a newly created SV has a reference count of one, so
+if all your code does is create SVs then store them in a hash, hv_store
+will own the only reference to the new SV, and your code doesn't need to do
+anything further to tidy up. Note that hv_store_ent only reads the C<key>;
+unlike C<val> it does not take ownership of it, so maintaining the correct
+reference count on C<key> is entirely the caller's responsibility. hv_store
+is not implemented as a call to hv_store_ent, and does not create a temporary
+SV for the key, so if your key data is not already in SV form then use
+hv_store in preference to hv_store_ent.
See L<perlguts/"Understanding the Magic of Tied Hashes and Arrays"> for more
information on how to use this function on tied hashes.
PERL_CALLCONV int Perl_magic_settaint(pTHX_ SV* sv, MAGIC* mg);
PERL_CALLCONV int Perl_magic_setuvar(pTHX_ SV* sv, MAGIC* mg);
PERL_CALLCONV int Perl_magic_setvec(pTHX_ SV* sv, MAGIC* mg);
+PERL_CALLCONV int Perl_magic_setutf8(pTHX_ SV* sv, MAGIC* mg);
PERL_CALLCONV int Perl_magic_set_all_env(pTHX_ SV* sv, MAGIC* mg);
PERL_CALLCONV U32 Perl_magic_sizepack(pTHX_ SV* sv, MAGIC* mg);
PERL_CALLCONV int Perl_magic_wipepack(pTHX_ SV* sv, MAGIC* mg);
# if defined(USE_ITHREADS)
STATIC SV* S_gv_share(pTHX_ SV *sv, CLONE_PARAMS *param);
# endif
+STATIC bool S_utf8_mg_pos(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, I32 uoff, U8 **sp, U8 *start, U8 *send);
+STATIC bool S_utf8_mg_pos_init(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, U8 *s, U8 *start);
#if defined(PERL_COPY_ON_WRITE)
STATIC void S_sv_release_COW(pTHX_ SV *sv, char *pvx, STRLEN cur, STRLEN len, U32 hash, SV *after);
#endif
case PERL_MAGIC_vstring:
vtable = 0;
break;
+ case PERL_MAGIC_utf8:
+ vtable = &PL_vtbl_utf8;
+ break;
case PERL_MAGIC_substr:
vtable = &PL_vtbl_substr;
break;
Safefree(mg->mg_ptr);
else if (mg->mg_len == HEf_SVKEY)
SvREFCNT_dec((SV*)mg->mg_ptr);
+ else if (mg->mg_type == PERL_MAGIC_utf8 && mg->mg_ptr)
+ Safefree(mg->mg_ptr);
}
if (mg->mg_flags & MGf_REFCOUNTED)
SvREFCNT_dec(mg->mg_obj);
=cut
*/
+/*
+ * The length is cached in PERL_UTF8_magic, in the mg_len field. Also the
+ * mg_ptr is used, by sv_pos_u2b(), see the comments of S_utf8_mg_pos_init().
+ * (Note that the mg_len is not the length of the mg_ptr field.)
+ *
+ */
+
STRLEN
Perl_sv_len_utf8(pTHX_ register SV *sv)
{
return mg_length(sv);
else
{
- STRLEN len;
+ STRLEN len, ulen;
U8 *s = (U8*)SvPV(sv, len);
+ MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : 0;
- return Perl_utf8_length(aTHX_ s, s + len);
+ if (mg && mg->mg_len != -1 && (mg->mg_len > 0 || len == 0))
+ ulen = mg->mg_len;
+ else {
+ ulen = Perl_utf8_length(aTHX_ s, s + len);
+ if (!mg && !SvREADONLY(sv)) {
+ sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0);
+ mg = mg_find(sv, PERL_MAGIC_utf8);
+ assert(mg);
+ }
+ if (mg)
+ mg->mg_len = ulen;
+ }
+ return ulen;
}
}
+/* S_utf8_mg_pos_init() is used to initialize the mg_ptr field of
+ * a PERL_UTF8_magic. The mg_ptr is used to store the mapping
+ * between UTF-8 and byte offsets. There are two (substr offset and substr
+ * length, the i offset, PERL_MAGIC_UTF8_CACHESIZE) times two (UTF-8 offset
+ * and byte offset) cache positions.
+ *
+ * The mg_len field is used by sv_len_utf8(), see its comments.
+ * Note that the mg_len is not the length of the mg_ptr field.
+ *
+ */
+STATIC bool
+S_utf8_mg_pos_init(SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, U8 *s, U8 *start)
+{
+ bool found = FALSE;
+
+ if (SvMAGICAL(sv) && !SvREADONLY(sv)) {
+ if (!*mgp) {
+ sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0);
+ *mgp = mg_find(sv, PERL_MAGIC_utf8);
+ }
+ assert(*mgp);
+
+ if ((*mgp)->mg_ptr)
+ *cachep = (STRLEN *) (*mgp)->mg_ptr;
+ else {
+ Newz(0, *cachep, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN);
+ (*mgp)->mg_ptr = (char *) *cachep;
+ }
+ assert(*cachep);
+
+ (*cachep)[i] = *offsetp;
+ (*cachep)[i+1] = s - start;
+ found = TRUE;
+ }
+
+ return found;
+}
+
+/*
+ * S_utf8_mg_pos() is used to query and update mg_ptr field of
+ * a PERL_UTF8_magic. The mg_ptr is used to store the mapping
+ * between UTF-8 and byte offsets. See also the comments of
+ * S_utf8_mg_pos_init().
+ *
+ */
+STATIC bool
+S_utf8_mg_pos(SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, I32 uoff, U8 **sp, U8 *start, U8 *send)
+{
+ bool found = FALSE;
+
+ if (SvMAGICAL(sv) && !SvREADONLY(sv)) {
+ if (!*mgp)
+ *mgp = mg_find(sv, PERL_MAGIC_utf8);
+ if (*mgp && (*mgp)->mg_ptr) {
+ *cachep = (STRLEN *) (*mgp)->mg_ptr;
+ if ((*cachep)[i] == uoff) /* An exact match. */
+ found = TRUE;
+ else { /* We will skip to the right spot. */
+ STRLEN forw = 0;
+ STRLEN backw = 0;
+ U8* p = NULL;
+
+ /* The assumption is that going backward is half
+ * the speed of going forward (that's where the
+ * 2 * backw in the below comes from). (The real
+ * figure of course depends on the UTF-8 data.) */
+
+ if ((*cachep)[i] > uoff) {
+ forw = uoff;
+ backw = (*cachep)[i] - uoff;
+
+ if (forw < 2 * backw)
+ p = start;
+ else
+ p = start + (*cachep)[i+1];
+ }
+ /* Try this only for the substr offset (i == 0),
+ * not for the substr length (i == 2). */
+ else if (i == 0) { /* (*cachep)[i] < uoff */
+ STRLEN ulen = sv_len_utf8(sv);
+
+ if (uoff < ulen) {
+ forw = uoff - (*cachep)[i];
+ backw = ulen - uoff;
+
+ if (forw < 2 * backw)
+ p = start + (*cachep)[i+1];
+ else
+ p = send;
+ }
+
+ /* If the string is not long enough for uoff,
+ * we could extend it, but not at this low a level. */
+ }
+
+ if (p) {
+ if (forw < 2 * backw) {
+ while (forw--)
+ p += UTF8SKIP(p);
+ }
+ else {
+ while (backw--) {
+ p--;
+ while (UTF8_IS_CONTINUATION(*p))
+ p--;
+ }
+ }
+
+ /* Update the cache. */
+ (*cachep)[i] = uoff;
+ (*cachep)[i+1] = p - start;
+
+ found = TRUE;
+ }
+ }
+ if (found) { /* Setup the return values. */
+ *offsetp = (*cachep)[i+1];
+ *sp = start + *offsetp;
+ if (*sp >= send) {
+ *sp = send;
+ *offsetp = send - start;
+ }
+ else if (*sp < start) {
+ *sp = start;
+ *offsetp = 0;
+ }
+ }
+ }
+ }
+ return found;
+}
+
/*
=for apidoc sv_pos_u2b
=cut
*/
+/*
+ * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential
+ * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and
+ * byte offsets. See also the comments of S_utf8_mg_pos().
+ *
+ */
+
void
Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp)
{
U8 *start;
U8 *s;
- U8 *send;
- I32 uoffset = *offsetp;
STRLEN len;
+ STRLEN *cache = 0;
+ STRLEN boffset = 0;
if (!sv)
return;
start = s = (U8*)SvPV(sv, len);
- send = s + len;
- while (s < send && uoffset--)
- s += UTF8SKIP(s);
- if (s >= send)
- s = send;
- *offsetp = s - start;
- if (lenp) {
- I32 ulen = *lenp;
- start = s;
- while (s < send && ulen--)
- s += UTF8SKIP(s);
- if (s >= send)
- s = send;
- *lenp = s - start;
+ if (len) {
+ I32 uoffset = *offsetp;
+ U8 *send = s + len;
+ MAGIC *mg = 0;
+ bool found = FALSE;
+
+ if (S_utf8_mg_pos(sv, &mg, &cache, 0, offsetp, *offsetp, &s, start, send))
+ found = TRUE;
+ if (!found && uoffset > 0) {
+ while (s < send && uoffset--)
+ s += UTF8SKIP(s);
+ if (s >= send)
+ s = send;
+ if (S_utf8_mg_pos_init(sv, &mg, &cache, 0, offsetp, s, start))
+ boffset = cache[1];
+ *offsetp = s - start;
+ }
+ if (lenp) {
+ found = FALSE;
+ start = s;
+ if (S_utf8_mg_pos(sv, &mg, &cache, 2, lenp, *lenp + *offsetp, &s, start, send)) {
+ *lenp -= boffset;
+ found = TRUE;
+ }
+ if (!found && *lenp > 0) {
+ I32 ulen = *lenp;
+ if (ulen > 0)
+ while (s < send && ulen--)
+ s += UTF8SKIP(s);
+ if (s >= send)
+ s = send;
+ if (S_utf8_mg_pos_init(sv, &mg, &cache, 2, lenp, s, start))
+ cache[2] += *offsetp;
+ }
+ *lenp = s - start;
+ }
+ }
+ else {
+ *offsetp = 0;
+ if (lenp)
+ *lenp = 0;
}
return;
}
=cut
*/
+/*
+ * sv_pos_b2u() uses, like sv_pos_u2b(), the mg_ptr of the potential
+ * PERL_UTF8_magic of the sv to store the mapping between UTF-8 and
+ * byte offsets. See also the comments of S_utf8_mg_pos().
+ *
+ */
+
void
-Perl_sv_pos_b2u(pTHX_ register SV *sv, I32* offsetp)
+Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
{
- U8 *s;
- U8 *send;
+ U8* s;
STRLEN len;
if (!sv)
s = (U8*)SvPV(sv, len);
if ((I32)len < *offsetp)
Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
- send = s + *offsetp;
- len = 0;
- while (s < send) {
- STRLEN n = 1;
- /* Call utf8n_to_uvchr() to validate the sequence
- * (unless a simple non-UTF character) */
- if (!UTF8_IS_INVARIANT(*s))
- utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0);
- if (n > 0) {
- s += n;
- len++;
+ else {
+ U8* send = s + *offsetp;
+ MAGIC* mg = NULL;
+ STRLEN *cache = NULL;
+
+ len = 0;
+
+ if (SvMAGICAL(sv) && !SvREADONLY(sv)) {
+ mg = mg_find(sv, PERL_MAGIC_utf8);
+ if (mg && mg->mg_ptr) {
+ cache = (STRLEN *) mg->mg_ptr;
+ if (cache[1] == *offsetp) {
+ /* An exact match. */
+ *offsetp = cache[0];
+
+ return;
+ }
+ else if (cache[1] < *offsetp) {
+ /* We already know part of the way. */
+ len = cache[0];
+ s += cache[1];
+ /* Let the below loop do the rest. */
+ }
+ else { /* cache[1] > *offsetp */
+ /* We already know all of the way, now we may
+ * be able to walk back. The same assumption
+ * is made as in S_utf8_mg_pos(), namely that
+ * walking backward is twice slower than
+ * walking forward. */
+ STRLEN forw = *offsetp;
+ STRLEN backw = cache[1] - *offsetp;
+
+ if (!(forw < 2 * backw)) {
+ U8 *p = s + cache[1];
+ STRLEN ubackw = 0;
+
+ while (backw--) {
+ p--;
+ while (UTF8_IS_CONTINUATION(*p))
+ p--;
+ ubackw++;
+ }
+
+ cache[0] -= ubackw;
+ cache[1] -= backw;
+
+ return;
+ }
+ }
+ }
}
- else
- break;
+
+ while (s < send) {
+ STRLEN n = 1;
+
+ /* Call utf8n_to_uvchr() to validate the sequence
+ * (unless a simple non-UTF character) */
+ if (!UTF8_IS_INVARIANT(*s))
+ utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0);
+ if (n > 0) {
+ s += n;
+ len++;
+ }
+ else
+ break;
+ }
+
+ if (!SvREADONLY(sv)) {
+ if (!mg) {
+ sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0);
+ mg = mg_find(sv, PERL_MAGIC_utf8);
+ }
+ assert(mg);
+
+ if (!mg->mg_ptr) {
+ Newz(0, cache, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN);
+ mg->mg_ptr = (char *) cache;
+ }
+ assert(cache);
+
+ cache[0] = len;
+ cache[1] = *offsetp;
+ }
+
+ *offsetp = len;
}
- *offsetp = len;
return;
}
case want_vtbl_backref:
result = &PL_vtbl_backref;
break;
+ case want_vtbl_utf8:
+ result = &PL_vtbl_utf8;
+ break;
}
return result;
}