From: Nicholas Clark Date: Sat, 6 Apr 2002 00:21:17 +0000 (+0100) Subject: Re: the dirty half dozen (Re: perl@15662) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=19692e8d256164f96817d6df6ecee26c3cda4ae9;p=p5sagit%2Fp5-mst-13.2.git Re: the dirty half dozen (Re: perl@15662) Message-ID: <20020405232117.GE323@Bagpuss.unfortu.net> (with the last one reversed) p4raw-id: //depot/perl@15757 --- diff --git a/doop.c b/doop.c index 20379a9..51c2248 100644 --- a/doop.c +++ b/doop.c @@ -1338,15 +1338,6 @@ Perl_do_kv(pTHX) SPAGAIN; if (dokeys) { SV* sv = hv_iterkeysv(entry); - if (HvUTF8KEYS((SV*)hv) && !DO_UTF8(sv)) { - STRLEN len, i; - char* s = SvPV(sv, len); - for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++); - if (i < len) { - sv = newSVsv(sv); - sv_utf8_upgrade(sv); - } - } XPUSHs(sv); /* won't clobber stack_sp */ } if (dovalues) { diff --git a/dump.c b/dump.c index 48a3b38..240d1c2 100644 --- a/dump.c +++ b/dump.c @@ -980,7 +980,7 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, I32 nest, I32 maxnest, bo case SVt_PVHV: if (HvSHAREKEYS(sv)) sv_catpv(d, "SHAREKEYS,"); if (HvLAZYDEL(sv)) sv_catpv(d, "LAZYDEL,"); - if (HvUTF8KEYS(sv)) sv_catpv(d, "UTF8,"); + if (HvHASKFLAGS(sv)) sv_catpv(d, "HASKFLAGS,"); break; case SVt_PVGV: if (GvINTRO(sv)) sv_catpv(d, "INTRO,"); diff --git a/embed.fnc b/embed.fnc index 5cf2630..a5b29c2 100644 --- a/embed.fnc +++ b/embed.fnc @@ -986,8 +986,14 @@ s |void |hfreeentries |HV *hv s |void |more_he s |HE* |new_he s |void |del_he |HE *p -s |HEK* |save_hek |const char *str|I32 len|U32 hash +s |HEK* |save_hek_flags |const char *str|I32 len|U32 hash|int flags s |void |hv_magic_check |HV *hv|bool *needs_copy|bool *needs_store +s |void |unshare_hek_or_pvn|HEK* hek|const char* sv|I32 len|U32 hash +s |HEK* |share_hek_flags|const char* sv|I32 len|U32 hash|int flags +s |SV** |hv_store_flags |HV* tb|const char* key|I32 klen|SV* val \ + |U32 hash|int flags +s |SV** |hv_fetch_flags |HV* tb|const char* key|I32 
klen|I32 lval \ + |int flags #endif #if defined(PERL_IN_MG_C) || defined(PERL_DECL_PROT) diff --git a/ext/Devel/Peek/Peek.t b/ext/Devel/Peek/Peek.t index f577369..a44e2b0 100644 --- a/ext/Devel/Peek/Peek.t +++ b/ext/Devel/Peek/Peek.t @@ -373,7 +373,7 @@ do_test(19, RV = $ADDR SV = PVHV\\($ADDR\\) at $ADDR REFCNT = 2 - FLAGS = \\(SHAREKEYS,UTF8\\) + FLAGS = \\(SHAREKEYS,HASKFLAGS\\) UV = 1 NV = 0 ARRAY = $ADDR \\(0:7, 1:1\\) diff --git a/hv.c b/hv.c index f92e31e..d9f640b 100644 --- a/hv.c +++ b/hv.c @@ -74,16 +74,10 @@ S_more_he(pTHX) #endif STATIC HEK * -S_save_hek(pTHX_ const char *str, I32 len, U32 hash) +S_save_hek_flags(pTHX_ const char *str, I32 len, U32 hash, int flags) { char *k; register HEK *hek; - bool is_utf8 = FALSE; - - if (len < 0) { - len = -len; - is_utf8 = TRUE; - } New(54, k, HEK_BASESIZE + len + 2, char); hek = (HEK*)k; @@ -91,17 +85,10 @@ S_save_hek(pTHX_ const char *str, I32 len, U32 hash) HEK_KEY(hek)[len] = 0; HEK_LEN(hek) = len; HEK_HASH(hek) = hash; - HEK_UTF8(hek) = (char)is_utf8; + HEK_FLAGS(hek) = (unsigned char)flags; return hek; } -void -Perl_unshare_hek(pTHX_ HEK *hek) -{ - unsharepvn(HEK_KEY(hek),HEK_UTF8(hek)?-HEK_LEN(hek):HEK_LEN(hek), - HEK_HASH(hek)); -} - #if defined(USE_ITHREADS) HE * Perl_he_dup(pTHX_ HE *e, bool shared, CLONE_PARAMS* param) @@ -123,20 +110,22 @@ Perl_he_dup(pTHX_ HE *e, bool shared, CLONE_PARAMS* param) if (HeKLEN(e) == HEf_SVKEY) HeKEY_sv(ret) = SvREFCNT_inc(sv_dup(HeKEY_sv(e), param)); else if (shared) - HeKEY_hek(ret) = share_hek(HeKEY(e), HeKLEN_UTF8(e), HeHASH(e)); + HeKEY_hek(ret) = share_hek_flags(HeKEY(e), HeKLEN(e), HeHASH(e), + HeKFLAGS(e)); else - HeKEY_hek(ret) = save_hek(HeKEY(e), HeKLEN_UTF8(e), HeHASH(e)); + HeKEY_hek(ret) = save_hek_flags(HeKEY(e), HeKLEN(e), HeHASH(e), + HeKFLAGS(e)); HeVAL(ret) = SvREFCNT_inc(sv_dup(HeVAL(e), param)); return ret; } #endif /* USE_ITHREADS */ static void -Perl_hv_notallowed(pTHX_ bool is_utf8, const char *key, I32 klen, - const char *keysave, const char *msg) 
+Perl_hv_notallowed(pTHX_ int flags, const char *key, I32 klen, + const char *msg) { SV *sv = sv_newmortal(); - if (key == keysave) { + if (!(flags & HVhek_FREEKEY)) { sv_setpvn(sv, key, klen); } else { @@ -144,7 +133,7 @@ Perl_hv_notallowed(pTHX_ bool is_utf8, const char *key, I32 klen, SV *sv = sv_newmortal(); sv_usepvn(sv, (char *) key, klen); } - if (is_utf8) { + if (flags & HVhek_UTF8) { SvUTF8_on(sv); } Perl_croak(aTHX_ msg, sv); @@ -167,28 +156,60 @@ information on how to use this function on tied hashes. =cut */ + SV** Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) { - register XPVHV* xhv; - register U32 hash; - register HE *entry; - SV *sv; bool is_utf8 = FALSE; const char *keysave = key; - - if (!hv) - return 0; + int flags = 0; if (klen < 0) { klen = -klen; is_utf8 = TRUE; } + if (is_utf8) { + STRLEN tmplen = klen; + /* Just casting the &klen to (STRLEN) won't work well + * if STRLEN and I32 are of different widths. --jhi */ + key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8); + klen = tmplen; + /* If we were able to downgrade here, then than means that we were + passed in a key which only had chars 0-255, but was utf8 encoded. */ + if (is_utf8) + flags = HVhek_UTF8; + /* If we found we were able to downgrade the string to bytes, then + we should flag that it needs upgrading on keys or each. */ + if (key != keysave) + flags |= HVhek_WASUTF8 | HVhek_FREEKEY; + } + + return hv_fetch_flags (hv, key, klen, lval, flags); +} + +SV** +S_hv_fetch_flags(pTHX_ HV *hv, const char *key, I32 klen, I32 lval, int flags) +{ + register XPVHV* xhv; + register U32 hash; + register HE *entry; + SV *sv; + + if (!hv) + return 0; + if (SvRMAGICAL(hv)) { + /* All this clause seems to be utf8 unaware. + By moving the utf8 stuff out to hv_fetch_flags I need to ensure + key doesn't leak. I've not tried solving the utf8-ness. + NWC. 
+ */ if (mg_find((SV*)hv, PERL_MAGIC_tied) || SvGMAGICAL((SV*)hv)) { sv = sv_newmortal(); mg_copy((SV*)hv, sv, key, klen); + if (flags & HVhek_FREEKEY) + Safefree(key); PL_hv_fetch_sv = sv; return &PL_hv_fetch_sv; } @@ -199,8 +220,11 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) if (isLOWER(key[i])) { char *nkey = strupr(SvPVX(sv_2mortal(newSVpvn(key,klen)))); SV **ret = hv_fetch(hv, nkey, klen, 0); - if (!ret && lval) - ret = hv_store(hv, key, klen, NEWSV(61,0), 0); + if (!ret && lval) { + ret = hv_store_flags(hv, key, klen, NEWSV(61,0), 0, + flags); + } else if (flags & HVhek_FREEKEY) + Safefree(key); return ret; } } @@ -219,16 +243,11 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) Newz(503, xhv->xhv_array /* HvARRAY(hv) */, PERL_HV_ARRAY_ALLOC_BYTES(xhv->xhv_max+1 /* HvMAX(hv)+1 */), char); - else + else { + if (flags & HVhek_FREEKEY) + Safefree(key); return 0; - } - - if (is_utf8) { - STRLEN tmplen = klen; - /* Just casting the &klen to (STRLEN) won't work well - * if STRLEN and I32 are of different widths. --jhi */ - key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8); - klen = tmplen; + } } PERL_HASH(hash, key, klen); @@ -242,10 +261,30 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) continue; if (HeKEY(entry) != key && memNE(HeKEY(entry),key,klen)) /* is this it? */ continue; - if (HeKUTF8(entry) != (char)is_utf8) + /* flags is 0 if not utf8. need HeKFLAGS(entry) also 0. + flags is 1 if utf8. need HeKFLAGS(entry) also 1. + xor is true if bits differ, in which case this isn't a match. */ + if ((HeKFLAGS(entry) ^ flags) & HVhek_UTF8) continue; - if (key != keysave) - Safefree(key); + if (lval && HeKFLAGS(entry) != flags) { + /* We match if HVhek_UTF8 bit in our flags and hash key's match. + But if entry was set previously with HVhek_WASUTF8 and key now + doesn't (or vice versa) then we should change the key's flag, + as this is assignment. 
*/ + if (HvSHAREKEYS(hv)) { + /* Need to swap the key we have for a key with the flags we + need. As keys are shared we can't just write to the flag, + so we share the new one, unshare the old one. */ + int flags_nofree = flags & ~HVhek_FREEKEY; + HEK *new_hek = share_hek_flags(key, klen, hash, flags_nofree); + unshare_hek (HeKEY_hek(entry)); + HeKEY_hek(entry) = new_hek; + } + else + HeKFLAGS(entry) = flags; + } + if (flags & HVhek_FREEKEY) + Safefree(key); /* if we find a placeholder, we pretend we haven't found anything */ if (HeVAL(entry) == &PL_sv_undef) break; @@ -266,22 +305,16 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval) } #endif if (!entry && SvREADONLY(hv)) { - Perl_hv_notallowed(aTHX_ is_utf8, key, klen, keysave, + Perl_hv_notallowed(aTHX_ flags, key, klen, "Attempt to access disallowed key '%"SVf"' in a fixed hash" ); } if (lval) { /* gonna assign to this, so it better be there */ sv = NEWSV(61,0); - if (key != keysave) { /* must be is_utf8 == 0 */ - SV **ret = hv_store(hv,key,klen,sv,hash); - Safefree(key); - return ret; - } - else - return hv_store(hv,key,is_utf8?-klen:klen,sv,hash); + return hv_store_flags(hv,key,klen,sv,hash,flags); } - if (key != keysave) - Safefree(key); + if (flags & HVhek_FREEKEY) + Safefree(key); return 0; } @@ -313,6 +346,7 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash) register HE *entry; SV *sv; bool is_utf8; + int flags = 0; char *keysave; if (!hv) @@ -366,8 +400,13 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash) keysave = key = SvPV(keysv, klen); is_utf8 = (SvUTF8(keysv)!=0); - if (is_utf8) + if (is_utf8) { key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); + if (is_utf8) + flags = HVhek_UTF8; + if (key != keysave) + flags |= HVhek_WASUTF8 | HVhek_FREEKEY; + } if (!hash) PERL_HASH(hash, key, klen); @@ -381,8 +420,25 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash) continue; if (HeKEY(entry) != key && 
memNE(HeKEY(entry),key,klen)) /* is this it? */ continue; - if (HeKUTF8(entry) != (char)is_utf8) + if ((HeKFLAGS(entry) ^ flags) & HVhek_UTF8) continue; + if (lval && HeKFLAGS(entry) != flags) { + /* We match if HVhek_UTF8 bit in our flags and hash key's match. + But if entry was set previously with HVhek_WASUTF8 and key now + doesn't (or vice versa) then we should change the key's flag, + as this is assignment. */ + if (HvSHAREKEYS(hv)) { + /* Need to swap the key we have for a key with the flags we + need. As keys are shared we can't just write to the flag, + so we share the new one, unshare the old one. */ + int flags_nofree = flags & ~HVhek_FREEKEY; + HEK *new_hek = share_hek_flags(key, klen, hash, flags_nofree); + unshare_hek (HeKEY_hek(entry)); + HeKEY_hek(entry) = new_hek; + } + else + HeKFLAGS(entry) = flags; + } if (key != keysave) Safefree(key); /* if we find a placeholder, we pretend we haven't found anything */ @@ -402,11 +458,11 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash) } #endif if (!entry && SvREADONLY(hv)) { - Perl_hv_notallowed(aTHX_ is_utf8, key, klen, keysave, + Perl_hv_notallowed(aTHX_ flags, key, klen, "Attempt to access disallowed key '%"SVf"' in a fixed hash" ); } - if (key != keysave) + if (flags & HVhek_FREEKEY) Safefree(key); if (lval) { /* gonna assign to this, so it better be there */ sv = NEWSV(61,0); @@ -453,23 +509,43 @@ information on how to use this function on tied hashes. */ SV** -Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 hash) +Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, U32 hash) +{ + bool is_utf8 = FALSE; + const char *keysave = key; + int flags = 0; + + if (is_utf8) { + STRLEN tmplen = klen; + /* Just casting the &klen to (STRLEN) won't work well + * if STRLEN and I32 are of different widths. 
--jhi */ + key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8); + klen = tmplen; + /* If we were able to downgrade here, then than means that we were + passed in a key which only had chars 0-255, but was utf8 encoded. */ + if (is_utf8) + flags = HVhek_UTF8; + /* If we found we were able to downgrade the string to bytes, then + we should flag that it needs upgrading on keys or each. */ + if (key != keysave) + flags |= HVhek_WASUTF8 | HVhek_FREEKEY; + } + + return hv_store_flags (hv, key, klen, val, hash, flags); +} + +SV** +S_hv_store_flags(pTHX_ HV *hv, const char *key, I32 klen, SV *val, + register U32 hash, int flags) { register XPVHV* xhv; register I32 i; register HE *entry; register HE **oentry; - bool is_utf8 = FALSE; - const char *keysave = key; if (!hv) return 0; - if (klen < 0) { - klen = -klen; - is_utf8 = TRUE; - } - xhv = (XPVHV*)SvANY(hv); if (SvMAGICAL(hv)) { bool needs_copy; @@ -477,8 +553,11 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has hv_magic_check (hv, &needs_copy, &needs_store); if (needs_copy) { mg_copy((SV*)hv, val, key, klen); - if (!xhv->xhv_array /* !HvARRAY */ && !needs_store) + if (!xhv->xhv_array /* !HvARRAY */ && !needs_store) { + if (flags & HVhek_FREEKEY) + Safefree(key); return 0; + } #ifdef ENV_IS_CASELESS else if (mg_find((SV*)hv, PERL_MAGIC_env)) { key = savepvn(key,klen); @@ -489,13 +568,8 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has } } - if (is_utf8) { - STRLEN tmplen = klen; - /* See the note in hv_fetch(). --jhi */ - key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8); - klen = tmplen; - HvUTF8KEYS_on((SV*)hv); - } + if (flags) + HvHASKFLAGS_on((SV*)hv); if (!hash) PERL_HASH(hash, key, klen); @@ -516,31 +590,49 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has continue; if (HeKEY(entry) != key && memNE(HeKEY(entry),key,klen)) /* is this it? 
*/ continue; - if (HeKUTF8(entry) != (char)is_utf8) + if ((HeKFLAGS(entry) ^ flags) & HVhek_UTF8) continue; if (HeVAL(entry) == &PL_sv_undef) xhv->xhv_placeholders--; /* yes, can store into placeholder slot */ else SvREFCNT_dec(HeVAL(entry)); HeVAL(entry) = val; - if (key != keysave) - Safefree(key); + + if (HeKFLAGS(entry) != flags) { + /* We match if HVhek_UTF8 bit in our flags and hash key's match. + But if entry was set previously with HVhek_WASUTF8 and key now + doesn't (or vice versa) then we should change the key's flag, + as this is assignment. */ + if (HvSHAREKEYS(hv)) { + /* Need to swap the key we have for a key with the flags we + need. As keys are shared we can't just write to the flag, + so we share the new one, unshare the old one. */ + int flags_nofree = flags & ~HVhek_FREEKEY; + HEK *new_hek = share_hek_flags(key, klen, hash, flags_nofree); + unshare_hek (HeKEY_hek(entry)); + HeKEY_hek(entry) = new_hek; + } + else + HeKFLAGS(entry) = flags; + } + if (flags & HVhek_FREEKEY) + Safefree(key); return &HeVAL(entry); } if (SvREADONLY(hv)) { - Perl_hv_notallowed(aTHX_ is_utf8, key, klen, keysave, + Perl_hv_notallowed(aTHX_ flags, key, klen, "Attempt to access disallowed key '%"SVf"' to a fixed hash" ); } entry = new_HE(); + /* share_hek_flags will do the free for us. This might be considered + bad API design. */ if (HvSHAREKEYS(hv)) - HeKEY_hek(entry) = share_hek(key, is_utf8?-klen:klen, hash); + HeKEY_hek(entry) = share_hek_flags(key, klen, hash, flags); else /* gotta do the real thing */ - HeKEY_hek(entry) = save_hek(key, is_utf8?-klen:klen, hash); - if (key != keysave) - Safefree(key); + HeKEY_hek(entry) = save_hek_flags(key, klen, hash, flags); HeVAL(entry) = val; HeNEXT(entry) = *oentry; *oentry = entry; @@ -575,15 +667,16 @@ information on how to use this function on tied hashes. 
*/ HE * -Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash) +Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, U32 hash) { - register XPVHV* xhv; - register char *key; + XPVHV* xhv; + char *key; STRLEN klen; - register I32 i; - register HE *entry; - register HE **oentry; + I32 i; + HE *entry; + HE **oentry; bool is_utf8; + int flags = 0; char *keysave; if (!hv) @@ -619,7 +712,11 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash) if (is_utf8) { key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); - HvUTF8KEYS_on((SV*)hv); + if (is_utf8) + flags = HVhek_UTF8; + if (key != keysave) + flags |= HVhek_WASUTF8 | HVhek_FREEKEY; + HvHASKFLAGS_on((SV*)hv); } if (!hash) @@ -633,39 +730,56 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash) /* oentry = &(HvARRAY(hv))[hash & (I32) HvMAX(hv)]; */ oentry = &((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max]; i = 1; - - for (entry = *oentry; entry; i=0, entry = HeNEXT(entry)) { + entry = *oentry; + for (; entry; i=0, entry = HeNEXT(entry)) { if (HeHASH(entry) != hash) /* strings can't be equal */ continue; if (HeKLEN(entry) != klen) continue; if (HeKEY(entry) != key && memNE(HeKEY(entry),key,klen)) /* is this it? */ continue; - if (HeKUTF8(entry) != (char)is_utf8) + if ((HeKFLAGS(entry) ^ flags) & HVhek_UTF8) continue; if (HeVAL(entry) == &PL_sv_undef) xhv->xhv_placeholders--; /* yes, can store into placeholder slot */ else SvREFCNT_dec(HeVAL(entry)); HeVAL(entry) = val; - if (key != keysave) + if (HeKFLAGS(entry) != flags) { + /* We match if HVhek_UTF8 bit in our flags and hash key's match. + But if entry was set previously with HVhek_WASUTF8 and key now + doesn't (or vice versa) then we should change the key's flag, + as this is assignment. */ + if (HvSHAREKEYS(hv)) { + /* Need to swap the key we have for a key with the flags we + need. As keys are shared we can't just write to the flag, + so we share the new one, unshare the old one. 
*/ + int flags_nofree = flags & ~HVhek_FREEKEY; + HEK *new_hek = share_hek_flags(key, klen, hash, flags_nofree); + unshare_hek (HeKEY_hek(entry)); + HeKEY_hek(entry) = new_hek; + } + else + HeKFLAGS(entry) = flags; + } + if (flags & HVhek_FREEKEY) Safefree(key); return entry; } if (SvREADONLY(hv)) { - Perl_hv_notallowed(aTHX_ is_utf8, key, klen, keysave, + Perl_hv_notallowed(aTHX_ flags, key, klen, "Attempt to access disallowed key '%"SVf"' to a fixed hash" ); } entry = new_HE(); + /* share_hek_flags will do the free for us. This might be considered + bad API design. */ if (HvSHAREKEYS(hv)) - HeKEY_hek(entry) = share_hek(key, is_utf8?-(I32)klen:klen, hash); + HeKEY_hek(entry) = share_hek_flags(key, klen, hash, flags); else /* gotta do the real thing */ - HeKEY_hek(entry) = save_hek(key, is_utf8?-(I32)klen:klen, hash); - if (key != keysave) - Safefree(key); + HeKEY_hek(entry) = save_hek_flags(key, klen, hash, flags); HeVAL(entry) = val; HeNEXT(entry) = *oentry; *oentry = entry; @@ -702,6 +816,7 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) SV **svp; SV *sv; bool is_utf8 = FALSE; + int k_flags = 0; const char *keysave = key; if (!hv) @@ -743,6 +858,10 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) /* See the note in hv_fetch(). --jhi */ key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8); klen = tmplen; + if (is_utf8) + k_flags = HVhek_UTF8; + if (key != keysave) + k_flags |= HVhek_FREEKEY; } PERL_HASH(hash, key, klen); @@ -758,9 +877,9 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) continue; if (HeKEY(entry) != key && memNE(HeKEY(entry),key,klen)) /* is this it? */ continue; - if (HeKUTF8(entry) != (char)is_utf8) + if ((HeKFLAGS(entry) ^ k_flags) & HVhek_UTF8) continue; - if (key != keysave) + if (k_flags & HVhek_FREEKEY) Safefree(key); /* if placeholder is here, it's already been deleted.... 
*/ if (HeVAL(entry) == &PL_sv_undef) @@ -778,13 +897,13 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) hv_free_ent(hv, entry); xhv->xhv_keys--; /* HvKEYS(hv)-- */ if (xhv->xhv_keys == 0) - HvUTF8KEYS_off(hv); + HvHASKFLAGS_off(hv); xhv->xhv_placeholders--; return Nullsv; } } else if (SvREADONLY(hv) && HeVAL(entry) && SvREADONLY(HeVAL(entry))) { - Perl_hv_notallowed(aTHX_ is_utf8, key, klen, keysave, + Perl_hv_notallowed(aTHX_ k_flags, key, klen, "Attempt to delete readonly key '%"SVf"' from a fixed hash" ); } @@ -817,17 +936,17 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) hv_free_ent(hv, entry); xhv->xhv_keys--; /* HvKEYS(hv)-- */ if (xhv->xhv_keys == 0) - HvUTF8KEYS_off(hv); + HvHASKFLAGS_off(hv); } return sv; } if (SvREADONLY(hv)) { - Perl_hv_notallowed(aTHX_ is_utf8, key, klen, keysave, + Perl_hv_notallowed(aTHX_ k_flags, key, klen, "Attempt to access disallowed key '%"SVf"' from a fixed hash" ); } - if (key != keysave) + if (k_flags & HVhek_FREEKEY) Safefree(key); return Nullsv; } @@ -854,6 +973,7 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) register HE **oentry; SV *sv; bool is_utf8; + int k_flags = 0; char *keysave; if (!hv) @@ -891,8 +1011,13 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) keysave = key = SvPV(keysv, klen); is_utf8 = (SvUTF8(keysv) != 0); - if (is_utf8) + if (is_utf8) { key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); + if (is_utf8) + k_flags = HVhek_UTF8; + if (key != keysave) + k_flags |= HVhek_FREEKEY; + } if (!hash) PERL_HASH(hash, key, klen); @@ -908,10 +1033,10 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) continue; if (HeKEY(entry) != key && memNE(HeKEY(entry),key,klen)) /* is this it? 
*/ continue; - if (HeKUTF8(entry) != (char)is_utf8) + if ((HeKFLAGS(entry) ^ k_flags) & HVhek_UTF8) continue; - if (key != keysave) - Safefree(key); + if (k_flags & HVhek_FREEKEY) + Safefree(key); /* if placeholder is here, it's already been deleted.... */ if (HeVAL(entry) == &PL_sv_undef) @@ -929,12 +1054,12 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) hv_free_ent(hv, entry); xhv->xhv_keys--; /* HvKEYS(hv)-- */ if (xhv->xhv_keys == 0) - HvUTF8KEYS_off(hv); + HvHASKFLAGS_off(hv); xhv->xhv_placeholders--; return Nullsv; } else if (SvREADONLY(hv) && HeVAL(entry) && SvREADONLY(HeVAL(entry))) { - Perl_hv_notallowed(aTHX_ is_utf8, key, klen, keysave, + Perl_hv_notallowed(aTHX_ k_flags, key, klen, "Attempt to delete readonly key '%"SVf"' from a fixed hash" ); } @@ -967,17 +1092,17 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) hv_free_ent(hv, entry); xhv->xhv_keys--; /* HvKEYS(hv)-- */ if (xhv->xhv_keys == 0) - HvUTF8KEYS_off(hv); + HvHASKFLAGS_off(hv); } return sv; } if (SvREADONLY(hv)) { - Perl_hv_notallowed(aTHX_ is_utf8, key, klen, keysave, + Perl_hv_notallowed(aTHX_ k_flags, key, klen, "Attempt to delete disallowed key '%"SVf"' from a fixed hash" ); } - if (key != keysave) + if (k_flags & HVhek_FREEKEY) Safefree(key); return Nullsv; } @@ -1000,6 +1125,7 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen) SV *sv; bool is_utf8 = FALSE; const char *keysave = key; + int k_flags = 0; if (!hv) return 0; @@ -1035,6 +1161,10 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen) /* See the note in hv_fetch(). --jhi */ key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8); klen = tmplen; + if (is_utf8) + k_flags = HVhek_UTF8; + if (key != keysave) + k_flags |= HVhek_FREEKEY; } PERL_HASH(hash, key, klen); @@ -1052,9 +1182,9 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen) continue; if (HeKEY(entry) != key && memNE(HeKEY(entry),key,klen)) /* is this it? 
*/ continue; - if (HeKUTF8(entry) != (char)is_utf8) + if ((HeKFLAGS(entry) ^ k_flags) & HVhek_UTF8) continue; - if (key != keysave) + if (k_flags & HVhek_FREEKEY) Safefree(key); /* If we find the key, but the value is a placeholder, return false. */ if (HeVAL(entry) == &PL_sv_undef) @@ -1070,12 +1200,14 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen) sv = newSVpvn(env,len); SvTAINTED_on(sv); (void)hv_store(hv,key,klen,sv,hash); + if (k_flags & HVhek_FREEKEY) + Safefree(key); return TRUE; } } #endif - if (key != keysave) - Safefree(key); + if (k_flags & HVhek_FREEKEY) + Safefree(key); return FALSE; } @@ -1100,6 +1232,7 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash) SV *sv; bool is_utf8; char *keysave; + int k_flags = 0; if (!hv) return 0; @@ -1131,8 +1264,13 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash) keysave = key = SvPV(keysv, klen); is_utf8 = (SvUTF8(keysv) != 0); - if (is_utf8) + if (is_utf8) { key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); + if (is_utf8) + k_flags = HVhek_UTF8; + if (key != keysave) + k_flags |= HVhek_FREEKEY; + } if (!hash) PERL_HASH(hash, key, klen); @@ -1149,9 +1287,9 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash) continue; if (HeKEY(entry) != key && memNE(HeKEY(entry),key,klen)) /* is this it? */ continue; - if (HeKUTF8(entry) != (char)is_utf8) + if ((HeKFLAGS(entry) ^ k_flags) & HVhek_UTF8) continue; - if (key != keysave) + if (k_flags & HVhek_FREEKEY) Safefree(key); /* If we find the key, but the value is a placeholder, return false. 
*/ if (HeVAL(entry) == &PL_sv_undef) @@ -1166,12 +1304,14 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash) sv = newSVpvn(env,len); SvTAINTED_on(sv); (void)hv_store_ent(hv,keysv,sv,hash); + if (k_flags & HVhek_FREEKEY) + Safefree(key); return TRUE; } } #endif - if (key != keysave) - Safefree(key); + if (k_flags & HVhek_FREEKEY) + Safefree(key); return FALSE; } @@ -1376,12 +1516,14 @@ Perl_newHVhv(pTHX_ HV *ohv) for (oent = oents[i]; oent; oent = HeNEXT(oent)) { U32 hash = HeHASH(oent); char *key = HeKEY(oent); - STRLEN len = HeKLEN_UTF8(oent); + STRLEN len = HeKLEN(oent); + int flags = HeKFLAGS(oent); ent = new_HE(); HeVAL(ent) = newSVsv(HeVAL(oent)); - HeKEY_hek(ent) = shared ? share_hek(key, len, hash) - : save_hek(key, len, hash); + HeKEY_hek(ent) + = shared ? share_hek_flags(key, len, hash, flags) + : save_hek_flags(key, len, hash, flags); if (prev) HeNEXT(prev) = ent; else @@ -1409,8 +1551,9 @@ Perl_newHVhv(pTHX_ HV *ohv) hv_iterinit(ohv); while ((entry = hv_iternext(ohv))) { - hv_store(hv, HeKEY(entry), HeKLEN_UTF8(entry), - newSVsv(HeVAL(entry)), HeHASH(entry)); + hv_store_flags(hv, HeKEY(entry), HeKLEN(entry), + newSVsv(HeVAL(entry)), HeHASH(entry), + HeKFLAGS(entry)); } HvRITER(ohv) = riter; HvEITER(ohv) = eiter; @@ -1491,7 +1634,7 @@ Perl_hv_clear(pTHX_ HV *hv) if (SvRMAGICAL(hv)) mg_clear((SV*)hv); - HvUTF8KEYS_off(hv); + HvHASKFLAGS_off(hv); } STATIC void @@ -1726,11 +1869,28 @@ see C. SV * Perl_hv_iterkeysv(pTHX_ register HE *entry) { - if (HeKLEN(entry) == HEf_SVKEY) - return sv_mortalcopy(HeKEY_sv(entry)); - else - return sv_2mortal(newSVpvn_share((HeKLEN(entry) ? 
HeKEY(entry) : ""), - HeKLEN_UTF8(entry), HeHASH(entry))); + if (HeKLEN(entry) != HEf_SVKEY) { + HEK *hek = HeKEY_hek(entry); + int flags = HEK_FLAGS(hek); + SV *sv; + + if (flags & HVhek_WASUTF8) { + /* Trouble :-) + Andreas would like keys he put in as utf8 to come back as utf8 + */ + STRLEN utf8_len = HEK_LEN(hek); + U8 *as_utf8 = bytes_to_utf8 (HEK_KEY(hek), &utf8_len); + + sv = newSVpvn (as_utf8, utf8_len); + SvUTF8_on (sv); + } else { + sv = newSVpvn_share(HEK_KEY(hek), + (HEK_UTF8(hek) ? -HEK_LEN(hek) : HEK_LEN(hek)), + HEK_HASH(hek)); + } + return sv_2mortal(sv); + } + return sv_mortalcopy(HeKEY_sv(entry)); } /* @@ -1806,20 +1966,44 @@ Perl_sharepvn(pTHX_ const char *sv, I32 len, U32 hash) void Perl_unsharepvn(pTHX_ const char *str, I32 len, U32 hash) { + unshare_hek_or_pvn (NULL, str, len, hash); +} + + +void +Perl_unshare_hek(pTHX_ HEK *hek) +{ + unshare_hek_or_pvn(hek, NULL, 0, 0); +} + +/* possibly free a shared string if no one has access to it + hek if non-NULL takes priority over the other 3, else str, len and hash + are used. If so, len and hash must both be valid for str. + */ +void +S_unshare_hek_or_pvn(pTHX_ HEK *hek, const char *str, I32 len, U32 hash) +{ register XPVHV* xhv; register HE *entry; register HE **oentry; register I32 i = 1; I32 found = 0; bool is_utf8 = FALSE; + int k_flags = 0; const char *save = str; - if (len < 0) { - STRLEN tmplen = -len; - is_utf8 = TRUE; - /* See the note in hv_fetch(). --jhi */ - str = (char*)bytes_from_utf8((U8*)str, &tmplen, &is_utf8); - len = tmplen; + if (hek) { + hash = HEK_HASH(hek); + } else if (len < 0) { + STRLEN tmplen = -len; + is_utf8 = TRUE; + /* See the note in hv_fetch(). 
--jhi */ + str = (char*)bytes_from_utf8((U8*)str, &tmplen, &is_utf8); + len = tmplen; + if (is_utf8) + k_flags = HVhek_UTF8; + if (str != save) + k_flags |= HVhek_WASUTF8 | HVhek_FREEKEY; } /* what follows is the moral equivalent of: @@ -1832,31 +2016,48 @@ Perl_unsharepvn(pTHX_ const char *str, I32 len, U32 hash) LOCK_STRTAB_MUTEX; /* oentry = &(HvARRAY(hv))[hash & (I32) HvMAX(hv)]; */ oentry = &((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max]; - for (entry = *oentry; entry; i=0, oentry = &HeNEXT(entry), entry = *oentry) { - if (HeHASH(entry) != hash) /* strings can't be equal */ - continue; - if (HeKLEN(entry) != len) - continue; - if (HeKEY(entry) != str && memNE(HeKEY(entry),str,len)) /* is this it? */ - continue; - if (HeKUTF8(entry) != (char)is_utf8) - continue; - found = 1; - if (--HeVAL(entry) == Nullsv) { - *oentry = HeNEXT(entry); - if (i && !*oentry) - xhv->xhv_fill--; /* HvFILL(hv)-- */ - Safefree(HeKEY_hek(entry)); - del_HE(entry); - xhv->xhv_keys--; /* HvKEYS(hv)-- */ - } - break; + if (hek) { + for (entry = *oentry; entry; i=0, oentry = &HeNEXT(entry), entry = *oentry) { + if (HeKEY_hek(entry) != hek) + continue; + found = 1; + break; + } + } else { + int flags_masked = k_flags & HVhek_MASK; + for (entry = *oentry; entry; i=0, oentry = &HeNEXT(entry), entry = *oentry) { + if (HeHASH(entry) != hash) /* strings can't be equal */ + continue; + if (HeKLEN(entry) != len) + continue; + if (HeKEY(entry) != str && memNE(HeKEY(entry),str,len)) /* is this it? 
*/ + continue; + if (HeKFLAGS(entry) != flags_masked) + continue; + found = 1; + break; + } + } + + if (found) { + if (--HeVAL(entry) == Nullsv) { + *oentry = HeNEXT(entry); + if (i && !*oentry) + xhv->xhv_fill--; /* HvFILL(hv)-- */ + Safefree(HeKEY_hek(entry)); + del_HE(entry); + xhv->xhv_keys--; /* HvKEYS(hv)-- */ + } } + UNLOCK_STRTAB_MUTEX; - if (str != save) - Safefree(str); if (!found && ckWARN_d(WARN_INTERNAL)) - Perl_warner(aTHX_ packWARN(WARN_INTERNAL), "Attempt to free non-existent shared string '%s'",str); + Perl_warner(aTHX_ packWARN(WARN_INTERNAL), + "Attempt to free non-existent shared string '%s'%s", + hek ? HEK_KEY(hek) : str, + (k_flags & HVhek_UTF8) ? " (utf8)" : ""); + if (k_flags & HVhek_FREEKEY) + Safefree(str); } /* get a (constant) string ptr from the global string table @@ -1866,12 +2067,8 @@ Perl_unsharepvn(pTHX_ const char *str, I32 len, U32 hash) HEK * Perl_share_hek(pTHX_ const char *str, I32 len, register U32 hash) { - register XPVHV* xhv; - register HE *entry; - register HE **oentry; - register I32 i = 1; - I32 found = 0; bool is_utf8 = FALSE; + int flags = 0; const char *save = str; if (len < 0) { @@ -1880,7 +2077,29 @@ Perl_share_hek(pTHX_ const char *str, I32 len, register U32 hash) /* See the note in hv_fetch(). --jhi */ str = (char*)bytes_from_utf8((U8*)str, &tmplen, &is_utf8); len = tmplen; - } + /* If we were able to downgrade here, then than means that we were passed + in a key which only had chars 0-255, but was utf8 encoded. */ + if (is_utf8) + flags = HVhek_UTF8; + /* If we found we were able to downgrade the string to bytes, then + we should flag that it needs upgrading on keys or each. Also flag + that we need share_hek_flags to free the string. 
*/ + if (str != save) + flags |= HVhek_WASUTF8 | HVhek_FREEKEY; + } + + return share_hek_flags (str, len, hash, flags); +} + +HEK * +S_share_hek_flags(pTHX_ const char *str, I32 len, register U32 hash, int flags) +{ + register XPVHV* xhv; + register HE *entry; + register HE **oentry; + register I32 i = 1; + I32 found = 0; + int flags_masked = flags & HVhek_MASK; /* what follows is the moral equivalent of: @@ -1899,14 +2118,14 @@ Perl_share_hek(pTHX_ const char *str, I32 len, register U32 hash) continue; if (HeKEY(entry) != str && memNE(HeKEY(entry),str,len)) /* is this it? */ continue; - if (HeKUTF8(entry) != (char)is_utf8) + if (HeKFLAGS(entry) != flags_masked) continue; found = 1; break; } if (!found) { entry = new_HE(); - HeKEY_hek(entry) = save_hek(str, is_utf8?-len:len, hash); + HeKEY_hek(entry) = save_hek_flags(str, len, hash, flags); HeVAL(entry) = Nullsv; HeNEXT(entry) = *oentry; *oentry = entry; @@ -1920,7 +2139,9 @@ Perl_share_hek(pTHX_ const char *str, I32 len, register U32 hash) ++HeVAL(entry); /* use value slot as REFCNT */ UNLOCK_STRTAB_MUTEX; - if (str != save) + + if (flags & HVhek_FREEKEY) Safefree(str); + return HeKEY_hek(entry); } diff --git a/hv.h b/hv.h index 3d51075..68fc9da 100644 --- a/hv.h +++ b/hv.h @@ -24,7 +24,8 @@ struct hek { I32 hek_len; /* length of hash key */ char hek_key[1]; /* variable-length hash key */ /* the hash-key is \0-terminated */ - /* after the \0 there is a byte telling whether the key is UTF8 */ + /* after the \0 there is a byte for flags, such as whether the key is + UTF8 */ }; /* hash structure: */ @@ -163,9 +164,16 @@ C. #define HvSHAREKEYS_on(hv) (SvFLAGS(hv) |= SVphv_SHAREKEYS) #define HvSHAREKEYS_off(hv) (SvFLAGS(hv) &= ~SVphv_SHAREKEYS) -#define HvUTF8KEYS(hv) (SvFLAGS(hv) & SVphv_UTF8KEYS) -#define HvUTF8KEYS_on(hv) (SvFLAGS(hv) |= SVphv_UTF8KEYS) -#define HvUTF8KEYS_off(hv) (SvFLAGS(hv) &= ~SVphv_UTF8KEYS) +/* This is an optimisation flag. It won't be set if all hash keys have a 0 + * flag.
Currently the only flags relate to utf8. + * Hence it won't be set if all keys are 8 bit only. It will be set if any key + * is utf8 (including 8 bit keys that were entered as utf8, and need upgrading + * when retrieved during iteration). It may still be set when there are no longer + * any utf8 keys. + */ +#define HvHASKFLAGS(hv) (SvFLAGS(hv) & SVphv_HASKFLAGS) +#define HvHASKFLAGS_on(hv) (SvFLAGS(hv) |= SVphv_HASKFLAGS) +#define HvHASKFLAGS_off(hv) (SvFLAGS(hv) &= ~SVphv_HASKFLAGS) #define HvLAZYDEL(hv) (SvFLAGS(hv) & SVphv_LAZYDEL) #define HvLAZYDEL_on(hv) (SvFLAGS(hv) |= SVphv_LAZYDEL) @@ -191,7 +199,9 @@ C. #define HeKEY_sv(he) (*(SV**)HeKEY(he)) #define HeKLEN(he) HEK_LEN(HeKEY_hek(he)) #define HeKUTF8(he) HEK_UTF8(HeKEY_hek(he)) +#define HeKWASUTF8(he) HEK_WASUTF8(HeKEY_hek(he)) #define HeKLEN_UTF8(he) (HeKUTF8(he) ? -HeKLEN(he) : HeKLEN(he)) +#define HeKFLAGS(he) HEK_FLAGS(HeKEY_hek(he)) #define HeVAL(he) (he)->hent_val #define HeHASH(he) HEK_HASH(HeKEY_hek(he)) #define HePV(he,lp) ((HeKLEN(he) == HEf_SVKEY) ? \ @@ -216,7 +226,19 @@ C. #define HEK_HASH(hek) (hek)->hek_hash #define HEK_LEN(hek) (hek)->hek_len #define HEK_KEY(hek) (hek)->hek_key -#define HEK_UTF8(hek) (*(HEK_KEY(hek)+HEK_LEN(hek)+1)) +#define HEK_FLAGS(hek) (*((unsigned char *)(HEK_KEY(hek))+HEK_LEN(hek)+1)) + +#define HVhek_UTF8 0x01 /* Key is utf8 encoded. */ +#define HVhek_WASUTF8 0x02 /* Key is bytes here, but was supplied as utf8. */ +#define HVhek_FREEKEY 0x100 /* Internal flag to say key is malloc()ed.
*/ +#define HVhek_MASK 0xFF + +#define HEK_UTF8(hek) (HEK_FLAGS(hek) & HVhek_UTF8) +#define HEK_UTF8_on(hek) (HEK_FLAGS(hek) |= HVhek_UTF8) +#define HEK_UTF8_off(hek) (HEK_FLAGS(hek) &= ~HVhek_UTF8) +#define HEK_WASUTF8(hek) (HEK_FLAGS(hek) & HVhek_WASUTF8) +#define HEK_WASUTF8_on(hek) (HEK_FLAGS(hek) |= HVhek_WASUTF8) +#define HEK_WASUTF8_off(hek) (HEK_FLAGS(hek) &= ~HVhek_WASUTF8) /* calculate HV array allocation */ #if defined(STRANGE_MALLOC) || defined(MYMALLOC) diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index a5daaf5..ad3637f 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -137,19 +137,6 @@ This works for all characters that have names. =item * -If Unicode is used in hash keys, there is a subtle effect on the hashes. -The hash becomes "Unicode-sticky" so that keys retrieved from the hash -(either by %hash, each %hash, or keys %hash) will be in Unicode, not -in bytes, even when the keys were bytes went they "went in". This -"stickiness" persists unless the hash is completely emptied, either by -using delete() or clearing the with undef() or assigning an empty list -to the hash. Most of the time this difference is negligible, but -there are few places where it matters: for example the regular -expression character classes like C<\w> behave differently for -bytes and characters. - -=item * - If an appropriate L is specified, identifiers within the Perl script may contain Unicode alphanumeric characters, including ideographs. 
(You are currently on your own when it comes to using the diff --git a/pp.c b/pp.c index fbe4737..6c4f2ff 100644 --- a/pp.c +++ b/pp.c @@ -3689,15 +3689,6 @@ PP(pp_each) EXTEND(SP, 2); if (entry) { SV* sv = hv_iterkeysv(entry); - if (HvUTF8KEYS((SV*)hash) && !DO_UTF8(sv)) { - STRLEN len, i; - char* s = SvPV(sv, len); - for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++); - if (i < len) { - sv = newSVsv(sv); - sv_utf8_upgrade(sv); - } - } PUSHs(sv); /* won't clobber stack_sp */ if (gimme == G_ARRAY) { SV *val; diff --git a/sv.h b/sv.h index 74c7f3c..92dec20 100644 --- a/sv.h +++ b/sv.h @@ -235,7 +235,7 @@ perform the upgrade if necessary. See C. #define SVphv_SHAREKEYS 0x20000000 /* keys live on shared string table */ #define SVphv_LAZYDEL 0x40000000 /* entry in xhv_eiter must be deleted */ -#define SVphv_UTF8KEYS 0x80000000 /* keys when fetched are UTF8 */ +#define SVphv_HASKFLAGS 0x80000000 /* keys have flag byte after hash */ #define SVprv_WEAKREF 0x80000000 /* Weak reference */ diff --git a/t/op/utfhash.t b/t/op/utfhash.t index e2337e0..af7e6c1 100644 --- a/t/op/utfhash.t +++ b/t/op/utfhash.t @@ -1,11 +1,15 @@ +#!./perl -w + BEGIN { chdir 't' if -d 't'; @INC = '../lib'; require './test.pl'; - plan(tests => 48); + plan(tests => 91); } +use strict; + # Two hashes one will all keys 8-bit possible (initially), other # with a utf8 requiring key from the outset. @@ -79,24 +83,90 @@ foreach my $a ("\x7f","\xff") { - print "# Unicode hash keys and \\w\n"; - # This is not really a regex test but regexes bring - # out the issue nicely. - use strict; - my $u3 = "f\x{df}\x{100}"; - my $u2 = substr($u3,0,2); - my $u1 = substr($u2,0,1); - my %u = ( $u1 => $u1, $u2 => $u2, $u3 => $u3 ); + print "# Unicode hash keys and \\w\n"; + # This is not really a regex test but regexes bring + # out the issue nicely. 
+ use strict; + my $u3 = "f\x{df}\x{100}"; + my $u2 = substr($u3,0,2); + my $u1 = substr($u2,0,1); + my $u0 = chr (0xdf)x4; # Make this 4 chars so that all lengths are distinct. + + my @u = ($u0, $u1, $u2, $u3); + + while (@u) { + my %u = (map {( $_, $_)} @u); + my $keys = scalar @u; + $keys .= ($keys == 1) ? " key" : " keys"; for (keys %u) { - ok (/^\w+$/ && $u{$_} =~ /^\w+$/, "\\w on keys"); - } - - for (each %u) { - ok (/^\w+$/ && $u{$_} =~ /^\w+$/, "\\w on each"); - } + my $l = 0 + /^\w+$/; + my $r = 0 + $u{$_} =~ /^\w+$/; + is ($l, $r, "\\w on keys with $keys, key of length " . length $_); + } + + my $more; + do { + $more = 0; + # Want to do this direct, rather than copying to a temporary variable + # The first time each will return key and value at the start of the hash. + # each will return () after we've done the last pair. $more won't get + # set then, and the do will exit. + for (each %u) { + $more = 1; + my $l = 0 + /^\w+$/; + my $r = 0 + $u{$_} =~ /^\w+$/; + is ($l, $r, "\\w on each, with $keys, key of length " . length $_); + } + } while ($more); for (%u) { - ok (/^\w+$/ && $u{$_} =~ /^\w+$/, "\\w on hash"); - } + my $l = 0 + /^\w+$/; + my $r = 0 + $u{$_} =~ /^\w+$/; + is ($l, $r, "\\w on hash with $keys, key of length " . 
length $_); + } + pop @u; + undef %u; + } +} + +{ + my $utf8_sz = my $bytes_sz = "\x{df}"; + $utf8_sz .= chr 256; + chop ($utf8_sz); + + my (%bytes_first, %utf8_first); + + $bytes_first{$bytes_sz} = $bytes_sz; + + for (keys %bytes_first) { + my $l = 0 + /^\w+$/; + my $r = 0 + $bytes_first{$_} =~ /^\w+$/; + is ($l, $r, "\\w on each, bytes"); + } + + $bytes_first{$utf8_sz} = $utf8_sz; + + for (keys %bytes_first) { + my $l = 0 + /^\w+$/; + my $r = 0 + $bytes_first{$_} =~ /^\w+$/; + is ($l, $r, "\\w on each, bytes now utf8"); + } + + $utf8_first{$utf8_sz} = $utf8_sz; + + for (keys %utf8_first) { + my $l = 0 + /^\w+$/; + my $r = 0 + $utf8_first{$_} =~ /^\w+$/; + is ($l, $r, "\\w on each, utf8"); + } + + $utf8_first{$bytes_sz} = $bytes_sz; + + for (keys %utf8_first) { + my $l = 0 + /^\w+$/; + my $r = 0 + $utf8_first{$_} =~ /^\w+$/; + is ($l, $r, "\\w on each, utf8 now bytes"); + } + }