From: Jarkko Hietaniemi Date: Fri, 22 Mar 2002 04:07:13 +0000 (+0000) Subject: If Unicode keys are entered to a hash, a bit is turned on. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=574c8022b1fdc7312bf9a5af037c8f777b60b6db;p=p5sagit%2Fp5-mst-13.2.git If Unicode keys are entered to a hash, a bit is turned on. If the bit is on, when the keys are fetched from the hash (%h, each %h, keys %h), the Unicodified versions of the keys are returned if needed. This solution errs on the size of over-Unicodifying, the old solution erred on the side of under-Unicodifying. As long as the hash keys can be a mix of byte and Unicode strings, a perfect fit is hard to come by. p4raw-id: //depot/perl@15407 --- diff --git a/doop.c b/doop.c index e2faa87..20379a9 100644 --- a/doop.c +++ b/doop.c @@ -1336,8 +1336,19 @@ Perl_do_kv(pTHX) PUTBACK; /* hv_iternext and hv_iterval might clobber stack_sp */ while ((entry = hv_iternext(keys))) { SPAGAIN; - if (dokeys) - XPUSHs(hv_iterkeysv(entry)); /* won't clobber stack_sp */ + if (dokeys) { + SV* sv = hv_iterkeysv(entry); + if (HvUTF8KEYS((SV*)hv) && !DO_UTF8(sv)) { + STRLEN len, i; + char* s = SvPV(sv, len); + for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++); + if (i < len) { + sv = newSVsv(sv); + sv_utf8_upgrade(sv); + } + } + XPUSHs(sv); /* won't clobber stack_sp */ + } if (dovalues) { PUTBACK; tmpstr = realhv ? diff --git a/dump.c b/dump.c index b4b37bb..48a3b38 100644 --- a/dump.c +++ b/dump.c @@ -980,6 +980,7 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, I32 nest, I32 maxnest, bo case SVt_PVHV: if (HvSHAREKEYS(sv)) sv_catpv(d, "SHAREKEYS,"); if (HvLAZYDEL(sv)) sv_catpv(d, "LAZYDEL,"); + if (HvUTF8KEYS(sv)) sv_catpv(d, "UTF8,"); break; case SVt_PVGV: if (GvINTRO(sv)) sv_catpv(d, "INTRO,"); diff --git a/ext/Devel/Peek/Peek.t b/ext/Devel/Peek/Peek.t index bd42d93..f577369 100644 --- a/ext/Devel/Peek/Peek.t +++ b/ext/Devel/Peek/Peek.t @@ -347,8 +347,8 @@ do_test(19, RV = $ADDR SV = PVHV\\($ADDR\\) at $ADDR REFCNT = 2 - FLAGS = \\(SHAREKEYS\\) - IV = 1 + FLAGS = \\(SHAREKEYS,UTF8\\) + UV = 1 NV = $FLOAT ARRAY = $ADDR \\(0:7, 1:1\\) hash quality = 100.0% @@ -373,8 +373,8 @@ do_test(19, RV = $ADDR SV = PVHV\\($ADDR\\) at $ADDR REFCNT = 2 - FLAGS = \\(SHAREKEYS\\) - IV = 1 + FLAGS = \\(SHAREKEYS,UTF8\\) + UV = 1 NV = 0 ARRAY = $ADDR \\(0:7, 1:1\\) hash quality = 100.0% diff --git a/hv.c b/hv.c index 41aa8bb..f92e31e 100644 --- a/hv.c +++ b/hv.c @@ -488,11 +488,13 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has #endif } } + if (is_utf8) { STRLEN tmplen = klen; /* See the note in hv_fetch(). --jhi */ key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8); klen = tmplen; + HvUTF8KEYS_on((SV*)hv); } if (!hash) @@ -615,8 +617,10 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash) keysave = key = SvPV(keysv, klen); is_utf8 = (SvUTF8(keysv) != 0); - if (is_utf8) + if (is_utf8) { key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8); + HvUTF8KEYS_on((SV*)hv); + } if (!hash) PERL_HASH(hash, key, klen); @@ -773,6 +777,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) else hv_free_ent(hv, entry); xhv->xhv_keys--; /* HvKEYS(hv)-- */ + if (xhv->xhv_keys == 0) + HvUTF8KEYS_off(hv); xhv->xhv_placeholders--; return Nullsv; } @@ -810,6 +816,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags) else hv_free_ent(hv, entry); xhv->xhv_keys--; /* HvKEYS(hv)-- */ + if (xhv->xhv_keys == 0) + HvUTF8KEYS_off(hv); } return sv; } @@ -920,6 +928,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) else hv_free_ent(hv, entry); xhv->xhv_keys--; /* HvKEYS(hv)-- */ + if (xhv->xhv_keys == 0) + HvUTF8KEYS_off(hv); xhv->xhv_placeholders--; return Nullsv; } @@ -956,6 +966,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash) else hv_free_ent(hv, entry); xhv->xhv_keys--; /* HvKEYS(hv)-- */ + if (xhv->xhv_keys == 0) + HvUTF8KEYS_off(hv); } return sv; } @@ -1478,6 +1490,8 @@ Perl_hv_clear(pTHX_ HV *hv) if (SvRMAGICAL(hv)) mg_clear((SV*)hv); + + HvUTF8KEYS_off(hv); } STATIC void diff --git a/hv.h b/hv.h index 369bf3c..3d51075 100644 --- a/hv.h +++ b/hv.h @@ -159,11 +159,14 @@ C. #define HvTOTALKEYS(hv) XHvTOTALKEYS((XPVHV*) SvANY(hv)) #define HvPLACEHOLDERS(hv) XHvPLACEHOLDERS((XPVHV*) SvANY(hv)) - #define HvSHAREKEYS(hv) (SvFLAGS(hv) & SVphv_SHAREKEYS) #define HvSHAREKEYS_on(hv) (SvFLAGS(hv) |= SVphv_SHAREKEYS) #define HvSHAREKEYS_off(hv) (SvFLAGS(hv) &= ~SVphv_SHAREKEYS) +#define HvUTF8KEYS(hv) (SvFLAGS(hv) & SVphv_UTF8KEYS) +#define HvUTF8KEYS_on(hv) (SvFLAGS(hv) |= SVphv_UTF8KEYS) +#define HvUTF8KEYS_off(hv) (SvFLAGS(hv) &= ~SVphv_UTF8KEYS) + #define HvLAZYDEL(hv) (SvFLAGS(hv) & SVphv_LAZYDEL) #define HvLAZYDEL_on(hv) (SvFLAGS(hv) |= SVphv_LAZYDEL) #define HvLAZYDEL_off(hv) (SvFLAGS(hv) &= ~SVphv_LAZYDEL) diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 4cb8325..9ba32ee 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -113,8 +113,8 @@ Character semantics have the following effects: =item * -Strings and patterns may contain characters that have an ordinal value -larger than 255. +Strings (including hash keys) and regular expression patterns may +contain characters that have an ordinal value larger than 255. If you use a Unicode editor to edit your program, Unicode characters may occur directly within the literal strings in one of the various @@ -128,18 +128,20 @@ hexadecimal, into the curlies. For instance, a smiley face is C<\x{263A}>. This works only for characters with a code 0x100 and above. Additionally, if you + use charnames ':full'; + you can use the C<\N{...}> notation, putting the official Unicode character name within the curlies. For example, C<\N{WHITE SMILING FACE}>. This works for all characters that have names. =item * -If an appropriate L is specified, -identifiers within the Perl script may contain Unicode alphanumeric -characters, including ideographs. (You are currently on your own when -it comes to using the canonical forms of characters--Perl doesn't -(yet) attempt to canonicalize variable names for you.) +If an appropriate L is specified, identifiers within the +Perl script may contain Unicode alphanumeric characters, including +ideographs. (You are currently on your own when it comes to using the +canonical forms of characters--Perl doesn't (yet) attempt to +canonicalize variable names for you.) =item * @@ -846,8 +848,7 @@ B, is UTF-8. Perl tries really hard to work both with Unicode and the old byte oriented world: most often this is nice, but sometimes this causes -problems. See L for example how sometimes using locales -with Unicode can help with these problems. +problems. =back @@ -959,19 +960,10 @@ Use of locales with Unicode data may lead to odd results. Currently there is some attempt to apply 8-bit locale info to characters in the range 0..255, but this is demonstrably incorrect for locales that use characters above that range when mapped into Unicode. It will also -tend to run slower. Avoidance of locales is strongly encouraged, -with one known expection, see the next paragraph. - -If the keys of a hash are "mixed", that is, some keys are Unicode, -while some keys are "byte", the keys may behave differently in regular -expressions since the definition of character classes like C -is different for byte strings and character strings. This problem can -sometimes be helped by using an appropriate locale (see L). -Another way is to force all the strings to be character encoded by -using utf8::upgrade() (see L). +tend to run slower. Use of locales with Unicode is discouraged. Some functions are slower when working on UTF-8 encoded strings than -on byte encoded strings. All functions that need to hop over +on byte encoded strings. All functions that need to hop over characters such as length(), substr() or index() can work B faster when the underlying data are byte-encoded. Witness the following benchmark: diff --git a/pp.c b/pp.c index 15bf351..757b4f0 100644 --- a/pp.c +++ b/pp.c @@ -3686,7 +3686,17 @@ PP(pp_each) EXTEND(SP, 2); if (entry) { - PUSHs(hv_iterkeysv(entry)); /* won't clobber stack_sp */ + SV* sv = hv_iterkeysv(entry); + if (HvUTF8KEYS((SV*)hash) && !DO_UTF8(sv)) { + STRLEN len, i; + char* s = SvPV(sv, len); + for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++); + if (i < len) { + sv = newSVsv(sv); + sv_utf8_upgrade(sv); + } + } + PUSHs(sv); /* won't clobber stack_sp */ if (gimme == G_ARRAY) { SV *val; PUTBACK; diff --git a/sv.h b/sv.h index b956768..9671bd7 100644 --- a/sv.h +++ b/sv.h @@ -235,6 +235,7 @@ perform the upgrade if necessary. See C. #define SVphv_SHAREKEYS 0x20000000 /* keys live on shared string table */ #define SVphv_LAZYDEL 0x40000000 /* entry in xhv_eiter must be deleted */ +#define SVphv_UTF8KEYS 0x80000000 /* keys when fetched are UTF8 */ #define SVprv_WEAKREF 0x80000000 /* Weak reference */ diff --git a/t/op/pat.t b/t/op/pat.t index b5dff4b..001a5b0 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..892\n"; +print "1..903\n"; BEGIN { chdir 't' if -d 't'; @@ -2771,3 +2771,36 @@ print "# some Unicode properties\n"; ++$test; } } + + +{ + my $test = 893; + + print "# Unicode hash keys and \\w\n"; + # This is not really a regex test but regexes bring + # out the issue nicely. + use strict; + my $u3 = "f\x{df}\x{100}"; + my $u2 = substr($u3,0,2); + my $u1 = substr($u2,0,1); + my %u = ( $u1 => $u1, $u2 => $u2, $u3 => $u3 ); + + for (keys %u) { + print /^\w+$/ && $u{$_} =~ /^\w+$/ ? + "ok $test\n" : "not ok $test\n"; + $test++; + } + + for (each %u) { + print /^\w+$/ && $u{$_} =~ /^\w+$/ ? + "ok $test\n" : "not ok $test\n"; + $test++; + } + + for (%u) { + print /^\w+$/ && $u{$_} =~ /^\w+$/ ? + "ok $test\n" : "not ok $test\n"; + $test++; + } +} +