From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Fri, 22 Mar 2002 04:07:13 +0000 (+0000)
Subject: If Unicode keys are entered to a hash, a bit is turned on.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=574c8022b1fdc7312bf9a5af037c8f777b60b6db;p=p5sagit%2Fp5-mst-13.2.git

If Unicode keys are entered to a hash, a bit is turned on.
If the bit is on, when the keys are fetched from the hash
(%h, each %h, keys %h), the Unicodified versions of the keys
are returned if needed.  This solution errs on the size of
over-Unicodifying, the old solution erred on the side of
under-Unicodifying.  As long as the hash keys can be a mix
of byte and Unicode strings, a perfect fit is hard to come by.

p4raw-id: //depot/perl@15407
---

diff --git a/doop.c b/doop.c
index e2faa87..20379a9 100644
--- a/doop.c
+++ b/doop.c
@@ -1336,8 +1336,19 @@ Perl_do_kv(pTHX)
     PUTBACK;	/* hv_iternext and hv_iterval might clobber stack_sp */
     while ((entry = hv_iternext(keys))) {
 	SPAGAIN;
-	if (dokeys)
-	    XPUSHs(hv_iterkeysv(entry));	/* won't clobber stack_sp */
+	if (dokeys) {
+	    SV* sv = hv_iterkeysv(entry);
+	    if (HvUTF8KEYS((SV*)hv) && !DO_UTF8(sv)) {
+	        STRLEN len, i;
+	        char* s = SvPV(sv, len);
+		for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++);
+		if (i < len) {
+		    sv = newSVsv(sv);
+		    sv_utf8_upgrade(sv);
+		}
+	    }
+	    XPUSHs(sv);	/* won't clobber stack_sp */
+	}
 	if (dovalues) {
 	    PUTBACK;
 	    tmpstr = realhv ?
diff --git a/dump.c b/dump.c
index b4b37bb..48a3b38 100644
--- a/dump.c
+++ b/dump.c
@@ -980,6 +980,7 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, I32 nest, I32 maxnest, bo
     case SVt_PVHV:
 	if (HvSHAREKEYS(sv))	sv_catpv(d, "SHAREKEYS,");
 	if (HvLAZYDEL(sv))	sv_catpv(d, "LAZYDEL,");
+	if (HvUTF8KEYS(sv))	sv_catpv(d, "UTF8,");
 	break;
     case SVt_PVGV:
 	if (GvINTRO(sv))	sv_catpv(d, "INTRO,");
diff --git a/ext/Devel/Peek/Peek.t b/ext/Devel/Peek/Peek.t
index bd42d93..f577369 100644
--- a/ext/Devel/Peek/Peek.t
+++ b/ext/Devel/Peek/Peek.t
@@ -347,8 +347,8 @@ do_test(19,
   RV = $ADDR
   SV = PVHV\\($ADDR\\) at $ADDR
     REFCNT = 2
-    FLAGS = \\(SHAREKEYS\\)
-    IV = 1
+    FLAGS = \\(SHAREKEYS,UTF8\\)
+    UV = 1
     NV = $FLOAT
     ARRAY = $ADDR  \\(0:7, 1:1\\)
     hash quality = 100.0%
@@ -373,8 +373,8 @@ do_test(19,
   RV = $ADDR
   SV = PVHV\\($ADDR\\) at $ADDR
     REFCNT = 2
-    FLAGS = \\(SHAREKEYS\\)
-    IV = 1
+    FLAGS = \\(SHAREKEYS,UTF8\\)
+    UV = 1
     NV = 0
     ARRAY = $ADDR  \\(0:7, 1:1\\)
     hash quality = 100.0%
diff --git a/hv.c b/hv.c
index 41aa8bb..f92e31e 100644
--- a/hv.c
+++ b/hv.c
@@ -488,11 +488,13 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has
 #endif
 	}
     }
+
     if (is_utf8) {
 	STRLEN tmplen = klen;
 	/* See the note in hv_fetch(). --jhi */
 	key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8);
 	klen = tmplen;
+	HvUTF8KEYS_on((SV*)hv);
     }
 
     if (!hash)
@@ -615,8 +617,10 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash)
     keysave = key = SvPV(keysv, klen);
     is_utf8 = (SvUTF8(keysv) != 0);
 
-    if (is_utf8)
+    if (is_utf8) {
 	key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+	HvUTF8KEYS_on((SV*)hv);
+    }
 
     if (!hash)
 	PERL_HASH(hash, key, klen);
@@ -773,6 +777,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
 		else
 		    hv_free_ent(hv, entry);
 		xhv->xhv_keys--; /* HvKEYS(hv)-- */
+		if (xhv->xhv_keys == 0)
+		    HvUTF8KEYS_off(hv);
 		xhv->xhv_placeholders--;
 		return Nullsv;
 	    }
@@ -810,6 +816,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
 	    else
 		hv_free_ent(hv, entry);
 	    xhv->xhv_keys--; /* HvKEYS(hv)-- */
+	    if (xhv->xhv_keys == 0)
+	        HvUTF8KEYS_off(hv);
 	}
 	return sv;
     }
@@ -920,6 +928,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
            else
                hv_free_ent(hv, entry);
            xhv->xhv_keys--; /* HvKEYS(hv)-- */
+	   if (xhv->xhv_keys == 0)
+               HvUTF8KEYS_off(hv);
            xhv->xhv_placeholders--;
            return Nullsv;
 	}
@@ -956,6 +966,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
 	    else
 		hv_free_ent(hv, entry);
 	    xhv->xhv_keys--; /* HvKEYS(hv)-- */
+	    if (xhv->xhv_keys == 0)
+	        HvUTF8KEYS_off(hv);
 	}
 	return sv;
     }
@@ -1478,6 +1490,8 @@ Perl_hv_clear(pTHX_ HV *hv)
 
     if (SvRMAGICAL(hv))
 	mg_clear((SV*)hv);
+
+    HvUTF8KEYS_off(hv);
 }
 
 STATIC void
diff --git a/hv.h b/hv.h
index 369bf3c..3d51075 100644
--- a/hv.h
+++ b/hv.h
@@ -159,11 +159,14 @@ C<SV*>.
 #define HvTOTALKEYS(hv)		XHvTOTALKEYS((XPVHV*)  SvANY(hv))
 #define HvPLACEHOLDERS(hv)	XHvPLACEHOLDERS((XPVHV*)  SvANY(hv))
 
-
 #define HvSHAREKEYS(hv)		(SvFLAGS(hv) & SVphv_SHAREKEYS)
 #define HvSHAREKEYS_on(hv)	(SvFLAGS(hv) |= SVphv_SHAREKEYS)
 #define HvSHAREKEYS_off(hv)	(SvFLAGS(hv) &= ~SVphv_SHAREKEYS)
 
+#define HvUTF8KEYS(hv)		(SvFLAGS(hv) & SVphv_UTF8KEYS)
+#define HvUTF8KEYS_on(hv)	(SvFLAGS(hv) |= SVphv_UTF8KEYS)
+#define HvUTF8KEYS_off(hv)	(SvFLAGS(hv) &= ~SVphv_UTF8KEYS)
+
 #define HvLAZYDEL(hv)		(SvFLAGS(hv) & SVphv_LAZYDEL)
 #define HvLAZYDEL_on(hv)	(SvFLAGS(hv) |= SVphv_LAZYDEL)
 #define HvLAZYDEL_off(hv)	(SvFLAGS(hv) &= ~SVphv_LAZYDEL)
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 4cb8325..9ba32ee 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -113,8 +113,8 @@ Character semantics have the following effects:
 
 =item *
 
-Strings and patterns may contain characters that have an ordinal value
-larger than 255.
+Strings (including hash keys) and regular expression patterns may
+contain characters that have an ordinal value larger than 255.
 
 If you use a Unicode editor to edit your program, Unicode characters
 may occur directly within the literal strings in one of the various
@@ -128,18 +128,20 @@ hexadecimal, into the curlies. For instance, a smiley face is C<\x{263A}>.
 This works only for characters with a code 0x100 and above.
 
 Additionally, if you
+
    use charnames ':full';
+
 you can use the C<\N{...}> notation, putting the official Unicode character
 name within the curlies. For example, C<\N{WHITE SMILING FACE}>.
 This works for all characters that have names.
 
 =item *
 
-If an appropriate L<encoding> is specified,
-identifiers within the Perl script may contain Unicode alphanumeric
-characters, including ideographs.  (You are currently on your own when
-it comes to using the canonical forms of characters--Perl doesn't
-(yet) attempt to canonicalize variable names for you.)
+If an appropriate L<encoding> is specified, identifiers within the
+Perl script may contain Unicode alphanumeric characters, including
+ideographs.  (You are currently on your own when it comes to using the
+canonical forms of characters--Perl doesn't (yet) attempt to
+canonicalize variable names for you.)
 
 =item *
 
@@ -846,8 +848,7 @@ B<any subsequent file open>, is UTF-8.
 
 Perl tries really hard to work both with Unicode and the old byte
 oriented world: most often this is nice, but sometimes this causes
-problems.  See L</BUGS> for example how sometimes using locales
-with Unicode can help with these problems.
+problems.
 
 =back
 
@@ -959,19 +960,10 @@ Use of locales with Unicode data may lead to odd results.  Currently
 there is some attempt to apply 8-bit locale info to characters in the
 range 0..255, but this is demonstrably incorrect for locales that use
 characters above that range when mapped into Unicode.  It will also
-tend to run slower.  Avoidance of locales is strongly encouraged,
-with one known expection, see the next paragraph.
-
-If the keys of a hash are "mixed", that is, some keys are Unicode,
-while some keys are "byte", the keys may behave differently in regular
-expressions since the definition of character classes like C</\w/>
-is different for byte strings and character strings.  This problem can
-sometimes be helped by using an appropriate locale (see L<perllocale>).
-Another way is to force all the strings to be character encoded by
-using utf8::upgrade() (see L<utf8>).
+tend to run slower.  Use of locales with Unicode is discouraged.
 
 Some functions are slower when working on UTF-8 encoded strings than
-on byte encoded strings. All functions that need to hop over
+on byte encoded strings.  All functions that need to hop over
 characters such as length(), substr() or index() can work B<much>
 faster when the underlying data are byte-encoded. Witness the
 following benchmark:
diff --git a/pp.c b/pp.c
index 15bf351..757b4f0 100644
--- a/pp.c
+++ b/pp.c
@@ -3686,7 +3686,17 @@ PP(pp_each)
 
     EXTEND(SP, 2);
     if (entry) {
-	PUSHs(hv_iterkeysv(entry));	/* won't clobber stack_sp */
+        SV* sv = hv_iterkeysv(entry);
+	if (HvUTF8KEYS((SV*)hash) && !DO_UTF8(sv)) {
+	    STRLEN len, i;
+	    char* s = SvPV(sv, len);
+	    for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++);
+	    if (i < len) {
+	        sv = newSVsv(sv);
+		sv_utf8_upgrade(sv);
+	    }
+	}
+	PUSHs(sv);	/* won't clobber stack_sp */
 	if (gimme == G_ARRAY) {
 	    SV *val;
 	    PUTBACK;
diff --git a/sv.h b/sv.h
index b956768..9671bd7 100644
--- a/sv.h
+++ b/sv.h
@@ -235,6 +235,7 @@ perform the upgrade if necessary.  See C<svtype>.
 
 #define SVphv_SHAREKEYS 0x20000000	/* keys live on shared string table */
 #define SVphv_LAZYDEL	0x40000000	/* entry in xhv_eiter must be deleted */
+#define SVphv_UTF8KEYS 	0x80000000	/* keys when fetched are UTF8 */
 
 #define SVprv_WEAKREF   0x80000000      /* Weak reference */
 
diff --git a/t/op/pat.t b/t/op/pat.t
index b5dff4b..001a5b0 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
 
 $| = 1;
 
-print "1..892\n";
+print "1..903\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -2771,3 +2771,36 @@ print "# some Unicode properties\n";
 	++$test;
     }
 }
+
+
+{
+    my $test = 893;
+
+    print "# Unicode hash keys and \\w\n";
+    # This is not really a regex test but regexes bring
+    # out the issue nicely.
+    use strict;
+    my $u3 = "f\x{df}\x{100}";
+    my $u2 = substr($u3,0,2);
+    my $u1 = substr($u2,0,1);
+    my %u = ( $u1 => $u1, $u2 => $u2, $u3 => $u3 );  
+
+    for (keys %u) {
+	print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+	    "ok $test\n" : "not ok $test\n";
+	$test++;
+   }
+
+    for (each %u) {
+	print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+	    "ok $test\n" : "not ok $test\n";
+	$test++;
+   }
+
+    for (%u) {
+	print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+	    "ok $test\n" : "not ok $test\n";
+	$test++;
+   }
+}
+