From: Chip Salzenberg <chip@pobox.com>
Date: Wed, 26 Aug 2009 21:33:15 +0000 (-0700)
Subject: In C<use utf8; a=>'b'>, do not set utf8 flag on 'a' [perl #68812]
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=eaf7a4d2ee7805b14e26e634fba0893913924a6c;p=p5sagit%2Fp5-mst-13.2.git

In C<use utf8; a=>'b'>, do not set utf8 flag on 'a' [perl #68812]
---

diff --git a/embed.fnc b/embed.fnc
index 3f9ddcd..3e829b6 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -481,6 +481,7 @@ ApPR	|bool	|is_uni_lower_lc|UV c
 ApPR	|bool	|is_uni_print_lc|UV c
 ApPR	|bool	|is_uni_punct_lc|UV c
 ApPR	|bool	|is_uni_xdigit_lc|UV c
+Apd	|bool	|is_ascii_string|NN const U8 *s|STRLEN len
 Apd	|STRLEN	|is_utf8_char	|NN const U8 *s
 Apd	|bool	|is_utf8_string	|NN const U8 *s|STRLEN len
 Apdmb	|bool	|is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
diff --git a/embed.h b/embed.h
index 5968fb6..ba78d60 100644
--- a/embed.h
+++ b/embed.h
@@ -363,6 +363,7 @@
 #define is_uni_print_lc		Perl_is_uni_print_lc
 #define is_uni_punct_lc		Perl_is_uni_punct_lc
 #define is_uni_xdigit_lc	Perl_is_uni_xdigit_lc
+#define is_ascii_string		Perl_is_ascii_string
 #define is_utf8_char		Perl_is_utf8_char
 #define is_utf8_string		Perl_is_utf8_string
 #define is_utf8_string_loclen	Perl_is_utf8_string_loclen
@@ -2701,6 +2702,7 @@
 #define is_uni_print_lc(a)	Perl_is_uni_print_lc(aTHX_ a)
 #define is_uni_punct_lc(a)	Perl_is_uni_punct_lc(aTHX_ a)
 #define is_uni_xdigit_lc(a)	Perl_is_uni_xdigit_lc(aTHX_ a)
+#define is_ascii_string(a,b)	Perl_is_ascii_string(aTHX_ a,b)
 #define is_utf8_char(a)		Perl_is_utf8_char(aTHX_ a)
 #define is_utf8_string(a,b)	Perl_is_utf8_string(aTHX_ a,b)
 #define is_utf8_string_loclen(a,b,c,d)	Perl_is_utf8_string_loclen(aTHX_ a,b,c,d)
diff --git a/global.sym b/global.sym
index 115490a..a5c9f93 100644
--- a/global.sym
+++ b/global.sym
@@ -220,6 +220,7 @@ Perl_is_uni_lower_lc
 Perl_is_uni_print_lc
 Perl_is_uni_punct_lc
 Perl_is_uni_xdigit_lc
+Perl_is_ascii_string
 Perl_is_utf8_char
 Perl_is_utf8_string
 Perl_is_utf8_string_loc
diff --git a/proto.h b/proto.h
index 0dc4aab..8c52f5a 100644
--- a/proto.h
+++ b/proto.h
@@ -1281,6 +1281,11 @@ PERL_CALLCONV bool	Perl_is_uni_xdigit_lc(pTHX_ UV c)
 			__attribute__warn_unused_result__
 			__attribute__pure__;
 
+PERL_CALLCONV bool	Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len)
+			__attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_ASCII_STRING	\
+	assert(s)
+
 PERL_CALLCONV STRLEN	Perl_is_utf8_char(pTHX_ const U8 *s)
 			__attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_IS_UTF8_CHAR	\
diff --git a/t/op/utfhash.t b/t/op/utfhash.t
index 32a1826..a9af502 100644
--- a/t/op/utfhash.t
+++ b/t/op/utfhash.t
@@ -5,7 +5,7 @@ BEGIN {
     @INC = '../lib';
     require './test.pl';
 
-    plan(tests => 97);
+    plan(tests => 99);
 }
 
 use strict;
@@ -196,6 +196,12 @@ __END__
   is($hash{тест}, $hash{'тест'});
   is($hash{тест}, 123);
   is($hash{'тест'}, 123);
+
+  # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812]
+  my %foo = (a => 'b', 'c' => 'd');
+  for my $key (keys %foo) {
+    ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag";
+  }
 }
 __END__
 {
@@ -209,4 +215,10 @@ __END__
   is($hash{½ää½âÀ½äâ½ää}, $hash{'½ää½âÀ½äâ½ää'});
   is($hash{½ää½âÀ½äâ½ää}, 123);
   is($hash{'½ää½âÀ½äâ½ää'}, 123);
+
+  # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812]
+  my %foo = (a => 'b', 'c' => 'd');
+  for my $key (keys %foo) {
+    ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag";
+  }
 }
diff --git a/toke.c b/toke.c
index 24b3c40..35ea218 100644
--- a/toke.c
+++ b/toke.c
@@ -1384,7 +1384,9 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
 {
     dVAR;
     SV * const sv = newSVpvn_utf8(start, len,
-				  UTF && !IN_BYTES
+				  !IN_BYTES
+				  && UTF
+				  && !is_ascii_string((const U8*)start, len)
 				  && is_utf8_string((const U8*)start, len));
     return sv;
 }
diff --git a/utf8.c b/utf8.c
index b5a3809..4bf4705 100644
--- a/utf8.c
+++ b/utf8.c
@@ -51,6 +51,38 @@ Unicode characters as a variable number of bytes, in such a way that
 characters in the ASCII range are unmodified, and a zero byte never appears
 within non-zero characters.
 
+=cut
+*/
+
+/*
+=for apidoc is_ascii_string
+
+Returns true if first C<len> bytes of the given string are ASCII (i.e. none
+of them even raise the question of UTF-8-ness).
+
+See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
+
+=cut
+*/
+
+bool
+Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_ASCII_STRING;
+    PERL_UNUSED_CONTEXT;
+
+    for (; x < send; ++x) {
+	if (!UTF8_IS_INVARIANT(*x))
+	    break;
+    }
+
+    return x == send;
+}
+
+/*
 =for apidoc uvuni_to_utf8_flags
 
 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
@@ -266,6 +298,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s)
     return is_utf8_char_slow(s, len);
 }
 
+
 /*
 =for apidoc is_utf8_string
 
@@ -274,7 +307,7 @@ UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 because a valid ASCII string is a valid UTF-8 string.
 
-See also is_utf8_string_loclen() and is_utf8_string_loc().
+See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 
 =cut
 */