In C<use utf8; a=>'b'>, do not set utf8 flag on 'a' [perl #68812]

diff --git a/embed.fnc b/embed.fnc

index 3f9ddcd..3e829b6 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -481,6 +481,7 @@ ApPR        |bool   |is_uni_lower_lc|UV c
 ApPR   |bool   |is_uni_print_lc|UV c
 ApPR   |bool   |is_uni_punct_lc|UV c
 ApPR   |bool   |is_uni_xdigit_lc|UV c
+Apd    |bool   |is_ascii_string|NN const U8 *s|STRLEN len
 Apd    |STRLEN |is_utf8_char   |NN const U8 *s
 Apd    |bool   |is_utf8_string |NN const U8 *s|STRLEN len
 Apdmb  |bool   |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
diff --git a/embed.h b/embed.h

index 5968fb6..ba78d60 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -363,6 +363,7 @@
 #define is_uni_print_lc                Perl_is_uni_print_lc
 #define is_uni_punct_lc                Perl_is_uni_punct_lc
 #define is_uni_xdigit_lc       Perl_is_uni_xdigit_lc
+#define is_ascii_string                Perl_is_ascii_string
 #define is_utf8_char           Perl_is_utf8_char
 #define is_utf8_string         Perl_is_utf8_string
 #define is_utf8_string_loclen  Perl_is_utf8_string_loclen
@@ -2701,6 +2702,7 @@
 #define is_uni_print_lc(a)     Perl_is_uni_print_lc(aTHX_ a)
 #define is_uni_punct_lc(a)     Perl_is_uni_punct_lc(aTHX_ a)
 #define is_uni_xdigit_lc(a)    Perl_is_uni_xdigit_lc(aTHX_ a)
+#define is_ascii_string(a,b)   Perl_is_ascii_string(aTHX_ a,b)
 #define is_utf8_char(a)                Perl_is_utf8_char(aTHX_ a)
 #define is_utf8_string(a,b)    Perl_is_utf8_string(aTHX_ a,b)
 #define is_utf8_string_loclen(a,b,c,d) Perl_is_utf8_string_loclen(aTHX_ a,b,c,d)
diff --git a/global.sym b/global.sym

index 115490a..a5c9f93 100644 (file)
--- a/global.sym
+++ b/global.sym
@@ -220,6 +220,7 @@ Perl_is_uni_lower_lc
 Perl_is_uni_print_lc
 Perl_is_uni_punct_lc
 Perl_is_uni_xdigit_lc
+Perl_is_ascii_string
 Perl_is_utf8_char
 Perl_is_utf8_string
 Perl_is_utf8_string_loc
diff --git a/proto.h b/proto.h

index 0dc4aab..8c52f5a 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1281,6 +1281,11 @@ PERL_CALLCONV bool       Perl_is_uni_xdigit_lc(pTHX_ UV c)
                        __attribute__warn_unused_result__
                        __attribute__pure__;
 
+PERL_CALLCONV bool     Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len)
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_ASCII_STRING       \
+       assert(s)
+
 PERL_CALLCONV STRLEN   Perl_is_utf8_char(pTHX_ const U8 *s)
                        __attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_IS_UTF8_CHAR  \
diff --git a/t/op/utfhash.t b/t/op/utfhash.t

index 32a1826..a9af502 100644 (file)
--- a/t/op/utfhash.t
+++ b/t/op/utfhash.t
@@ -5,7 +5,7 @@ BEGIN {
     @INC = '../lib';
     require './test.pl';
 
-    plan(tests => 97);
+    plan(tests => 99);
 }
 
 use strict;
@@ -196,6 +196,12 @@ __END__
   is($hash{тест}, $hash{'тест'});
   is($hash{тест}, 123);
   is($hash{'тест'}, 123);
+
+  # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812]
+  my %foo = (a => 'b', 'c' => 'd');
+  for my $key (keys %foo) {
+    ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag";
+  }
 }
 __END__
 {
@@ -209,4 +215,10 @@ __END__
   is($hash{½ää½âÀ½äâ½ää}, $hash{'½ää½âÀ½äâ½ää'});
   is($hash{½ää½âÀ½äâ½ää}, 123);
   is($hash{'½ää½âÀ½äâ½ää'}, 123);
+
+  # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812]
+  my %foo = (a => 'b', 'c' => 'd');
+  for my $key (keys %foo) {
+    ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag";
+  }
 }
diff --git a/toke.c b/toke.c

index 24b3c40..35ea218 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1384,7 +1384,9 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
 {
     dVAR;
     SV * const sv = newSVpvn_utf8(start, len,
-                                 UTF && !IN_BYTES
+                                 !IN_BYTES
+                                 && UTF
+                                 && !is_ascii_string((const U8*)start, len)
                                  && is_utf8_string((const U8*)start, len));
     return sv;
 }
diff --git a/utf8.c b/utf8.c

index b5a3809..4bf4705 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -51,6 +51,38 @@ Unicode characters as a variable number of bytes, in such a way that
 characters in the ASCII range are unmodified, and a zero byte never appears
 within non-zero characters.
 
+=cut
+*/
+
+/*
+=for apidoc is_ascii_string
+
+Returns true if first C<len> bytes of the given string are ASCII (i.e. none
+of them even raise the question of UTF-8-ness).
+
+See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
+
+=cut
+*/
+
+bool
+Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_ASCII_STRING;
+    PERL_UNUSED_CONTEXT;
+
+    for (; x < send; ++x) {
+       if (!UTF8_IS_INVARIANT(*x))
+           break;
+    }
+
+    return x == send;
+}
+
+/*
 =for apidoc uvuni_to_utf8_flags
 
 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
@@ -266,6 +298,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s)
     return is_utf8_char_slow(s, len);
 }
 
+
 /*
 =for apidoc is_utf8_string
 
@@ -274,7 +307,7 @@ UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 because a valid ASCII string is a valid UTF-8 string.
 
-See also is_utf8_string_loclen() and is_utf8_string_loc().
+See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 
 =cut
 */
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
global.sym		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
t/op/utfhash.t		patch \| blob \| blame \| history
toke.c		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history