From: Chip Salzenberg Date: Wed, 26 Aug 2009 21:33:15 +0000 (-0700) Subject: In C'b'>, do not set utf8 flag on 'a' [perl #68812] X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=eaf7a4d2ee7805b14e26e634fba0893913924a6c;p=p5sagit%2Fp5-mst-13.2.git In C'b'>, do not set utf8 flag on 'a' [perl #68812] --- diff --git a/embed.fnc b/embed.fnc index 3f9ddcd..3e829b6 100644 --- a/embed.fnc +++ b/embed.fnc @@ -481,6 +481,7 @@ ApPR |bool |is_uni_lower_lc|UV c ApPR |bool |is_uni_print_lc|UV c ApPR |bool |is_uni_punct_lc|UV c ApPR |bool |is_uni_xdigit_lc|UV c +Apd |bool |is_ascii_string|NN const U8 *s|STRLEN len Apd |STRLEN |is_utf8_char |NN const U8 *s Apd |bool |is_utf8_string |NN const U8 *s|STRLEN len Apdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p diff --git a/embed.h b/embed.h index 5968fb6..ba78d60 100644 --- a/embed.h +++ b/embed.h @@ -363,6 +363,7 @@ #define is_uni_print_lc Perl_is_uni_print_lc #define is_uni_punct_lc Perl_is_uni_punct_lc #define is_uni_xdigit_lc Perl_is_uni_xdigit_lc +#define is_ascii_string Perl_is_ascii_string #define is_utf8_char Perl_is_utf8_char #define is_utf8_string Perl_is_utf8_string #define is_utf8_string_loclen Perl_is_utf8_string_loclen @@ -2701,6 +2702,7 @@ #define is_uni_print_lc(a) Perl_is_uni_print_lc(aTHX_ a) #define is_uni_punct_lc(a) Perl_is_uni_punct_lc(aTHX_ a) #define is_uni_xdigit_lc(a) Perl_is_uni_xdigit_lc(aTHX_ a) +#define is_ascii_string(a,b) Perl_is_ascii_string(aTHX_ a,b) #define is_utf8_char(a) Perl_is_utf8_char(aTHX_ a) #define is_utf8_string(a,b) Perl_is_utf8_string(aTHX_ a,b) #define is_utf8_string_loclen(a,b,c,d) Perl_is_utf8_string_loclen(aTHX_ a,b,c,d) diff --git a/global.sym b/global.sym index 115490a..a5c9f93 100644 --- a/global.sym +++ b/global.sym @@ -220,6 +220,7 @@ Perl_is_uni_lower_lc Perl_is_uni_print_lc Perl_is_uni_punct_lc Perl_is_uni_xdigit_lc +Perl_is_ascii_string Perl_is_utf8_char Perl_is_utf8_string Perl_is_utf8_string_loc diff --git a/proto.h b/proto.h index 0dc4aab..8c52f5a 100644 --- a/proto.h +++ b/proto.h @@ -1281,6 +1281,11 @@ PERL_CALLCONV bool Perl_is_uni_xdigit_lc(pTHX_ UV c) __attribute__warn_unused_result__ __attribute__pure__; +PERL_CALLCONV bool Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len) + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_ASCII_STRING \ + assert(s) + PERL_CALLCONV STRLEN Perl_is_utf8_char(pTHX_ const U8 *s) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_IS_UTF8_CHAR \ diff --git a/t/op/utfhash.t b/t/op/utfhash.t index 32a1826..a9af502 100644 --- a/t/op/utfhash.t +++ b/t/op/utfhash.t @@ -5,7 +5,7 @@ BEGIN { @INC = '../lib'; require './test.pl'; - plan(tests => 97); + plan(tests => 99); } use strict; @@ -196,6 +196,12 @@ __END__ is($hash{тест}, $hash{'тест'}); is($hash{тест}, 123); is($hash{'тест'}, 123); + + # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812] + my %foo = (a => 'b', 'c' => 'd'); + for my $key (keys %foo) { + ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag"; + } } __END__ { @@ -209,4 +215,10 @@ __END__ is($hash{½ää½âÀ½äâ½ää}, $hash{'½ää½âÀ½äâ½ää'}); is($hash{½ää½âÀ½äâ½ää}, 123); is($hash{'½ää½âÀ½äâ½ää'}, 123); + + # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812] + my %foo = (a => 'b', 'c' => 'd'); + for my $key (keys %foo) { + ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag"; + } } diff --git a/toke.c b/toke.c index 24b3c40..35ea218 100644 --- a/toke.c +++ b/toke.c @@ -1384,7 +1384,9 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len) { dVAR; SV * const sv = newSVpvn_utf8(start, len, - UTF && !IN_BYTES + !IN_BYTES + && UTF + && !is_ascii_string((const U8*)start, len) && is_utf8_string((const U8*)start, len)); return sv; } diff --git a/utf8.c b/utf8.c index b5a3809..4bf4705 100644 --- a/utf8.c +++ b/utf8.c @@ -51,6 +51,38 @@ Unicode characters as a variable number of bytes, in such a way that characters in the ASCII range are unmodified, and a zero byte never appears within non-zero characters. +=cut +*/ + +/* +=for apidoc is_ascii_string + +Returns true if first C bytes of the given string are ASCII (i.e. none +of them even raise the question of UTF-8-ness). + +See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). + +=cut +*/ + +bool +Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len) +{ + const U8* const send = s + (len ? len : strlen((const char *)s)); + const U8* x = s; + + PERL_ARGS_ASSERT_IS_ASCII_STRING; + PERL_UNUSED_CONTEXT; + + for (; x < send; ++x) { + if (!UTF8_IS_INVARIANT(*x)) + break; + } + + return x == send; +} + +/* =for apidoc uvuni_to_utf8_flags Adds the UTF-8 representation of the Unicode codepoint C to the end @@ -266,6 +298,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s) return is_utf8_char_slow(s, len); } + /* =for apidoc is_utf8_string @@ -274,7 +307,7 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. -See also is_utf8_string_loclen() and is_utf8_string_loc(). +See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). =cut */