From: Jarkko Hietaniemi Date: Sat, 4 Jun 2005 13:40:15 +0000 (+0300) Subject: further speeding up of is_utf8_string() X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=646ca15d5cf720a2af3a258d7092a967f3a80550;p=p5sagit%2Fp5-mst-13.2.git further speeding up of is_utf8_string() Message-ID: <42A1850F.4040109@gmail.com> p4raw-id: //depot/perl@24706 --- diff --git a/MANIFEST b/MANIFEST index a3597dd..9fb3834 100644 --- a/MANIFEST +++ b/MANIFEST @@ -2756,6 +2756,7 @@ t/op/caller.t See if caller() works t/op/chars.t See if character escapes work t/op/chdir.t See if chdir works t/op/chop.t See if chop works +t/op/chr.t See if chr works t/op/closure.t See if closures work t/op/cmp.t See if the various string and numeric compare work t/op/concat.t See if string concatenation works diff --git a/embed.fnc b/embed.fnc index c3ff3d3..9a7f4e7 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1344,6 +1344,10 @@ s |SV* |mess_alloc sn |NV|mulexp10 |NV value|I32 exponent #endif +#if defined(PERL_IN_UTF8_C) || defined(PERL_DECL_PROT) +s |STRLEN |is_utf8_char_slow|NN const U8 *s|const STRLEN len +#endif + START_EXTERN_C Apd |void |sv_setsv_flags |NN SV* dsv|SV* ssv|I32 flags diff --git a/embed.h b/embed.h index aa87797..2344eca 100644 --- a/embed.h +++ b/embed.h @@ -1417,6 +1417,11 @@ #define mulexp10 S_mulexp10 #endif #endif +#if defined(PERL_IN_UTF8_C) || defined(PERL_DECL_PROT) +#ifdef PERL_CORE +#define is_utf8_char_slow S_is_utf8_char_slow +#endif +#endif #define sv_setsv_flags Perl_sv_setsv_flags #define sv_catpvn_flags Perl_sv_catpvn_flags #define sv_catsv_flags Perl_sv_catsv_flags @@ -3399,6 +3404,11 @@ #define mulexp10 S_mulexp10 #endif #endif +#if defined(PERL_IN_UTF8_C) || defined(PERL_DECL_PROT) +#ifdef PERL_CORE +#define is_utf8_char_slow(a,b) S_is_utf8_char_slow(aTHX_ a,b) +#endif +#endif #define sv_setsv_flags(a,b,c) Perl_sv_setsv_flags(aTHX_ a,b,c) #define sv_catpvn_flags(a,b,c,d) Perl_sv_catpvn_flags(aTHX_ a,b,c,d) #define sv_catsv_flags(a,b,c) Perl_sv_catsv_flags(aTHX_ a,b,c) diff --git a/global.sym b/global.sym index 8a1e9ca..b4c745c 100644 --- a/global.sym +++ b/global.sym @@ -315,7 +315,7 @@ Perl_newSVuv Perl_newSVnv Perl_newSVpv Perl_newSVpvn -Perl_newSVpv_hek +Perl_newSVhek Perl_newSVpvn_share Perl_newSVpvf Perl_vnewSVpvf diff --git a/proto.h b/proto.h index 04fc3ec..1b9c6b6 100644 --- a/proto.h +++ b/proto.h @@ -2492,6 +2492,12 @@ STATIC SV* S_mess_alloc(pTHX); STATIC NV S_mulexp10(NV value, I32 exponent); #endif +#if defined(PERL_IN_UTF8_C) || defined(PERL_DECL_PROT) +STATIC STRLEN S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len) + __attribute__nonnull__(pTHX_1); + +#endif + START_EXTERN_C PERL_CALLCONV void Perl_sv_setsv_flags(pTHX_ SV* dsv, SV* ssv, I32 flags) diff --git a/t/op/chr.t b/t/op/chr.t new file mode 100644 index 0000000..94450ec --- /dev/null +++ b/t/op/chr.t @@ -0,0 +1,50 @@ +#!./perl + +BEGIN { + chdir 't' if -d 't'; + @INC = qw(. ../lib); # ../lib needed for test.deparse + require "test.pl"; +} + +plan tests => 26; + +# Note that t/op/ord.t already tests for chr() <-> ord() rountripping. + +# Don't assume ASCII. + +is(chr(ord("A")), "A"); + +is(chr( 0), "\x00"); +is(chr(127), "\x7F"); +is(chr(128), "\x80"); +is(chr(255), "\xFF"); + +# is(chr(-1), undef); # Shouldn't it be? + +# Check UTF-8. + +sub hexes { join(" ",map{sprintf"%02x",$_}unpack("C*",chr($_[0]))) } + +# The following code points are some interesting steps in UTF-8. +is(hexes( 0x100), "c4 80"); +is(hexes( 0x7FF), "df bf"); +is(hexes( 0x800), "e0 a0 80"); +is(hexes( 0xFFF), "e0 bf bf"); +is(hexes( 0x1000), "e1 80 80"); +is(hexes( 0xCFFF), "ec bf bf"); +is(hexes( 0xD000), "ed 80 80"); +is(hexes( 0xD7FF), "ed 9f bf"); +is(hexes( 0xD800), "ed a0 80"); # not strict utf-8 (surrogate area begin) +is(hexes( 0xDFFF), "ed bf bf"); # not strict utf-8 (surrogate area end) +is(hexes( 0xE000), "ee 80 80"); +is(hexes( 0xFFFF), "ef bf bf"); +is(hexes( 0x10000), "f0 90 80 80"); +is(hexes( 0x3FFFF), "f0 bf bf bf"); +is(hexes( 0x40000), "f1 80 80 80"); +is(hexes( 0xFFFFF), "f3 bf bf bf"); +is(hexes(0x100000), "f4 80 80 80"); +is(hexes(0x10FFFF), "f4 8f bf bf"); # Unicode (4.1) last code point +is(hexes(0x110000), "f4 90 80 80"); +is(hexes(0x1FFFFF), "f7 bf bf bf"); # last four byte encoding +is(hexes(0x200000), "f8 88 80 80 80"); + diff --git a/utf8.c b/utf8.c index ecc77c0..53983cb 100644 --- a/utf8.c +++ b/utf8.c @@ -173,6 +173,60 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0); } +/* + +Tests if some arbitrary number of bytes begins in a valid UTF-8 +character. Note that an INVARIANT (i.e. ASCII) character is a valid +UTF-8 character. The actual number of bytes in the UTF-8 character +will be returned if it is valid, otherwise 0. + +This is the "slow" version as opposed to the "fast" version which is +the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed +difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four +or less you should use the IS_UTF8_CHAR(), for lengths of five or more +you should use the _slow(). In practice this means that the _slow() +will be used very rarely, since the maximum Unicode code point (as of +Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only +the "Perl extended UTF-8" (the infamous 'v-strings') will encode into +five bytes or more. + +=cut */ +STRLEN +S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len) +{ + U8 u = *s; + STRLEN slen; + UV uv, ouv; + + if (UTF8_IS_INVARIANT(u)) + return 1; + + if (!UTF8_IS_START(u)) + return 0; + + if (len < 2 || !UTF8_IS_CONTINUATION(s[1])) + return 0; + + slen = len - 1; + s++; + u &= UTF_START_MASK(len); + uv = u; + ouv = uv; + while (slen--) { + if (!UTF8_IS_CONTINUATION(*s)) + return 0; + uv = UTF8_ACCUMULATE(uv, *s); + if (uv < ouv) + return 0; + ouv = uv; + s++; + } + + if ((STRLEN)UNISKIP(uv) < len) + return 0; + + return len; +} /* =for apidoc A|STRLEN|is_utf8_char|const U8 *s @@ -192,42 +246,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s) if (len <= 4) return IS_UTF8_CHAR(s, len) ? len : 0; #endif /* #ifdef IS_UTF8_CHAR */ - { - U8 u = *s; - STRLEN slen; - UV uv, ouv; - - if (UTF8_IS_INVARIANT(u)) - return 1; - - if (!UTF8_IS_START(u)) - return 0; - - len = UTF8SKIP(s); - - if (len < 2 || !UTF8_IS_CONTINUATION(s[1])) - return 0; - - slen = len - 1; - s++; - u &= UTF_START_MASK(len); - uv = u; - ouv = uv; - while (slen--) { - if (!UTF8_IS_CONTINUATION(*s)) - return 0; - uv = UTF8_ACCUMULATE(uv, *s); - if (uv < ouv) - return 0; - ouv = uv; - s++; - } - - if ((STRLEN)UNISKIP(uv) < len) - return 0; - - return len; - } + return S_is_utf8_char_slow(s, len); } /* @@ -260,7 +279,18 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) return FALSE; else { /* ... and call is_utf8_char() only if really needed. */ - c = is_utf8_char(x); +#ifdef IS_UTF8_CHAR + c = UTF8SKIP(x); + if (c <= 4) { + if (!IS_UTF8_CHAR(x, c)) + return FALSE; + } else { + if (!S_is_utf8_char_slow(x, c)) + return FALSE; + } +#else + c = is_utf8_char(x); +#endif /* #ifdef IS_UTF8_CHAR */ if (!c) return FALSE; }