From: Jarkko Hietaniemi Date: Sun, 5 Jun 2005 18:06:12 +0000 (+0300) Subject: one more round of is_utf8_foo tuneup X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=768c67ee6ae82fc2dd99a9dd06708f0a0cc097cb;p=p5sagit%2Fp5-mst-13.2.git one more round of is_utf8_foo tuneup Message-ID: <42A314E4.8060608@gmail.com> p4raw-id: //depot/perl@24730 --- diff --git a/embed.fnc b/embed.fnc index 9a7f4e7..94515c4 100644 --- a/embed.fnc +++ b/embed.fnc @@ -346,7 +346,8 @@ ApPR |bool |is_uni_punct_lc|UV c ApPR |bool |is_uni_xdigit_lc|UV c Apd |STRLEN |is_utf8_char |NN const U8 *p Apd |bool |is_utf8_string |NN const U8 *s|STRLEN len -Apd |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NN const U8 **p +Apd |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|const U8 **p +Apd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|const U8 **ep|STRLEN *el ApR |bool |is_utf8_alnum |NN const U8 *p ApR |bool |is_utf8_alnumc |NN const U8 *p ApR |bool |is_utf8_idfirst|NN const U8 *p diff --git a/embed.h b/embed.h index 2344eca..6613bbb 100644 --- a/embed.h +++ b/embed.h @@ -345,6 +345,7 @@ #define is_utf8_char Perl_is_utf8_char #define is_utf8_string Perl_is_utf8_string #define is_utf8_string_loc Perl_is_utf8_string_loc +#define is_utf8_string_loclen Perl_is_utf8_string_loclen #define is_utf8_alnum Perl_is_utf8_alnum #define is_utf8_alnumc Perl_is_utf8_alnumc #define is_utf8_idfirst Perl_is_utf8_idfirst @@ -2344,6 +2345,7 @@ #define is_utf8_char(a) Perl_is_utf8_char(aTHX_ a) #define is_utf8_string(a,b) Perl_is_utf8_string(aTHX_ a,b) #define is_utf8_string_loc(a,b,c) Perl_is_utf8_string_loc(aTHX_ a,b,c) +#define is_utf8_string_loclen(a,b,c,d) Perl_is_utf8_string_loclen(aTHX_ a,b,c,d) #define is_utf8_alnum(a) Perl_is_utf8_alnum(aTHX_ a) #define is_utf8_alnumc(a) Perl_is_utf8_alnumc(aTHX_ a) #define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a) diff --git a/global.sym b/global.sym index b4c745c..208bd2d 100644 --- a/global.sym +++ b/global.sym @@ -207,6 +207,7 @@ Perl_is_uni_xdigit_lc Perl_is_utf8_char Perl_is_utf8_string Perl_is_utf8_string_loc +Perl_is_utf8_string_loclen Perl_is_utf8_alnum Perl_is_utf8_alnumc Perl_is_utf8_idfirst diff --git a/mg.c b/mg.c index 05b9fc6..7a5c1cf 100644 --- a/mg.c +++ b/mg.c @@ -495,10 +495,12 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg) if (i > 0 && RX_MATCH_UTF8(rx)) { char *s = rx->subbeg + s1; char *send = rx->subbeg + t1; + const U8 *ep; + STRLEN el; i = t1 - s1; - if (is_utf8_string((U8*)s, i)) - i = Perl_utf8_length(aTHX_ (U8*)s, (U8*)send); + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; } if (i < 0) Perl_croak(aTHX_ "panic: magic_len: %"IVdf, (IV)i); diff --git a/pod/perlapi.pod b/pod/perlapi.pod index aadf23a..86d97ff 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -5325,6 +5325,8 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. +See also is_utf8_string_loclen() and is_utf8_string_loc(). + bool is_utf8_string(const U8 *s, STRLEN len) =for hackers @@ -5332,14 +5334,31 @@ Found in file utf8.c =item is_utf8_string_loc -Like is_ut8_string but store the location of the failure in -the last argument. +Like is_ut8_string() but stores the location of the failure (in the +case of "utf8ness failure") or the location s+len (in the case of +"utf8ness success") in the C. + +See also is_utf8_string_loclen() and is_utf8_string(). bool is_utf8_string_loc(const U8 *s, STRLEN len, const U8 **p) =for hackers Found in file utf8.c +=item is_utf8_string_loclen + +Like is_ut8_string() but stores the location of the failure (in the +case of "utf8ness failure") or the location s+len (in the case of +"utf8ness success") in the C, and the number of UTF-8 +encoded characters in the C. + +See also is_utf8_string_loc() and is_utf8_string(). + + bool is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) + +=for hackers +Found in file utf8.c + =item pv_uni_display Build to the scalar dsv a displayable version of the string spv, diff --git a/proto.h b/proto.h index 1b9c6b6..a3be5fb 100644 --- a/proto.h +++ b/proto.h @@ -620,8 +620,10 @@ PERL_CALLCONV bool Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) __attribute__nonnull__(pTHX_1); PERL_CALLCONV bool Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p) - __attribute__nonnull__(pTHX_1) - __attribute__nonnull__(pTHX_3); + __attribute__nonnull__(pTHX_1); + +PERL_CALLCONV bool Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) + __attribute__nonnull__(pTHX_1); PERL_CALLCONV bool Perl_is_utf8_alnum(pTHX_ const U8 *p) __attribute__warn_unused_result__ diff --git a/utf8.c b/utf8.c index 5a80bfd..f8128d8 100644 --- a/utf8.c +++ b/utf8.c @@ -240,10 +240,9 @@ will be returned if it is valid, otherwise 0. STRLEN Perl_is_utf8_char(pTHX_ const U8 *s) { - STRLEN len; + STRLEN len = UTF8SKIP(s); #ifdef IS_UTF8_CHAR - len = UTF8SKIP(s); - if (len <= 4) + if (IS_UTF8_CHAR_FAST(len)) return IS_UTF8_CHAR(s, len) ? len : 0; #endif /* #ifdef IS_UTF8_CHAR */ return is_utf8_char_slow(s, len); @@ -257,6 +256,8 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. +See also is_utf8_string_loclen() and is_utf8_string_loc(). + =cut */ @@ -276,26 +277,26 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) if (UTF8_IS_INVARIANT(*x)) c = 1; else if (!UTF8_IS_START(*x)) - return FALSE; + goto out; else { /* ... and call is_utf8_char() only if really needed. */ #ifdef IS_UTF8_CHAR c = UTF8SKIP(x); - if (c <= 4) { - if (!IS_UTF8_CHAR(x, c)) - return FALSE; - } else { - if (!is_utf8_char_slow(x, c)) - return FALSE; - } + if (IS_UTF8_CHAR_FAST(c)) { + if (!IS_UTF8_CHAR(x, c)) + goto out; + } else if (!is_utf8_char_slow(x, c)) + goto out; #else c = is_utf8_char(x); #endif /* #ifdef IS_UTF8_CHAR */ if (!c) - return FALSE; + goto out; } x += c; } + + out: if (x != send) return FALSE; @@ -303,16 +304,20 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) } /* -=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p +=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el -Like is_ut8_string but store the location of the failure in -the last argument. +Like is_ut8_string() but stores the location of the failure (in the +case of "utf8ness failure") or the location s+len (in the case of +"utf8ness success") in the C, and the number of UTF-8 +encoded characters in the C. + +See also is_utf8_string_loc() and is_utf8_string(). =cut */ bool -Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p) +Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { const U8* x = s; const U8* send; @@ -321,37 +326,63 @@ Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p) if (!len && s) len = strlen((const char *)s); send = s + len; + if (el) + *el = 0; while (x < send) { /* Inline the easy bits of is_utf8_char() here for speed... */ if (UTF8_IS_INVARIANT(*x)) - c = 1; - else if (!UTF8_IS_START(*x)) { - if (p) - *p = x; - return FALSE; - } + c = 1; + else if (!UTF8_IS_START(*x)) + goto out; else { - /* ... and call is_utf8_char() only if really needed. */ - c = is_utf8_char(x); - if (!c) { - if (p) - *p = x; - return FALSE; - } + /* ... and call is_utf8_char() only if really needed. */ +#ifdef IS_UTF8_CHAR + c = UTF8SKIP(x); + if (IS_UTF8_CHAR_FAST(c)) { + if (!IS_UTF8_CHAR(x, c)) + c = 0; + } else + c = is_utf8_char_slow(x, c); +#else + c = is_utf8_char(x); +#endif /* #ifdef IS_UTF8_CHAR */ + if (!c) + goto out; } - x += c; + x += c; + if (el) + (*el)++; } - if (x != send) { - if (p) - *p = x; + + out: + if (ep) + *ep = x; + if (x != send) return FALSE; - } return TRUE; } /* +=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el + +Like is_ut8_string() but stores the location of the failure (in the +case of "utf8ness failure") or the location s+len (in the case of +"utf8ness success") in the C. + +See also is_utf8_string_loclen() and is_utf8_string(). + +=cut +*/ + +bool +Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep) +{ + return is_utf8_string_loclen(s, len, ep, 0); +} + +/* =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags Bottom level UTF-8 decode routine. diff --git a/utf8.h b/utf8.h index fb44c85..c8bcb36 100644 --- a/utf8.h +++ b/utf8.h @@ -327,3 +327,5 @@ encoded character. (n) == 3 ? IS_UTF8_CHAR_3(p) : \ (n) == 4 ? IS_UTF8_CHAR_4(p) : 0) +#define IS_UTF8_CHAR_FAST(n) ((n) <= 4) +