From: Jarkko Hietaniemi Date: Wed, 10 Sep 2003 06:54:02 +0000 (+0000) Subject: A new UTF-8 API, Perl_is_utf8_string_loc(), a variant X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=81cd54e3d8dc0f62b7c4bf5206036c9493ef5300;p=p5sagit%2Fp5-mst-13.2.git A new UTF-8 API, Perl_is_utf8_string_loc(), a variant of Perl_utf8_is_string(). p4raw-id: //depot/perl@21152 --- diff --git a/embed.fnc b/embed.fnc index 26d3bd5..ca50143 100644 --- a/embed.fnc +++ b/embed.fnc @@ -337,6 +337,7 @@ Ap |bool |is_uni_punct_lc|UV c Ap |bool |is_uni_xdigit_lc|UV c Apd |STRLEN |is_utf8_char |U8 *p Apd |bool |is_utf8_string |U8 *s|STRLEN len +Apd |bool |is_utf8_string_loc|U8 *s|STRLEN len|U8 **p Ap |bool |is_utf8_alnum |U8 *p Ap |bool |is_utf8_alnumc |U8 *p Ap |bool |is_utf8_idfirst|U8 *p diff --git a/embed.h b/embed.h index 873cb04..5d0e52d 100644 --- a/embed.h +++ b/embed.h @@ -409,6 +409,7 @@ #define is_uni_xdigit_lc Perl_is_uni_xdigit_lc #define is_utf8_char Perl_is_utf8_char #define is_utf8_string Perl_is_utf8_string +#define is_utf8_string_loc Perl_is_utf8_string_loc #define is_utf8_alnum Perl_is_utf8_alnum #define is_utf8_alnumc Perl_is_utf8_alnumc #define is_utf8_idfirst Perl_is_utf8_idfirst @@ -2899,6 +2900,7 @@ #define is_uni_xdigit_lc(a) Perl_is_uni_xdigit_lc(aTHX_ a) #define is_utf8_char(a) Perl_is_utf8_char(aTHX_ a) #define is_utf8_string(a,b) Perl_is_utf8_string(aTHX_ a,b) +#define is_utf8_string_loc(a,b,c) Perl_is_utf8_string_loc(aTHX_ a,b,c) #define is_utf8_alnum(a) Perl_is_utf8_alnum(aTHX_ a) #define is_utf8_alnumc(a) Perl_is_utf8_alnumc(aTHX_ a) #define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a) diff --git a/global.sym b/global.sym index 3496198..83bec56 100644 --- a/global.sym +++ b/global.sym @@ -204,6 +204,7 @@ Perl_is_uni_punct_lc Perl_is_uni_xdigit_lc Perl_is_utf8_char Perl_is_utf8_string +Perl_is_utf8_string_loc Perl_is_utf8_alnum Perl_is_utf8_alnumc Perl_is_utf8_idfirst diff --git a/proto.h b/proto.h index 97ae843..b6a584c 100644 --- a/proto.h +++ b/proto.h @@ -316,6 +316,7 @@ PERL_CALLCONV bool Perl_is_uni_punct_lc(pTHX_ UV c); PERL_CALLCONV bool Perl_is_uni_xdigit_lc(pTHX_ UV c); PERL_CALLCONV STRLEN Perl_is_utf8_char(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len); +PERL_CALLCONV bool Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p); PERL_CALLCONV bool Perl_is_utf8_alnum(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_alnumc(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ U8 *p); diff --git a/utf8.c b/utf8.c index 21d0f08..ad8758e 100644 --- a/utf8.c +++ b/utf8.c @@ -257,6 +257,55 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) } /* +=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p + +Like is_ut8_string but store the location of the failure in +the last argument. + +=cut +*/ + +bool +Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p) +{ + U8* x = s; + U8* send; + STRLEN c; + + if (!len) + len = strlen((char *)s); + send = s + len; + + while (x < send) { + /* Inline the easy bits of is_utf8_char() here for speed... */ + if (UTF8_IS_INVARIANT(*x)) + c = 1; + else if (!UTF8_IS_START(*x)) { + if (p) + *p = x; + return FALSE; + } + else { + /* ... and call is_utf8_char() only if really needed. */ + c = is_utf8_char(x); + if (!c) { + if (p) + *p = x; + return FALSE; + } + } + x += c; + } + if (x != send) { + if (p) + *p = x; + return FALSE; + } + + return TRUE; +} + +/* =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags Bottom level UTF-8 decode routine.