A new UTF-8 API, Perl_is_utf8_string_loc(), a variant

diff --git a/embed.fnc b/embed.fnc

index 26d3bd5..ca50143 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -337,6 +337,7 @@ Ap  |bool   |is_uni_punct_lc|UV c
 Ap     |bool   |is_uni_xdigit_lc|UV c
 Apd    |STRLEN |is_utf8_char   |U8 *p
 Apd    |bool   |is_utf8_string |U8 *s|STRLEN len
+Apd    |bool   |is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
 Ap     |bool   |is_utf8_alnum  |U8 *p
 Ap     |bool   |is_utf8_alnumc |U8 *p
 Ap     |bool   |is_utf8_idfirst|U8 *p
diff --git a/embed.h b/embed.h

index 873cb04..5d0e52d 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -409,6 +409,7 @@
 #define is_uni_xdigit_lc       Perl_is_uni_xdigit_lc
 #define is_utf8_char           Perl_is_utf8_char
 #define is_utf8_string         Perl_is_utf8_string
+#define is_utf8_string_loc     Perl_is_utf8_string_loc
 #define is_utf8_alnum          Perl_is_utf8_alnum
 #define is_utf8_alnumc         Perl_is_utf8_alnumc
 #define is_utf8_idfirst                Perl_is_utf8_idfirst
@@ -2899,6 +2900,7 @@
 #define is_uni_xdigit_lc(a)    Perl_is_uni_xdigit_lc(aTHX_ a)
 #define is_utf8_char(a)                Perl_is_utf8_char(aTHX_ a)
 #define is_utf8_string(a,b)    Perl_is_utf8_string(aTHX_ a,b)
+#define is_utf8_string_loc(a,b,c)      Perl_is_utf8_string_loc(aTHX_ a,b,c)
 #define is_utf8_alnum(a)       Perl_is_utf8_alnum(aTHX_ a)
 #define is_utf8_alnumc(a)      Perl_is_utf8_alnumc(aTHX_ a)
 #define is_utf8_idfirst(a)     Perl_is_utf8_idfirst(aTHX_ a)
diff --git a/global.sym b/global.sym

index 3496198..83bec56 100644 (file)
--- a/global.sym
+++ b/global.sym
@@ -204,6 +204,7 @@ Perl_is_uni_punct_lc
 Perl_is_uni_xdigit_lc
 Perl_is_utf8_char
 Perl_is_utf8_string
+Perl_is_utf8_string_loc
 Perl_is_utf8_alnum
 Perl_is_utf8_alnumc
 Perl_is_utf8_idfirst
diff --git a/proto.h b/proto.h

index 97ae843..b6a584c 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -316,6 +316,7 @@ PERL_CALLCONV bool  Perl_is_uni_punct_lc(pTHX_ UV c);
 PERL_CALLCONV bool     Perl_is_uni_xdigit_lc(pTHX_ UV c);
 PERL_CALLCONV STRLEN   Perl_is_utf8_char(pTHX_ U8 *p);
 PERL_CALLCONV bool     Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len);
+PERL_CALLCONV bool     Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p);
 PERL_CALLCONV bool     Perl_is_utf8_alnum(pTHX_ U8 *p);
 PERL_CALLCONV bool     Perl_is_utf8_alnumc(pTHX_ U8 *p);
 PERL_CALLCONV bool     Perl_is_utf8_idfirst(pTHX_ U8 *p);
diff --git a/utf8.c b/utf8.c

index 21d0f08..ad8758e 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -257,6 +257,55 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 }
 
 /*
+=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
+
+Like is_ut8_string but store the location of the failure in
+the last argument.
+
+=cut
+*/
+
+bool
+Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
+{
+    U8* x = s;
+    U8* send;
+    STRLEN c;
+
+    if (!len)
+       len = strlen((char *)s);
+    send = s + len;
+
+    while (x < send) {
+        /* Inline the easy bits of is_utf8_char() here for speed... */
+        if (UTF8_IS_INVARIANT(*x))
+             c = 1;
+        else if (!UTF8_IS_START(*x)) {
+             if (p)
+                 *p = x;
+             return FALSE;
+        }
+        else {
+             /* ... and call is_utf8_char() only if really needed. */
+             c = is_utf8_char(x);
+             if (!c) {
+                  if (p)
+                     *p = x;
+                  return FALSE;
+             }
+        }
+        x += c;
+    }
+    if (x != send) {
+       if (p)
+          *p = x;
+       return FALSE;
+    }
+
+    return TRUE;
+}
+
+/*
 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 
 Bottom level UTF-8 decode routine.
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
global.sym		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history