From: Gurusamy Sarathy Date: Mon, 13 Mar 2000 09:57:59 +0000 (+0000) Subject: make the is_utf8_*() safe for use on invalid utf8 (they now X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=386d01d60781c452398a2c213e241a4169059c2c;p=p5sagit%2Fp5-mst-13.2.git make the is_utf8_*() safe for use on invalid utf8 (they now return false on such input instead of emitting warnings) p4raw-id: //depot/perl@5700 --- diff --git a/embed.h b/embed.h index b597558..2725f8b 100644 --- a/embed.h +++ b/embed.h @@ -300,6 +300,7 @@ #define to_uni_upper_lc Perl_to_uni_upper_lc #define to_uni_title_lc Perl_to_uni_title_lc #define to_uni_lower_lc Perl_to_uni_lower_lc +#define is_utf8_char Perl_is_utf8_char #define is_utf8_alnum Perl_is_utf8_alnum #define is_utf8_alnumc Perl_is_utf8_alnumc #define is_utf8_idfirst Perl_is_utf8_idfirst @@ -1744,6 +1745,7 @@ #define to_uni_upper_lc(a) Perl_to_uni_upper_lc(aTHX_ a) #define to_uni_title_lc(a) Perl_to_uni_title_lc(aTHX_ a) #define to_uni_lower_lc(a) Perl_to_uni_lower_lc(aTHX_ a) +#define is_utf8_char(a) Perl_is_utf8_char(aTHX_ a) #define is_utf8_alnum(a) Perl_is_utf8_alnum(aTHX_ a) #define is_utf8_alnumc(a) Perl_is_utf8_alnumc(aTHX_ a) #define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a) @@ -3420,6 +3422,8 @@ #define to_uni_title_lc Perl_to_uni_title_lc #define Perl_to_uni_lower_lc CPerlObj::Perl_to_uni_lower_lc #define to_uni_lower_lc Perl_to_uni_lower_lc +#define Perl_is_utf8_char CPerlObj::Perl_is_utf8_char +#define is_utf8_char Perl_is_utf8_char #define Perl_is_utf8_alnum CPerlObj::Perl_is_utf8_alnum #define is_utf8_alnum Perl_is_utf8_alnum #define Perl_is_utf8_alnumc CPerlObj::Perl_is_utf8_alnumc diff --git a/embed.pl b/embed.pl index 8b6c887..600e818 100755 --- a/embed.pl +++ b/embed.pl @@ -1597,6 +1597,7 @@ Ap |bool |is_uni_xdigit_lc|U32 c Ap |U32 |to_uni_upper_lc|U32 c Ap |U32 |to_uni_title_lc|U32 c Ap |U32 |to_uni_lower_lc|U32 c +Ap |int |is_utf8_char |U8 *p Ap |bool |is_utf8_alnum |U8 *p Ap |bool |is_utf8_alnumc |U8 *p Ap |bool |is_utf8_idfirst|U8 *p diff --git a/global.sym b/global.sym index 10b5303..ea77dfe 100644 --- a/global.sym +++ b/global.sym @@ -180,6 +180,7 @@ Perl_is_uni_xdigit_lc Perl_to_uni_upper_lc Perl_to_uni_title_lc Perl_to_uni_lower_lc +Perl_is_utf8_char Perl_is_utf8_alnum Perl_is_utf8_alnumc Perl_is_utf8_idfirst diff --git a/gv.c b/gv.c index 587d3dc..eaf2ab1 100644 --- a/gv.c +++ b/gv.c @@ -448,10 +448,10 @@ Perl_gv_autoload4(pTHX_ HV *stash, const char *name, STRLEN len, I32 method) /* =for apidoc gv_stashpv -Returns a pointer to the stash for a specified package. If C is -set then the package will be created if it does not already exist. If -C is not set and the package does not exist then NULL is -returned. +Returns a pointer to the stash for a specified package. C should +be a valid UTF-8 string. If C is set then the package will be +created if it does not already exist. If C is not set and the +package does not exist then NULL is returned. =cut */ @@ -494,8 +494,8 @@ Perl_gv_stashpvn(pTHX_ const char *name, U32 namelen, I32 create) /* =for apidoc gv_stashsv -Returns a pointer to the stash for a specified package. See -C. +Returns a pointer to the stash for a specified package, which must be a +valid UTF-8 string. See C. =cut */ diff --git a/objXSUB.h b/objXSUB.h index 569065c..1906a66 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -687,6 +687,10 @@ #define Perl_to_uni_lower_lc pPerl->Perl_to_uni_lower_lc #undef to_uni_lower_lc #define to_uni_lower_lc Perl_to_uni_lower_lc +#undef Perl_is_utf8_char +#define Perl_is_utf8_char pPerl->Perl_is_utf8_char +#undef is_utf8_char +#define is_utf8_char Perl_is_utf8_char #undef Perl_is_utf8_alnum #define Perl_is_utf8_alnum pPerl->Perl_is_utf8_alnum #undef is_utf8_alnum diff --git a/perlapi.c b/perlapi.c index cfb4dc8..2ee7060 100644 --- a/perlapi.c +++ b/perlapi.c @@ -1288,6 +1288,13 @@ Perl_to_uni_lower_lc(pTHXo_ U32 c) return ((CPerlObj*)pPerl)->Perl_to_uni_lower_lc(c); } +#undef Perl_is_utf8_char +int +Perl_is_utf8_char(pTHXo_ U8 *p) +{ + return ((CPerlObj*)pPerl)->Perl_is_utf8_char(p); +} + #undef Perl_is_utf8_alnum bool Perl_is_utf8_alnum(pTHXo_ U8 *p) diff --git a/pod/perlapi.pod b/pod/perlapi.pod index e4dedbe..c13dcde 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -381,17 +381,17 @@ C apply equally to these functions. =item gv_stashpv -Returns a pointer to the stash for a specified package. If C is -set then the package will be created if it does not already exist. If -C is not set and the package does not exist then NULL is -returned. +Returns a pointer to the stash for a specified package. C should +be a valid UTF-8 string. If C is set then the package will be +created if it does not already exist. If C is not set and the +package does not exist then NULL is returned. HV* gv_stashpv(const char* name, I32 create) =item gv_stashsv -Returns a pointer to the stash for a specified package. See -C. +Returns a pointer to the stash for a specified package, which must be a +valid UTF-8 string. See C. HV* gv_stashsv(SV* sv, I32 create) diff --git a/proto.h b/proto.h index 3a58718..37a7bdc 100644 --- a/proto.h +++ b/proto.h @@ -365,6 +365,7 @@ PERL_CALLCONV bool Perl_is_uni_xdigit_lc(pTHX_ U32 c); PERL_CALLCONV U32 Perl_to_uni_upper_lc(pTHX_ U32 c); PERL_CALLCONV U32 Perl_to_uni_title_lc(pTHX_ U32 c); PERL_CALLCONV U32 Perl_to_uni_lower_lc(pTHX_ U32 c); +PERL_CALLCONV int Perl_is_utf8_char(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_alnum(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_alnumc(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ U8 *p); diff --git a/utf8.c b/utf8.c index 212c555..223f5ac 100644 --- a/utf8.c +++ b/utf8.c @@ -101,6 +101,39 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) #endif } +/* Tests if some arbitrary number of bytes begins in a valid UTF-8 character. + * The actual number of bytes in the UTF-8 character will be returned if it + * is valid, otherwise 0. */ +int +Perl_is_utf8_char(pTHX_ U8 *s) +{ + U8 u = *s; + int slen, len; + + if (!(u & 0x80)) + return 1; + + if (!(u & 0x40)) + return 0; + + if (!(u & 0x20)) { len = 2; } + else if (!(u & 0x10)) { len = 3; } + else if (!(u & 0x08)) { len = 4; } + else if (!(u & 0x04)) { len = 5; } + else if (!(u & 0x02)) { len = 6; } + else if (!(u & 0x01)) { len = 7; } + else { len = 13; } /* whoa! */ + + slen = len - 1; + s++; + while (slen--) { + if ((*s & 0xc0) != 0x80) + return 0; + s++; + } + return len; +} + UV Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen) { @@ -500,6 +533,8 @@ Perl_to_uni_lower_lc(pTHX_ U32 c) bool Perl_is_utf8_alnum(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_alnum) PL_utf8_alnum = swash_init("utf8", "IsAlnum", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_alnum, p); @@ -515,6 +550,8 @@ Perl_is_utf8_alnum(pTHX_ U8 *p) bool Perl_is_utf8_alnumc(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_alnum) PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_alnum, p); @@ -536,6 +573,8 @@ Perl_is_utf8_idfirst(pTHX_ U8 *p) bool Perl_is_utf8_alpha(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_alpha) PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_alpha, p); @@ -544,6 +583,8 @@ Perl_is_utf8_alpha(pTHX_ U8 *p) bool Perl_is_utf8_ascii(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_ascii) PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_ascii, p); @@ -552,6 +593,8 @@ Perl_is_utf8_ascii(pTHX_ U8 *p) bool Perl_is_utf8_space(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_space) PL_utf8_space = swash_init("utf8", "IsSpace", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_space, p); @@ -560,6 +603,8 @@ Perl_is_utf8_space(pTHX_ U8 *p) bool Perl_is_utf8_digit(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_digit) PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_digit, p); @@ -568,6 +613,8 @@ Perl_is_utf8_digit(pTHX_ U8 *p) bool Perl_is_utf8_upper(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_upper) PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_upper, p); @@ -576,6 +623,8 @@ Perl_is_utf8_upper(pTHX_ U8 *p) bool Perl_is_utf8_lower(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_lower) PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_lower, p); @@ -584,6 +633,8 @@ Perl_is_utf8_lower(pTHX_ U8 *p) bool Perl_is_utf8_cntrl(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_cntrl) PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_cntrl, p); @@ -592,6 +643,8 @@ Perl_is_utf8_cntrl(pTHX_ U8 *p) bool Perl_is_utf8_graph(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_graph) PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_graph, p); @@ -600,6 +653,8 @@ Perl_is_utf8_graph(pTHX_ U8 *p) bool Perl_is_utf8_print(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_print) PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_print, p); @@ -608,6 +663,8 @@ Perl_is_utf8_print(pTHX_ U8 *p) bool Perl_is_utf8_punct(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_punct) PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_punct, p); @@ -616,6 +673,8 @@ Perl_is_utf8_punct(pTHX_ U8 *p) bool Perl_is_utf8_xdigit(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_xdigit) PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_xdigit, p); @@ -624,6 +683,8 @@ Perl_is_utf8_xdigit(pTHX_ U8 *p) bool Perl_is_utf8_mark(pTHX_ U8 *p) { + if (!is_utf8_char(p)) + return FALSE; if (!PL_utf8_mark) PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0); return swash_fetch(PL_utf8_mark, p);