X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=a68af530e7072725cf88d7b12f43810f7f5953ab;hb=8c5b8ff02c62badaeb38078556879720bdf8945a;hp=4f4c3eaffe5ee36cbf52c3271ccee7633d01cbd3;hpb=6673a63c63e2a65dbfcc835d6499cc97c449c67b;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 4f4c3ea..a68af53 100644 --- a/utf8.c +++ b/utf8.c @@ -51,6 +51,37 @@ Unicode characters as a variable number of bytes, in such a way that characters in the ASCII range are unmodified, and a zero byte never appears within non-zero characters. +=cut +*/ + +/* +=for apidoc is_ascii_string + +Returns true if first C bytes of the given string are ASCII (i.e. none +of them even raise the question of UTF-8-ness). + +See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). + +=cut +*/ + +bool +Perl_is_ascii_string(const U8 *s, STRLEN len) +{ + const U8* const send = s + (len ? len : strlen((const char *)s)); + const U8* x = s; + + PERL_ARGS_ASSERT_IS_ASCII_STRING; + + for (; x < send; ++x) { + if (!UTF8_IS_INVARIANT(*x)) + break; + } + + return x == send; +} + +/* =for apidoc uvuni_to_utf8_flags Adds the UTF-8 representation of the Unicode codepoint C to the end @@ -253,12 +284,11 @@ character will be returned if it is valid, otherwise 0. =cut */ STRLEN -Perl_is_utf8_char(pTHX_ const U8 *s) +Perl_is_utf8_char(const U8 *s) { const STRLEN len = UTF8SKIP(s); PERL_ARGS_ASSERT_IS_UTF8_CHAR; - PERL_UNUSED_CONTEXT; #ifdef IS_UTF8_CHAR if (IS_UTF8_CHAR_FAST(len)) return IS_UTF8_CHAR(s, len) ? len : 0; @@ -266,6 +296,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s) return is_utf8_char_slow(s, len); } + /* =for apidoc is_utf8_string @@ -274,19 +305,18 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. -See also is_utf8_string_loclen() and is_utf8_string_loc(). +See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). =cut */ bool -Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) +Perl_is_utf8_string(const U8 *s, STRLEN len) { const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; PERL_ARGS_ASSERT_IS_UTF8_STRING; - PERL_UNUSED_CONTEXT; while (x < send) { STRLEN c; @@ -345,7 +375,7 @@ See also is_utf8_string_loc() and is_utf8_string(). */ bool -Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; @@ -353,7 +383,6 @@ Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN STRLEN outlen = 0; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; - PERL_UNUSED_CONTEXT; while (x < send) { /* Inline the easy bits of is_utf8_char() here for speed... */ @@ -682,7 +711,6 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) { dVAR; STRLEN len = 0; - U8 t = 0; PERL_ARGS_ASSERT_UTF8_LENGTH; @@ -693,20 +721,23 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) if (e < s) goto warn_and_return; while (s < e) { - t = UTF8SKIP(s); - if (e - s < t) { - warn_and_return: - if (ckWARN_d(WARN_UTF8)) { - if (PL_op) - Perl_warner(aTHX_ packWARN(WARN_UTF8), + if (!UTF8_IS_INVARIANT(*s)) + s += UTF8SKIP(s); + else + s++; + len++; + } + + if (e != s) { + len--; + warn_and_return: + if (ckWARN_d(WARN_UTF8)) { + if (PL_op) + Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s in %s", unees, OP_DESC(PL_op)); - else - Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); - } - return len; + else + Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); } - s += t; - len++; } return len; @@ -1011,14 +1042,6 @@ Perl_is_uni_alnum(pTHX_ UV c) } bool -Perl_is_uni_alnumc(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return is_utf8_alnumc(tmpbuf); -} - -bool Perl_is_uni_idfirst(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; @@ -1159,12 +1182,6 @@ Perl_is_uni_alnum_lc(pTHX_ UV c) } bool -Perl_is_uni_alnumc_lc(pTHX_ UV c) -{ - return is_uni_alnumc(c); /* XXX no locale support yet */ -} - -bool Perl_is_uni_idfirst_lc(pTHX_ UV c) { return is_uni_idfirst(c); /* XXX no locale support yet */ @@ -1295,16 +1312,6 @@ Perl_is_utf8_alnum(pTHX_ const U8 *p) } bool -Perl_is_utf8_alnumc(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_ALNUMC; - - return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC"); -} - -bool Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ { dVAR;