X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=c50489115ee9554c9954fc8f5ef6ba1991abad9f;hb=0b98bec9fd8d0513cb1904db85614d287298f7f8;hp=4f4c3eaffe5ee36cbf52c3271ccee7633d01cbd3;hpb=6673a63c63e2a65dbfcc835d6499cc97c449c67b;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 4f4c3ea..c504891 100644 --- a/utf8.c +++ b/utf8.c @@ -51,6 +51,37 @@ Unicode characters as a variable number of bytes, in such a way that characters in the ASCII range are unmodified, and a zero byte never appears within non-zero characters. +=cut +*/ + +/* +=for apidoc is_ascii_string + +Returns true if first C bytes of the given string are ASCII (i.e. none +of them even raise the question of UTF-8-ness). + +See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). + +=cut +*/ + +bool +Perl_is_ascii_string(const U8 *s, STRLEN len) +{ + const U8* const send = s + (len ? len : strlen((const char *)s)); + const U8* x = s; + + PERL_ARGS_ASSERT_IS_ASCII_STRING; + + for (; x < send; ++x) { + if (!UTF8_IS_INVARIANT(*x)) + break; + } + + return x == send; +} + +/* =for apidoc uvuni_to_utf8_flags Adds the UTF-8 representation of the Unicode codepoint C to the end @@ -96,7 +127,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) !(flags & UNICODE_ALLOW_SUPER)) ) Perl_warner(aTHX_ packWARN(WARN_UTF8), - "Unicode character 0x%04"UVxf" is illegal", uv); + "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv); } if (UNI_IS_INVARIANT(uv)) { *d++ = (U8)UTF_TO_NATIVE(uv); @@ -253,12 +284,11 @@ character will be returned if it is valid, otherwise 0. =cut */ STRLEN -Perl_is_utf8_char(pTHX_ const U8 *s) +Perl_is_utf8_char(const U8 *s) { const STRLEN len = UTF8SKIP(s); PERL_ARGS_ASSERT_IS_UTF8_CHAR; - PERL_UNUSED_CONTEXT; #ifdef IS_UTF8_CHAR if (IS_UTF8_CHAR_FAST(len)) return IS_UTF8_CHAR(s, len) ? len : 0; @@ -266,6 +296,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s) return is_utf8_char_slow(s, len); } + /* =for apidoc is_utf8_string @@ -274,19 +305,18 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. -See also is_utf8_string_loclen() and is_utf8_string_loc(). +See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). =cut */ bool -Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) +Perl_is_utf8_string(const U8 *s, STRLEN len) { const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; PERL_ARGS_ASSERT_IS_UTF8_STRING; - PERL_UNUSED_CONTEXT; while (x < send) { STRLEN c; @@ -345,7 +375,7 @@ See also is_utf8_string_loc() and is_utf8_string(). */ bool -Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; @@ -353,7 +383,6 @@ Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN STRLEN outlen = 0; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; - PERL_UNUSED_CONTEXT; while (x < send) { /* Inline the easy bits of is_utf8_char() here for speed... */ @@ -682,7 +711,6 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) { dVAR; STRLEN len = 0; - U8 t = 0; PERL_ARGS_ASSERT_UTF8_LENGTH; @@ -693,22 +721,23 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) if (e < s) goto warn_and_return; while (s < e) { - t = UTF8SKIP(s); - if (e - s < t) { - warn_and_return: - if (ckWARN_d(WARN_UTF8)) { - if (PL_op) - Perl_warner(aTHX_ packWARN(WARN_UTF8), - "%s in %s", unees, OP_DESC(PL_op)); - else - Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); - } - return len; - } - s += t; + if (!UTF8_IS_INVARIANT(*s)) + s += UTF8SKIP(s); + else + s++; len++; } + if (e != s) { + len--; + warn_and_return: + if (PL_op) + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), + "%s in %s", unees, OP_DESC(PL_op)); + else + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees); + } + return len; } @@ -929,12 +958,6 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) PERL_ARGS_ASSERT_UTF16_TO_UTF8; - if (bytelen == 1 && p[0] == 0) { /* Be understanding. */ - d[0] = 0; - *newlen = 1; - return d; - } - if (bytelen & 1) Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); @@ -956,12 +979,18 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) *d++ = (U8)(( uv & 0x3f) | 0x80); continue; } - if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */ - UV low = (p[0] << 8) + p[1]; - p += 2; - if (low < 0xdc00 || low >= 0xdfff) + if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */ + if (p >= pend) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); - uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; + } else { + UV low = (p[0] << 8) + p[1]; + p += 2; + if (low < 0xdc00 || low > 0xdfff) + Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); + uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; + } + } else if (uv >= 0xdc00 && uv <= 0xdfff) { + Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } if (uv < 0x10000) { *d++ = (U8)(( uv >> 12) | 0xe0); @@ -991,6 +1020,10 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; + if (bytelen & 1) + Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf, + (UV)bytelen); + while (s < send) { const U8 tmp = s[0]; s[0] = s[1]; @@ -1011,14 +1044,6 @@ Perl_is_uni_alnum(pTHX_ UV c) } bool -Perl_is_uni_alnumc(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return is_utf8_alnumc(tmpbuf); -} - -bool Perl_is_uni_idfirst(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; @@ -1159,12 +1184,6 @@ Perl_is_uni_alnum_lc(pTHX_ UV c) } bool -Perl_is_uni_alnumc_lc(pTHX_ UV c) -{ - return is_uni_alnumc(c); /* XXX no locale support yet */ -} - -bool Perl_is_uni_idfirst_lc(pTHX_ UV c) { return is_uni_idfirst(c); /* XXX no locale support yet */ @@ -1295,16 +1314,6 @@ Perl_is_utf8_alnum(pTHX_ const U8 *p) } bool -Perl_is_utf8_alnumc(pTHX_ const U8 *p) -{ - dVAR; - - PERL_ARGS_ASSERT_IS_UTF8_ALNUMC; - - return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC"); -} - -bool Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ { dVAR; @@ -1360,6 +1369,26 @@ Perl_is_utf8_space(pTHX_ const U8 *p) } bool +Perl_is_utf8_perl_space(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE; + + return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace"); +} + +bool +Perl_is_utf8_perl_word(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD; + + return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord"); +} + +bool Perl_is_utf8_digit(pTHX_ const U8 *p) { dVAR; @@ -1370,6 +1399,16 @@ Perl_is_utf8_digit(pTHX_ const U8 *p) } bool +Perl_is_utf8_posix_digit(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT; + + return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit"); +} + +bool Perl_is_utf8_upper(pTHX_ const U8 *p) { dVAR; @@ -1436,7 +1475,7 @@ Perl_is_utf8_xdigit(pTHX_ const U8 *p) PERL_ARGS_ASSERT_IS_UTF8_XDIGIT; - return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit"); + return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit"); } bool