X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=debfb9ceac8b95c4a1d40d0151c5db5ef3b614dc;hb=b6512f489e761186d508cf0b8b7705805cfefc52;hp=4ca7b1c1f01fa096165a71f5b0f4b2d3de4d23b5;hpb=097fb8e2acde8522bd4ee4e5e00d3d2b810e2e56;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 4ca7b1c..debfb9c 100644 --- a/utf8.c +++ b/utf8.c @@ -27,15 +27,23 @@ /* Unicode support */ /* -=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv +=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags Adds the UTF8 representation of the Unicode codepoint C to the end of the string C; C should be have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, + d = uvuni_to_utf8_flags(d, uv, flags); + +or, in most cases, + d = uvuni_to_utf8(d, uv); +(which is equivalent to) + + d = uvuni_to_utf8_flags(d, uv, 0); + is the recommended Unicode-aware way of saying *(d++) = uv; @@ -44,13 +52,26 @@ is the recommended Unicode-aware way of saying */ U8 * -Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) +Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) { if (ckWARN_d(WARN_UTF8)) { - if (UNICODE_IS_SURROGATE(uv)) + if (UNICODE_IS_SURROGATE(uv) && + !(flags & UNICODE_ALLOW_SURROGATE)) Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv); - else if ((uv >= 0xFDD0 && uv <= 0xFDEF) || - (uv == 0xFFFE || uv == 0xFFFF)) + else if ( + ((uv >= 0xFDD0 && uv <= 0xFDEF && + !(flags & UNICODE_ALLOW_FDD0)) + || + ((uv & 0xFFFF) == 0xFFFE && + !(flags & UNICODE_ALLOW_FFFE)) + || + ((uv & 0xFFFF) == 0xFFFF && + !(flags & UNICODE_ALLOW_FFFF))) && + /* UNICODE_ALLOW_SUPER includes + * FFFEs and FFFFs beyond 0x10FFFF. */ + ((uv <= PERL_UNICODE_MAX) || + !(flags & UNICODE_ALLOW_SUPER)) + ) Perl_warner(aTHX_ WARN_UTF8, "Unicode character 0x%04"UVxf" is illegal", uv); } @@ -138,7 +159,12 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) #endif #endif /* Loop style */ } - + +U8 * +Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) +{ + return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0); +} /* @@ -255,7 +281,7 @@ Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) UV uv = *s, ouv = 0; STRLEN len = 1; bool dowarn = ckWARN_d(WARN_UTF8); - U8 startbyte = *s; + UV startbyte = *s; STRLEN expectlen = 0; U32 warning = 0; @@ -1041,6 +1067,36 @@ Perl_is_uni_xdigit_lc(pTHX_ UV c) return is_uni_xdigit(c); /* XXX no locale support yet */ } +U32 +Perl_to_uni_upper_lc(pTHX_ U32 c) +{ + /* XXX returns only the first character -- do not use XXX */ + /* XXX no locale support yet */ + STRLEN len; + U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + return (U32)to_uni_upper(c, tmpbuf, &len); +} + +U32 +Perl_to_uni_title_lc(pTHX_ U32 c) +{ + /* XXX returns only the first character XXX -- do not use XXX */ + /* XXX no locale support yet */ + STRLEN len; + U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + return (U32)to_uni_title(c, tmpbuf, &len); +} + +U32 +Perl_to_uni_lower_lc(pTHX_ U32 c) +{ + /* XXX returns only the first character -- do not use XXX */ + /* XXX no locale support yet */ + STRLEN len; + U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + return (U32)to_uni_lower(c, tmpbuf, &len); +} + bool Perl_is_utf8_alnum(pTHX_ U8 *p) { @@ -1514,9 +1570,14 @@ is the recommended wide native character-aware way of saying U8 * Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) { - return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv)); + return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0); } +U8 * +Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) +{ + return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags); +} /* =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags