X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=8d812ab3e505bfabde59ce21fd3c66f12d271b08;hb=1725693fac4322554ed5d17f384f2502ef67bf23;hp=244bb639bb048f992fac1f3fda5b05685169d820;hpb=b06226ff370ef661c3ff28e6f65e1ba0ef078609;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 244bb63..8d812ab 100644 --- a/utf8.c +++ b/utf8.c @@ -137,7 +137,7 @@ Perl_is_utf8_char(pTHX_ U8 *s) while (slen--) { if ((*s & 0xc0) != 0x80) return 0; - uv = (uv << 6) | (*s & 0x3f); + uv = UTF8_ACCUMULATE(uv, *s); if (uv < ouv) return 0; ouv = uv; @@ -189,9 +189,13 @@ and the pointer C will be advanced to the end of the character. If C does not point to a well-formed UTF8 character, the behaviour is dependent on the value of C: if it contains UTF8_CHECK_ONLY, it is assumed that the caller will raise a warning, and this function -will set C to C<-1> and return. The C can also contain -various flags to allow deviations from the strict UTF-8 encoding -(see F). +will silently just set C to C<-1> and return zero. If the +C does not contain UTF8_CHECK_ONLY, warnings about +malformations will be given, C will be set to the expected +length of the UTF-8 character in bytes, and zero will be returned. + +The C can also contain various flags to allow deviations from +the strict UTF-8 encoding (see F). =cut */ @@ -214,13 +218,13 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) goto malformed; } - if (uv <= 0x7f) { /* Pure ASCII. */ + if (UTF8_IS_ASCII(uv)) { if (retlen) *retlen = 1; return *s; } - if ((uv >= 0x80 && uv <= 0xbf) && + if (UTF8_IS_CONTINUATION(uv) && !(flags & UTF8_ALLOW_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, @@ -229,11 +233,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) goto malformed; } - if ((uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) && + if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) && !(flags & UTF8_ALLOW_NON_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after byte 0x%02"UVxf")", + "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")", (UV)s[1], uv); goto malformed; } @@ -274,15 +278,16 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) ouv = uv; while (len--) { - if ((*s & 0xc0) != 0x80) { + if (!UTF8_IS_CONTINUATION(*s) && + !(flags & UTF8_ALLOW_NON_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected continuation byte 0x%02x)", + "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)", *s); goto malformed; } else - uv = (uv << 6) | (*s & 0x3f); + uv = UTF8_ACCUMULATE(uv, *s); if (uv < ouv) { /* This cannot be allowed. */ if (dowarn) @@ -295,14 +300,14 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) ouv = uv; } - if ((uv >= 0xd800 && uv <= 0xdfff) && + if (UNICODE_IS_SURROGATE(uv) && !(flags & UTF8_ALLOW_SURROGATE)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")", uv); goto malformed; - } else if ((uv == 0xfffe) && + } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) && !(flags & UTF8_ALLOW_BOM)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, @@ -316,7 +321,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) "Malformed UTF-8 character (%d byte%s, need %d)", expectlen, expectlen == 1 ? "": "s", UNISKIP(uv)); goto malformed; - } else if ((uv == 0xffff) && + } else if (UNICODE_IS_ILLEGAL(uv) && !(flags & UTF8_ALLOW_FFFF)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, @@ -338,7 +343,7 @@ malformed: if (retlen) *retlen = expectlen ? expectlen : len; - return UNICODE_REPLACEMENT_CHARACTER; + return 0; } /* @@ -376,6 +381,10 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) { STRLEN len = 0; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (e < s) Perl_croak(aTHX_ "panic: utf8_length: unexpected end"); while (s < e) { @@ -406,6 +415,10 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) { IV off = 0; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (a < b) { while (a < b) { U8 c = UTF8SKIP(a); @@ -433,17 +446,22 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) /* =for apidoc Am|U8*|utf8_hop|U8 *s|I32 off -Move the C pointing to UTF-8 data by C characters, either forward -or backward. +Return the UTF-8 pointer C displaced by C characters, either +forward or backward. WARNING: do not use the following unless you *know* C is within -the UTF-8 buffer pointed to by C. +the UTF-8 data pointed to by C *and* that on entry C is aligned +on the first byte of character or just after the last byte of a character. =cut */ U8 * Perl_utf8_hop(pTHX_ U8 *s, I32 off) { + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (off >= 0) { while (off--) s += UTF8SKIP(s); @@ -451,10 +469,8 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off) else { while (off++) { s--; - if (*s & 0x80) { - while ((*s & 0xc0) == 0x80) - s--; - } + while (UTF8_IS_CONTINUATION(*s)) + s--; } } return s; @@ -492,14 +508,9 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) d = s = save; while (s < send) { - if (*s < 0x80) { - *d++ = *s++; - } - else { - STRLEN ulen; - *d++ = (U8)utf8_to_uv_simple(s, &ulen); - s += ulen; - } + STRLEN ulen; + *d++ = (U8)utf8_to_uv_simple(s, &ulen); + s += ulen; } *d = '\0'; *len = d - save;