X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.c;h=8d812ab3e505bfabde59ce21fd3c66f12d271b08;hb=1725693fac4322554ed5d17f384f2502ef67bf23;hp=5e018260fa855b3cdf0ba54ac770f7718f6ccf2e;hpb=411caa507cab4ba311ec4000c486ad2592d51146;p=p5sagit%2Fp5-mst-13.2.git diff --git a/utf8.c b/utf8.c index 5e01826..8d812ab 100644 --- a/utf8.c +++ b/utf8.c @@ -137,7 +137,7 @@ Perl_is_utf8_char(pTHX_ U8 *s) while (slen--) { if ((*s & 0xc0) != 0x80) return 0; - uv = (uv << 6) | (*s & 0x3f); + uv = UTF8_ACCUMULATE(uv, *s); if (uv < ouv) return 0; ouv = uv; @@ -189,9 +189,13 @@ and the pointer C will be advanced to the end of the character. If C does not point to a well-formed UTF8 character, the behaviour is dependent on the value of C: if it contains UTF8_CHECK_ONLY, it is assumed that the caller will raise a warning, and this function -will set C to C<-1> and return. The C can also contain -various flags to allow deviations from the strict UTF-8 encoding -(see F). +will silently just set C to C<-1> and return zero. If the +C does not contain UTF8_CHECK_ONLY, warnings about +malformations will be given, C will be set to the expected +length of the UTF-8 character in bytes, and zero will be returned. + +The C can also contain various flags to allow deviations from +the strict UTF-8 encoding (see F). =cut */ @@ -214,13 +218,13 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) goto malformed; } - if (uv <= 0x7f) { /* Pure ASCII. */ + if (UTF8_IS_ASCII(uv)) { if (retlen) *retlen = 1; return *s; } - if ((uv >= 0x80 && uv <= 0xbf) && + if (UTF8_IS_CONTINUATION(uv) && !(flags & UTF8_ALLOW_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, @@ -229,11 +233,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) goto malformed; } - if ((uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) && + if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) && !(flags & UTF8_ALLOW_NON_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after byte 0x%02"UVxf")", + "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")", (UV)s[1], uv); goto malformed; } @@ -274,15 +278,16 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) ouv = uv; while (len--) { - if ((*s & 0xc0) != 0x80) { + if (!UTF8_IS_CONTINUATION(*s) && + !(flags & UTF8_ALLOW_NON_CONTINUATION)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected continuation byte 0x%02x)", + "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)", *s); goto malformed; } else - uv = (uv << 6) | (*s & 0x3f); + uv = UTF8_ACCUMULATE(uv, *s); if (uv < ouv) { /* This cannot be allowed. */ if (dowarn) @@ -295,27 +300,20 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) ouv = uv; } - if ((uv >= 0xd800 && uv <= 0xdfff) && + if (UNICODE_IS_SURROGATE(uv) && !(flags & UTF8_ALLOW_SURROGATE)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")", uv); goto malformed; - } else if ((uv == 0xfffe) && + } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) && !(flags & UTF8_ALLOW_BOM)) { if (dowarn) Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character (byte order mark 0x%04"UVxf")", uv); goto malformed; - } else if ((uv == 0xffff) && - !(flags & UTF8_ALLOW_FFFF)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (character 0x%04"UVxf")", - uv); - goto malformed; } else if ((expectlen > UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) { if (dowarn) @@ -323,6 +321,13 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) "Malformed UTF-8 character (%d byte%s, need %d)", expectlen, expectlen == 1 ? "": "s", UNISKIP(uv)); goto malformed; + } else if (UNICODE_IS_ILLEGAL(uv) && + !(flags & UTF8_ALLOW_FFFF)) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (character 0x%04"UVxf")", + uv); + goto malformed; } return uv; @@ -338,7 +343,7 @@ malformed: if (retlen) *retlen = expectlen ? expectlen : len; - return UNICODE_REPLACEMENT_CHARACTER; + return 0; } /* @@ -362,7 +367,7 @@ Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen) } /* -=for apidoc|utf8_length|U8 *s|U8 *e +=for apidoc Am|STRLEN|utf8_length|U8* s|U8 *e Return the length of the UTF-8 char encoded string C in characters. Stops at C (inclusive). If C s> or if the scan would end @@ -376,6 +381,10 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) { STRLEN len = 0; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (e < s) Perl_croak(aTHX_ "panic: utf8_length: unexpected end"); while (s < e) { @@ -390,14 +399,26 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) return len; } -/* utf8_distance(a,b) returns the number of UTF8 characters between - the pointers a and b */ +/* +=for apidoc Am|IV|utf8_distance|U8 *a|U8 *b + +Returns the number of UTF8 characters between the UTF-8 pointers C +and C. + +WARNING: use only if you *know* that the pointers point inside the +same UTF-8 buffer. + +=cut */ IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b) { IV off = 0; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (a < b) { while (a < b) { U8 c = UTF8SKIP(a); @@ -422,11 +443,25 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) return off; } -/* WARNING: do not use the following unless you *know* off is within bounds */ +/* +=for apidoc Am|U8*|utf8_hop|U8 *s|I32 off + +Return the UTF-8 pointer C displaced by C characters, either +forward or backward. + +WARNING: do not use the following unless you *know* C is within +the UTF-8 data pointed to by C *and* that on entry C is aligned +on the first byte of character or just after the last byte of a character. + +=cut */ U8 * Perl_utf8_hop(pTHX_ U8 *s, I32 off) { + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + if (off >= 0) { while (off--) s += UTF8SKIP(s); @@ -434,10 +469,8 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off) else { while (off++) { s--; - if (*s & 0x80) { - while ((*s & 0xc0) == 0x80) - s--; - } + while (UTF8_IS_CONTINUATION(*s)) + s--; } } return s; @@ -475,14 +508,9 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) d = s = save; while (s < send) { - if (*s < 0x80) { - *d++ = *s++; - } - else { - STRLEN ulen; - *d++ = (U8)utf8_to_uv_simple(s, &ulen); - s += ulen; - } + STRLEN ulen; + *d++ = (U8)utf8_to_uv_simple(s, &ulen); + s += ulen; } *d = '\0'; *len = d - save;