/*
=for apidoc A|STRLEN|is_utf8_char|U8 *s
-Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
-The actual number of bytes in the UTF-8 character will be returned if it
-is valid, otherwise 0.
+Tests if some arbitrary number of bytes begins with a valid UTF-8
+character. Note that an ASCII character is a valid UTF-8 character.
+The actual number of bytes in the UTF-8 character will be returned if
+it is valid, otherwise 0.
-=cut
-*/
+=cut */
STRLEN
Perl_is_utf8_char(pTHX_ U8 *s)
{
/*
=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
-Returns true if first C<len> bytes of the given string form valid a UTF8
-string, false otherwise.
+Returns true if the first C<len> bytes of the given string form a valid
+UTF8 string, false otherwise. Note that 'a valid UTF8 string' does not
+mean 'a string that contains UTF8' because a valid ASCII string is a
+valid UTF8 string.
=cut
*/
}
/*
-=for apidoc A|U8* s|utf8_to_uv|STRLEN curlen|STRLEN *retlen|U32 flags
+=for apidoc A|UV|utf8_to_uv|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
Returns the character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding and no longer than C<curlen>;
s = start; start = d;
while (s < send) {
U8 c = *s++;
+
if (UTF8_IS_ASCII(c))
*d++ = c;
else
- *d++ = UTF8_ACCUMULATE(c&3, *s++);
+ *d++ = UTF8_ACCUMULATE(c, *s++);
}
*d = '\0';
*len = d - start;
dst = d;
while (s < send) {
- if (*s < 0x80)
+ if (UTF8_IS_ASCII(*s))
*d++ = *s++;
else {
UV uv = *s++;
- *d++ = (( uv >> 6) | 0xc0);
- *d++ = (( uv & 0x3f) | 0x80);
+
+ *d++ = UTF8_EIGHT_BIT_HI(uv);
+ *d++ = UTF8_EIGHT_BIT_LO(uv);
}
}
*d = '\0';