while (slen--) {
if ((*s & 0xc0) != 0x80)
return 0;
- uv = (uv << 6) | (*s & 0x3f);
+ uv = UTF8_ACCUMULATE(uv, *s);
if (uv < ouv)
return 0;
ouv = uv;
Returns the character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding and no longer than C<curlen>;
-C<retlen> will be set to the length, in bytes, of that character,
-and the pointer C<s> will be advanced to the end of the character.
+C<retlen> will be set to the length, in bytes, of that character.
If C<s> does not point to a well-formed UTF8 character, the behaviour
is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
it is assumed that the caller will raise a warning, and this function
-will set C<retlen> to C<-1> and return zero. If the C<flags> does not
-contain UTF8_CHECK_ONLY, the UNICODE_REPLACEMENT (0xFFFD) will be
-returned, and C<retlen> will be set to the expected length of the
-UTF-8 character in bytes. The C<flags> can also contain various flags
-to allow deviations from the strict UTF-8 encoding (see F<utf8.h>).
+will silently set C<retlen> to C<-1> and return zero.  If the
+C<flags> does not contain UTF8_CHECK_ONLY, warnings about
+malformations will be given, C<retlen> will be set to the expected
+length of the UTF-8 character in bytes, and zero will be returned.
+
+The C<flags> can also contain various flags to allow deviations from
+the strict UTF-8 encoding (see F<utf8.h>).
=cut */
goto malformed;
}
else
- uv = (uv << 6) | (*s & 0x3f);
+ uv = UTF8_ACCUMULATE(uv, *s);
if (uv < ouv) {
/* This cannot be allowed. */
if (dowarn)
}
if (retlen)
- *retlen = expectlen;
+ *retlen = expectlen ? expectlen : len;
- return UNICODE_REPLACEMENT;
+ return 0;
}
/*
Returns the character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+length, in bytes, of that character.
If C<s> does not point to a well-formed UTF8 character, zero is
returned and retlen is set, if possible, to -1.
{
STRLEN len = 0;
+ /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
+ * the bitops (especially ~) can create illegal UTF-8.
+ * In other words: in Perl UTF-8 is not just for Unicode. */
+
if (e < s)
Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
while (s < e) {
{
IV off = 0;
+ /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
+ * the bitops (especially ~) can create illegal UTF-8.
+ * In other words: in Perl UTF-8 is not just for Unicode. */
+
if (a < b) {
while (a < b) {
U8 c = UTF8SKIP(a);
/*
=for apidoc Am|U8*|utf8_hop|U8 *s|I32 off
-Move the C<s> pointing to UTF-8 data by C<off> characters, either forward
-or backward.
+Return the UTF-8 pointer C<s> displaced by C<off> characters, either
+forward or backward.
WARNING: do not use the following unless you *know* C<off> is within
-the UTF-8 buffer pointed to by C<s>.
+the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
+on the first byte of a character or just after the last byte of a character.
=cut */
U8 *
Perl_utf8_hop(pTHX_ U8 *s, I32 off)
{
+ /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
+ * the bitops (especially ~) can create illegal UTF-8.
+ * In other words: in Perl UTF-8 is not just for Unicode. */
+
if (off >= 0) {
while (off--)
s += UTF8SKIP(s);
else {
while (off++) {
s--;
- if (*s & 0x80) {
- while ((*s & 0xc0) == 0x80)
- s--;
- }
+ while (UTF8_IS_CONTINUATION(*s))
+ s--;
}
}
return s;
d = s = save;
while (s < send) {
- if (*s < 0x80) {
- *d++ = *s++;
- }
- else {
- STRLEN ulen;
- *d++ = (U8)utf8_to_uv_simple(s, &ulen);
- s += ulen;
- }
+ STRLEN ulen;
+ *d++ = (U8)utf8_to_uv_simple(s, &ulen);
+ s += ulen;
}
*d = '\0';
*len = d - save;