return d;
}
#ifdef HAS_QUAD
- if (uv < 0x1000000000LL)
+ if (uv < UTF8_QUAD_MAX)
#endif
{
*d++ = 0xfe; /* Can't match U+FEFF! */
dTHR;
UV uv = *s, ouv;
STRLEN len = 1;
+#ifdef EBCDIC
+ bool dowarn = 0;
+#else
bool dowarn = ckWARN_d(WARN_UTF8);
+#endif
STRLEN expectlen = 0;
+ if (curlen == 0) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (an empty string)");
+ goto malformed;
+ }
+
if (uv <= 0x7f) { /* Pure ASCII. */
if (retlen)
*retlen = 1;
!(flags & UTF8_ALLOW_CONTINUATION)) {
if (dowarn)
Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+ "Malformed UTF-8 character (unexpected continuation byte 0x%02"UVxf")",
uv);
goto malformed;
}
- if ((uv >= 0xc0 && uv <= 0xfd && s[1] < 0x80) &&
+ if ((uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) &&
!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
if (dowarn)
Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x after byte 0x%02x)",
- s[1], uv);
+ "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after byte 0x%02"UVxf")",
+ (UV)s[1], uv);
goto malformed;
}
!(flags & UTF8_ALLOW_FE_FF)) {
if (dowarn)
Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (byte 0x%02x)",
+ "Malformed UTF-8 character (byte 0x%02"UVxf")",
uv);
goto malformed;
}
if (dowarn)
Perl_warner(aTHX_ WARN_UTF8,
"Malformed UTF-8 character (%d byte%s, need %d)",
- curlen, curlen > 1 ? "s" : "", expectlen);
+ curlen, curlen == 1 ? "" : "s", expectlen);
goto malformed;
}
if (dowarn)
Perl_warner(aTHX_ WARN_UTF8,
"Malformed UTF-8 character (%d byte%s, need %d)",
- expectlen, expectlen > 1 ? "s": "", UNISKIP(uv));
+ expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
goto malformed;
}
if (flags & UTF8_CHECK_ONLY) {
if (retlen)
- *retlen = len;
+ *retlen = -1;
return 0;
}
if (retlen)
- *retlen = -1;
+ *retlen = expectlen ? expectlen : len;
return UNICODE_REPLACEMENT_CHARACTER;
}
return Perl_utf8_to_uv(aTHX_ s, (STRLEN)-1, retlen, 0);
}
+/*
+=for apidoc|utf8_length|U8 *s|U8 *e
+
+Returns the length, in characters, of the UTF-8 encoded string starting
+at C<s>.  Counting stops at the end pointer C<e>.  If C<e E<lt> s>, or if
+the scan would end up past C<e>, returns -1 (as a C<STRLEN>).
+
+=cut
+*/
+
+STRLEN
+Perl_utf8_length(pTHX_ U8* s, U8* e)
+{
+    STRLEN nchars;
+
+    if (e < s)                      /* end pointer precedes the start */
+        return -1;
+    /* Step over one character per iteration, counting as we go. */
+    for (nchars = 0; s < e; nchars++) {
+        STRLEN skip = UTF8SKIP(s);  /* byte width reported by UTF8SKIP */
+
+        if (e - s < skip)           /* character would extend past e */
+            return -1;
+        s += skip;
+    }
+
+    return nchars;
+}
+
/* utf8_distance(a,b) returns the number of UTF8 characters between
the pointers a and b */
Copy(ptr, PL_last_swash_key, klen, U8);
}
- switch ((slen << 3) / needents) {
+ switch ((int)((slen << 3) / needents)) {
case 1:
bit = 1 << (off & 7);
off >>= 3;