((uv >= 0xFDD0 && uv <= 0xFDEF &&
!(flags & UNICODE_ALLOW_FDD0))
||
- ((uv & 0xFFFF) == 0xFFFE &&
- !(flags & UNICODE_ALLOW_FFFE))
- ||
- ((uv & 0xFFFF) == 0xFFFF &&
+ ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
!(flags & UNICODE_ALLOW_FFFF))) &&
/* UNICODE_ALLOW_SUPER includes
* FFFEs and FFFFs beyond 0x10FFFF. */
=for apidoc A|STRLEN|is_utf8_char|U8 *s
Tests if some arbitrary number of bytes begins in a valid UTF-8
-character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
-The actual number of bytes in the UTF-8 character will be returned if
-it is valid, otherwise 0.
+character. Note that an INVARIANT (i.e. ASCII) character is a valid
+UTF-8 character. The actual number of bytes in the UTF-8 character
+will be returned if it is valid, otherwise 0.
-=cut
-*/
+=cut */
STRLEN
Perl_is_utf8_char(pTHX_ U8 *s)
{
#define UTF8_WARN_SHORT 5
#define UTF8_WARN_OVERFLOW 6
#define UTF8_WARN_SURROGATE 7
-#define UTF8_WARN_BOM 8
-#define UTF8_WARN_LONG 9
-#define UTF8_WARN_FFFF 10
+#define UTF8_WARN_LONG 8
+#define UTF8_WARN_FFFF 9 /* Also FFFE. */
if (curlen == 0 &&
!(flags & UTF8_ALLOW_EMPTY)) {
!(flags & UTF8_ALLOW_SURROGATE)) {
warning = UTF8_WARN_SURROGATE;
goto malformed;
- } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
- !(flags & UTF8_ALLOW_BOM)) {
- warning = UTF8_WARN_BOM;
- goto malformed;
} else if ((expectlen > UNISKIP(uv)) &&
!(flags & UTF8_ALLOW_LONG)) {
warning = UTF8_WARN_LONG;
case UTF8_WARN_SURROGATE:
Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
break;
- case UTF8_WARN_BOM:
- Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
- break;
case UTF8_WARN_LONG:
Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
UV
Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
{
- return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+ return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
{
/* Call the low level routine asking for checks */
- return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+ return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
}
bool
-Perl_is_utf8_idfirst(pTHX_ U8 *p)
+Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
+{
+ if (*p == '_')
+ return TRUE;
+ if (!is_utf8_char(p))
+ return FALSE;
+ if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
+ PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
+ return swash_fetch(PL_utf8_idstart, p, TRUE);
+}
+
+bool
+Perl_is_utf8_idcont(pTHX_ U8 *p)
{
- return *p == '_' || is_utf8_alpha(p);
+ if (*p == '_')
+ return TRUE;
+ if (!is_utf8_char(p))
+ return FALSE;
+ if (!PL_utf8_idcont)
+ PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
+ return swash_fetch(PL_utf8_idcont, p, TRUE);
}
bool
SAVEI32(PL_hints);
PL_hints = 0;
save_re_context();
- if (PL_curcop == &PL_compiling)
+ if (PL_curcop == &PL_compiling) {
/* XXX ought to be handled by lex_start */
+ SAVEI32(PL_in_my);
sv_setpv(tokenbufsv, PL_tokenbuf);
+ }
errsv_save = newSVsv(ERRSV);
if (call_method("SWASHNEW", G_SCALAR))
retval = newSVsv(*PL_stack_sp--);
/* We use utf8n_to_uvuni() as we want an index into
Unicode tables, not a native character number.
*/
- UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
+ UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
SV *errsv_save;
ENTER;
SAVETMPS;