but 0xFFFE quite wrong.
p4raw-id: //depot/perl@15762
my $d800 = chr(0xD800);
my $dfff = chr(0xDFFF);
my $e000 = chr(0xE000);
+my $feff = chr(0xFEFF);
my $fffd = chr(0xFFFD);
+my $fffe = chr(0xFFFE);
my $ffff = chr(0xFFFF);
my $hex4 = chr(0x10000);
my $hex5 = chr(0x100000);
+my $maxm1 = chr(0x10FFFE);
my $max = chr(0x10FFFF);
no warnings 'utf8';
my $d7ff = chr(0xD7FF);
my $d800 = chr(0xD800);
my $dfff = chr(0xDFFF);
my $e000 = chr(0xE000);
+my $feff = chr(0xFEFF);
my $fffd = chr(0xFFFD);
+my $fffe = chr(0xFFFE);
my $ffff = chr(0xFFFF);
my $hex4 = chr(0x10000);
my $hex5 = chr(0x100000);
+my $maxm1 = chr(0x10FFFE);
my $max = chr(0x10FFFF);
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xffff is illegal at - line 7.
-Unicode character 0x10ffff is illegal at - line 10.
+Unicode character 0xfffe is illegal at - line 8.
+Unicode character 0xffff is illegal at - line 9.
+Unicode character 0x10fffe is illegal at - line 12.
+Unicode character 0x10ffff is illegal at - line 13.
########
use warnings 'utf8';
my $d7ff = pack("U", 0xD7FF);
my $d800 = pack("U", 0xD800);
my $dfff = pack("U", 0xDFFF);
my $e000 = pack("U", 0xE000);
+my $feff = pack("U", 0xFEFF);
my $fffd = pack("U", 0xFFFD);
+my $fffe = pack("U", 0xFFFE);
my $ffff = pack("U", 0xFFFF);
my $hex4 = pack("U", 0x10000);
my $hex5 = pack("U", 0x100000);
+my $maxm1 = pack("U", 0x10FFFE);
my $max = pack("U", 0x10FFFF);
no warnings 'utf8';
my $d7ff = pack("U", 0xD7FF);
my $d800 = pack("U", 0xD800);
my $dfff = pack("U", 0xDFFF);
my $e000 = pack("U", 0xE000);
+my $feff = pack("U", 0xFEFF);
my $fffd = pack("U", 0xFFFD);
+my $fffe = pack("U", 0xFFFE);
my $ffff = pack("U", 0xFFFF);
my $hex4 = pack("U", 0x10000);
my $hex5 = pack("U", 0x100000);
+my $maxm1 = pack("U", 0x10FFFE);
my $max = pack("U", 0x10FFFF);
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xffff is illegal at - line 7.
-Unicode character 0x10ffff is illegal at - line 10.
+Unicode character 0xfffe is illegal at - line 8.
+Unicode character 0xffff is illegal at - line 9.
+Unicode character 0x10fffe is illegal at - line 12.
+Unicode character 0x10ffff is illegal at - line 13.
########
use warnings 'utf8';
my $d7ff = "\x{D7FF}";
my $d800 = "\x{D800}";
my $dfff = "\x{DFFF}";
my $e000 = "\x{E000}";
+my $feff = "\x{FEFF}";
my $fffd = "\x{FFFD}";
+my $fffe = "\x{FFFE}";
my $ffff = "\x{FFFF}";
my $hex4 = "\x{10000}";
my $hex5 = "\x{100000}";
+my $maxm1 = "\x{10FFFE}";
my $max = "\x{10FFFF}";
no warnings 'utf8';
my $d7ff = "\x{D7FF}";
my $d800 = "\x{D800}";
my $dfff = "\x{DFFF}";
my $e000 = "\x{E000}";
+my $feff = "\x{FEFF}";
my $fffd = "\x{FFFD}";
+my $fffe = "\x{FFFE}";
my $ffff = "\x{FFFF}";
my $hex4 = "\x{10000}";
my $hex5 = "\x{100000}";
+my $maxm1 = "\x{10FFFE}";
my $max = "\x{10FFFF}";
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xffff is illegal at - line 7.
-Unicode character 0x10ffff is illegal at - line 10.
+Unicode character 0xfffe is illegal at - line 8.
+Unicode character 0xffff is illegal at - line 9.
+Unicode character 0x10fffe is illegal at - line 12.
+Unicode character 0x10ffff is illegal at - line 13.
((uv >= 0xFDD0 && uv <= 0xFDEF &&
!(flags & UNICODE_ALLOW_FDD0))
||
- (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
- !(flags & UNICODE_ALLOW_BOM))
- ||
- ((uv & 0xFFFF) == 0xFFFF &&
+ ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
!(flags & UNICODE_ALLOW_FFFF))) &&
/* UNICODE_ALLOW_SUPER includes
* FFFFs beyond 0x10FFFF. */
#define UTF8_WARN_SHORT 5
#define UTF8_WARN_OVERFLOW 6
#define UTF8_WARN_SURROGATE 7
-#define UTF8_WARN_BOM 8
-#define UTF8_WARN_LONG 9
-#define UTF8_WARN_FFFF 10
+#define UTF8_WARN_LONG 8
+#define UTF8_WARN_FFFF 9 /* Also FFFE. */
if (curlen == 0 &&
!(flags & UTF8_ALLOW_EMPTY)) {
!(flags & UTF8_ALLOW_SURROGATE)) {
warning = UTF8_WARN_SURROGATE;
goto malformed;
- } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
- !(flags & UTF8_ALLOW_BOM)) {
- warning = UTF8_WARN_BOM;
- goto malformed;
} else if ((expectlen > UNISKIP(uv)) &&
!(flags & UTF8_ALLOW_LONG)) {
warning = UTF8_WARN_LONG;
case UTF8_WARN_SURROGATE:
Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
break;
- case UTF8_WARN_BOM:
- Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
- break;
case UTF8_WARN_LONG:
Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
#define UTF8_ALLOW_FE_FF 0x0008
#define UTF8_ALLOW_SHORT 0x0010
#define UTF8_ALLOW_SURROGATE 0x0020
-#define UTF8_ALLOW_BOM 0x0040
-#define UTF8_ALLOW_FFFF 0x0080
-#define UTF8_ALLOW_LONG 0x0100
+#define UTF8_ALLOW_FFFF 0x0040 /* Allows also FFFE. */
+#define UTF8_ALLOW_LONG 0x0080
#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
- UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\
+ UTF8_ALLOW_SURROGATE|\
UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG)
-#define UTF8_ALLOW_ANY 0x00ff
+#define UTF8_ALLOW_ANY 0x00FF
#define UTF8_CHECK_ONLY 0x0200
-#define UNICODE_SURROGATE_FIRST 0xd800
-#define UNICODE_SURROGATE_LAST 0xdfff
-#define UNICODE_REPLACEMENT 0xfffd
-#define UNICODE_BYTE_ORDER_MARK 0xfeff
-#define UNICODE_ILLEGAL 0xffff
+#define UNICODE_SURROGATE_FIRST 0xD800
+#define UNICODE_SURROGATE_LAST 0xDFFF
+#define UNICODE_REPLACEMENT 0xFFFD
+#define UNICODE_BYTE_ORDER_MARK 0xFEFF
+#define UNICODE_ILLEGAL 0xFFFF
/* Though our UTF-8 encoding can go beyond this,
* let's be conservative and do as Unicode 3.2 says. */
#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
#define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */
-#define UNICODE_ALLOW_BOM 0x0004 /* Allow 0xFEFF */
-#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFF, 0x1FFFF, ... */
-#define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 10xFFFF */
-#define UNICODE_ALLOW_ANY 0xFFFF
+#define UNICODE_ALLOW_FFFF 0x0004 /* Allow 0xFFF[EF], 0x1FFF[EF], ... */
+#define UNICODE_ALLOW_SUPER 0x0008 /* Allow past 0x10FFFF */
+#define UNICODE_ALLOW_ANY 0x000F
#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \
(c) <= UNICODE_SURROGATE_LAST)