From: Jarkko Hietaniemi Date: Sat, 6 Apr 2002 16:41:06 +0000 (+0000) Subject: As noted by Philip Newton: nothing wrong with BOM, X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c867b36081026b7a72e449b72b5b9916973f9bf1;p=p5sagit%2Fp5-mst-13.2.git As noted by Philip Newton: nothing wrong with BOM, but 0xFFFE quite wrong. p4raw-id: //depot/perl@15762 --- diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index 5cd0e05..6635f02 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -38,75 +38,99 @@ my $d7ff = chr(0xD7FF); my $d800 = chr(0xD800); my $dfff = chr(0xDFFF); my $e000 = chr(0xE000); +my $feff = chr(0xFEFF); my $fffd = chr(0xFFFD); +my $fffe = chr(0xFFFE); my $ffff = chr(0xFFFF); my $hex4 = chr(0x10000); my $hex5 = chr(0x100000); +my $maxm1 = chr(0x10FFFE); my $max = chr(0x10FFFF); no warnings 'utf8'; my $d7ff = chr(0xD7FF); my $d800 = chr(0xD800); my $dfff = chr(0xDFFF); my $e000 = chr(0xE000); +my $feff = chr(0xFEFF); my $fffd = chr(0xFFFD); +my $fffe = chr(0xFFFE); my $ffff = chr(0xFFFF); my $hex4 = chr(0x10000); my $hex5 = chr(0x100000); +my $maxm1 = chr(0x10FFFE); my $max = chr(0x10FFFF); EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xffff is illegal at - line 7. -Unicode character 0x10ffff is illegal at - line 10. +Unicode character 0xfffe is illegal at - line 8. +Unicode character 0xffff is illegal at - line 9. +Unicode character 0x10fffe is illegal at - line 12. +Unicode character 0x10ffff is illegal at - line 13. ######## use warnings 'utf8'; my $d7ff = pack("U", 0xD7FF); my $d800 = pack("U", 0xD800); my $dfff = pack("U", 0xDFFF); my $e000 = pack("U", 0xE000); +my $feff = pack("U", 0xFEFF); my $fffd = pack("U", 0xFFFD); +my $fffe = pack("U", 0xFFFE); my $ffff = pack("U", 0xFFFF); my $hex4 = pack("U", 0x10000); my $hex5 = pack("U", 0x100000); +my $maxm1 = pack("U", 0x10FFFE); my $max = pack("U", 0x10FFFF); no warnings 'utf8'; my $d7ff = pack("U", 0xD7FF); my $d800 = pack("U", 0xD800); my $dfff = pack("U", 0xDFFF); my $e000 = pack("U", 0xE000); +my $feff = pack("U", 0xFEFF); my $fffd = pack("U", 0xFFFD); +my $fffe = pack("U", 0xFFFE); my $ffff = pack("U", 0xFFFF); my $hex4 = pack("U", 0x10000); my $hex5 = pack("U", 0x100000); +my $maxm1 = pack("U", 0x10FFFE); my $max = pack("U", 0x10FFFF); EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xffff is illegal at - line 7. -Unicode character 0x10ffff is illegal at - line 10. +Unicode character 0xfffe is illegal at - line 8. +Unicode character 0xffff is illegal at - line 9. +Unicode character 0x10fffe is illegal at - line 12. +Unicode character 0x10ffff is illegal at - line 13. ######## use warnings 'utf8'; my $d7ff = "\x{D7FF}"; my $d800 = "\x{D800}"; my $dfff = "\x{DFFF}"; my $e000 = "\x{E000}"; +my $feff = "\x{FEFF}"; my $fffd = "\x{FFFD}"; +my $fffe = "\x{FFFE}"; my $ffff = "\x{FFFF}"; my $hex4 = "\x{10000}"; my $hex5 = "\x{100000}"; +my $maxm1 = "\x{10FFFE}"; my $max = "\x{10FFFF}"; no warnings 'utf8'; my $d7ff = "\x{D7FF}"; my $d800 = "\x{D800}"; my $dfff = "\x{DFFF}"; my $e000 = "\x{E000}"; +my $feff = "\x{FEFF}"; my $fffd = "\x{FFFD}"; +my $fffe = "\x{FFFE}"; my $ffff = "\x{FFFF}"; my $hex4 = "\x{10000}"; my $hex5 = "\x{100000}"; +my $maxm1 = "\x{10FFFE}"; my $max = "\x{10FFFF}"; EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xffff is illegal at - line 7. -Unicode character 0x10ffff is illegal at - line 10. +Unicode character 0xfffe is illegal at - line 8. +Unicode character 0xffff is illegal at - line 9. +Unicode character 0x10fffe is illegal at - line 12. +Unicode character 0x10ffff is illegal at - line 13. diff --git a/utf8.c b/utf8.c index 0100eb1..9f2c4fb 100644 --- a/utf8.c +++ b/utf8.c @@ -64,10 +64,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) ((uv >= 0xFDD0 && uv <= 0xFDEF && !(flags & UNICODE_ALLOW_FDD0)) || - (UNICODE_IS_BYTE_ORDER_MARK(uv) && - !(flags & UNICODE_ALLOW_BOM)) - || - ((uv & 0xFFFF) == 0xFFFF && + ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */ !(flags & UNICODE_ALLOW_FFFF))) && /* UNICODE_ALLOW_SUPER includes * FFFFs beyond 0x10FFFF. */ @@ -296,9 +293,8 @@ Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) #define UTF8_WARN_SHORT 5 #define UTF8_WARN_OVERFLOW 6 #define UTF8_WARN_SURROGATE 7 -#define UTF8_WARN_BOM 8 -#define UTF8_WARN_LONG 9 -#define UTF8_WARN_FFFF 10 +#define UTF8_WARN_LONG 8 +#define UTF8_WARN_FFFF 9 /* Also FFFE. */ if (curlen == 0 && !(flags & UTF8_ALLOW_EMPTY)) { @@ -393,10 +389,6 @@ Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) !(flags & UTF8_ALLOW_SURROGATE)) { warning = UTF8_WARN_SURROGATE; goto malformed; - } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) && - !(flags & UTF8_ALLOW_BOM)) { - warning = UTF8_WARN_BOM; - goto malformed; } else if ((expectlen > UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) { warning = UTF8_WARN_LONG; @@ -452,9 +444,6 @@ malformed: case UTF8_WARN_SURROGATE: Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); break; - case UTF8_WARN_BOM: - Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv); - break; case UTF8_WARN_LONG: Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte); diff --git a/utf8.h b/utf8.h index 0b74517..6885859 100644 --- a/utf8.h +++ b/utf8.h @@ -180,20 +180,19 @@ encoded character. #define UTF8_ALLOW_FE_FF 0x0008 #define UTF8_ALLOW_SHORT 0x0010 #define UTF8_ALLOW_SURROGATE 0x0020 -#define UTF8_ALLOW_BOM 0x0040 -#define UTF8_ALLOW_FFFF 0x0080 -#define UTF8_ALLOW_LONG 0x0100 +#define UTF8_ALLOW_FFFF 0x0040 /* Allows also FFFE. */ +#define UTF8_ALLOW_LONG 0x0080 #define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\ - UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\ + UTF8_ALLOW_SURROGATE|\ UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG) -#define UTF8_ALLOW_ANY 0x00ff +#define UTF8_ALLOW_ANY 0x00FF #define UTF8_CHECK_ONLY 0x0200 -#define UNICODE_SURROGATE_FIRST 0xd800 -#define UNICODE_SURROGATE_LAST 0xdfff -#define UNICODE_REPLACEMENT 0xfffd -#define UNICODE_BYTE_ORDER_MARK 0xfeff -#define UNICODE_ILLEGAL 0xffff +#define UNICODE_SURROGATE_FIRST 0xD800 +#define UNICODE_SURROGATE_LAST 0xDFFF +#define UNICODE_REPLACEMENT 0xFFFD +#define UNICODE_BYTE_ORDER_MARK 0xFEFF +#define UNICODE_ILLEGAL 0xFFFF /* Though our UTF-8 encoding can go beyond this, * let's be conservative and do as Unicode 3.2 says. */ @@ -201,10 +200,9 @@ encoded character. #define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */ #define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */ -#define UNICODE_ALLOW_BOM 0x0004 /* Allow 0xFEFF */ -#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFF, 0x1FFFF, ... */ -#define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 10xFFFF */ -#define UNICODE_ALLOW_ANY 0xFFFF +#define UNICODE_ALLOW_FFFF 0x0004 /* Allow 0xFFF[EF], 0x1FFF[EF], ... */ +#define UNICODE_ALLOW_SUPER 0x0008 /* Allow past 10xFFFF */ +#define UNICODE_ALLOW_ANY 0x000F #define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \ (c) <= UNICODE_SURROGATE_LAST)