X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=utf8.h;h=87653360eae00e24c7eb79edfe6280fdde953bf5;hb=842c41230043ce99d4bf7b2c79aed85ce2908e89;hp=1c8e06b59e51838e39e3c0a79734d58d730feb26;hpb=d06134e53994ea13d6ce081c8d670ed0bd7802ee;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/utf8.h b/utf8.h
index 1c8e06b..8765336 100644
--- a/utf8.h
+++ b/utf8.h
@@ -72,22 +72,21 @@ END_EXTERN_C
  Code Points		1st Byte  2nd Byte  3rd Byte  4th Byte
 
    U+0000..U+007F	00..7F
-   U+0080..U+07FF	C2..DF    80..BF
-   U+0800..U+0FFF	E0        A0..BF    80..BF
+   U+0080..U+07FF     * C2..DF    80..BF
+   U+0800..U+0FFF	E0      * A0..BF    80..BF
    U+1000..U+CFFF       E1..EC    80..BF    80..BF
    U+D000..U+D7FF       ED        80..9F    80..BF
-   U+D800..U+DFFF       ******* ill-formed *******
+   U+D800..U+DFFF       +++++++ UTF-16 surrogates, not legal UTF-8 +++++++
    U+E000..U+FFFF       EE..EF    80..BF    80..BF
-  U+10000..U+3FFFF	F0        90..BF    80..BF    80..BF
+  U+10000..U+3FFFF	F0      * 90..BF    80..BF    80..BF
   U+40000..U+FFFFF	F1..F3    80..BF    80..BF    80..BF
  U+100000..U+10FFFF	F4        80..8F    80..BF    80..BF
 
-Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000...U+D7FF,
-the 90..BF in U+10000..U+3FFFF, and the 80...8F in U+100000..U+10FFFF.
-The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings:
-it is technically possible to UTF-8-encode a single code point in different
-ways, but that is explicitly forbidden, and the shortest possible encoding
-should always be used (and that is what Perl does).
+Note the gaps before the byte entries marked by '*' above.  These gaps are
+caused by legal UTF-8 avoiding non-shortest encodings: it is technically
+possible to UTF-8-encode a single code point in different ways, but that is
+explicitly forbidden, and the shortest possible encoding should always be used
+(and that is what Perl does).
 
  */
 
@@ -102,7 +101,7 @@ should always be used (and that is what Perl does).
   00000dddccccccbbbbbbaaaaaa     11110ddd  10cccccc  10bbbbbb  10aaaaaa
 
 As you can see, the continuation bytes all begin with C<10>, and the
-leading bits of the start byte tell how many bytes the are in the
+leading bits of the start byte tell how many bytes there are in the
 encoded character.
 
 */
@@ -207,15 +206,18 @@ encoded character.
 
 #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
+#define IN_UNI_8_BIT ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT) \
+			&& ! IN_LOCALE_RUNTIME && ! IN_BYTES)
 
 #define UTF8_ALLOW_EMPTY		0x0001
 #define UTF8_ALLOW_CONTINUATION		0x0002
 #define UTF8_ALLOW_NON_CONTINUATION	0x0004
-#define UTF8_ALLOW_FE_FF		0x0008 /* Allow above 0x7fffFFFF */
-#define UTF8_ALLOW_SHORT		0x0010
+#define UTF8_ALLOW_FE_FF		0x0008 /* Allow FE or FF start bytes, \
+						  which yield above 0x7fffFFFF */
+#define UTF8_ALLOW_SHORT		0x0010 /* expecting more bytes */
 #define UTF8_ALLOW_SURROGATE		0x0020
 #define UTF8_ALLOW_FFFF			0x0040 /* Allow UNICODE_ILLEGAL */
-#define UTF8_ALLOW_LONG			0x0080
+#define UTF8_ALLOW_LONG			0x0080 /* expecting fewer bytes */
 #define UTF8_ALLOW_ANYUV		(UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
 					 UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
 #define UTF8_ALLOW_ANY			0x00FF