A modified version of
[p5sagit/p5-mst-13.2.git] / utf8.h
diff --git a/utf8.h b/utf8.h
index 28aa057..4917811 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -64,10 +64,27 @@ END_EXTERN_C
 
 #define UTF8_QUAD_MAX  UINT64_C(0x1000000000)
 
+/*
+ The following table is from Unicode 3.1.
+
+ Code Points           1st Byte  2nd Byte  3rd Byte  4th Byte
+
+   U+0000..U+007F      00..7F   
+   U+0080..U+07FF      C2..DF    80..BF   
+   U+0800..U+0FFF      E0        A0..BF    80..BF  
+   U+1000..U+FFFF      E1..EF    80..BF    80..BF  
+  U+10000..U+3FFFF     F0        90..BF    80..BF    80..BF
+  U+40000..U+FFFFF     F1..F3    80..BF    80..BF    80..BF
+ U+100000..U+10FFFF    F4        80..8F    80..BF    80..BF
+
+ */
+
 #define UTF8_IS_ASCII(c)               (((U8)c) <  0x80)
 #define UTF8_IS_START(c)               (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
 #define UTF8_IS_CONTINUATION(c)                (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
 #define UTF8_IS_CONTINUED(c)           (((U8)c) &  0x80)
+#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)(c) & 0xfc) == 0xc0) /* true for start bytes C0..C3, whose 2-byte sequences encode values below 0x100 */
 
 #define UTF8_CONTINUATION_MASK         ((U8)0x3f)
 #define UTF8_ACCUMULATION_SHIFT                6
@@ -114,3 +131,22 @@ END_EXTERN_C
 #endif
 #define isIDFIRST_lazy(p)      isIDFIRST_lazy_if(p,1)
 #define isALNUM_lazy(p)                isALNUM_lazy_if(p,1)
+
+/* EBCDIC-aware ways of converting native character codes to UTF8; the
+   reverse process is taken care of in utf8_to_uv */
+
+#ifdef EBCDIC /* translate via the PL_e2a / PL_a2e tables; the U8 cast keeps a signed char from yielding a negative index */
+#define NATIVE_TO_ASCII(ch)                  PL_e2a[(U8)(ch)]
+#define ASCII_TO_NATIVE(ch)                  PL_a2e[(U8)(ch)]
+#else /* no EBCDIC: both conversions are the identity */
+#define NATIVE_TO_ASCII(ch)                  (ch)
+#define ASCII_TO_NATIVE(ch)                  (ch)
+#endif
+
+#define UTF8_NEEDS_UPGRADE(ch)        (0x80 & NATIVE_TO_ASCII(ch)) /* nonzero when the ASCII-mapped value has bit 0x80 set */
+#define NATIVE_TO_UTF8(ch, string)    STMT_START { /* note: ch is evaluated more than once */ \
+                                        if (UTF8_NEEDS_UPGRADE(ch)) /* uv_to_utf8 is EBCDIC-aware */ \
+                                            (string) = uv_to_utf8((string), (ch)); \
+                                        else /* no upgrade needed: store one byte and advance string */ \
+                                            *(string)++ = NATIVE_TO_ASCII(ch); \
+                                      } STMT_END