Define specially handled chars; and clean-up ebcdic vs unicode
Karl Williamson [Sun, 8 Nov 2009 00:36:55 +0000 (17:36 -0700)]
utf8.h
utfebcdic.h

diff --git a/utf8.h b/utf8.h
index 659319e..e70559e 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -112,7 +112,7 @@ encoded character.
 
 #define UNI_IS_INVARIANT(c)            (((UV)c) <  0x80)
 #define UTF8_IS_INVARIANT(c)           UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
-#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
+#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
 #define UTF8_IS_START(c)               (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
 #define UTF8_IS_CONTINUATION(c)                (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
 #define UTF8_IS_CONTINUED(c)           (((U8)c) &  0x80)
@@ -235,35 +235,28 @@ encoded character.
 
 #define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
 
-#define UNICODE_LATIN_SMALL_LETTER_SHARP_S     0x00DF
 #define UNICODE_GREEK_CAPITAL_LETTER_SIGMA     0x03A3
 #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
 #define UNICODE_GREEK_SMALL_LETTER_SIGMA       0x03C3
 
-#define EBCDIC_LATIN_SMALL_LETTER_SHARP_S      0x0059
-
 #define UNI_DISPLAY_ISPRINT    0x0001
 #define UNI_DISPLAY_BACKSLASH  0x0002
 #define UNI_DISPLAY_QQ         (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
 #define UNI_DISPLAY_REGEX      (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
 
-#ifdef EBCDIC
-#   define ANYOF_FOLD_SHARP_S(node, input, end)        \
-       (ANYOF_BITMAP_TEST(node, EBCDIC_LATIN_SMALL_LETTER_SHARP_S) && \
-        (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
-        (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
-        ((end) > (input) + 1) && \
-        toLOWER((input)[0]) == 's' && \
-        toLOWER((input)[1]) == 's')
-#else
-#   define ANYOF_FOLD_SHARP_S(node, input, end)        \
-       (ANYOF_BITMAP_TEST(node, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \
+#ifndef EBCDIC
+#   define LATIN_SMALL_LETTER_SHARP_S  0x00DF
+#   define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS 0x00FF
+#   define MICRO_SIGN 0x00B5
+#endif
+
+#define ANYOF_FOLD_SHARP_S(node, input, end)   \
+       (ANYOF_BITMAP_TEST(node, LATIN_SMALL_LETTER_SHARP_S) && \
         (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
         (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
         ((end) > (input) + 1) && \
         toLOWER((input)[0]) == 's' && \
         toLOWER((input)[1]) == 's')
-#endif
 #define SHARP_S_SKIP 2
 
 #ifdef EBCDIC
index e61b4a7..8a6176c 100644 (file)
@@ -293,6 +293,10 @@ EXTCONST unsigned char PL_a2e[] = { /* ASCII (iso-8859-1) to EBCDIC (IBM-1047) *
  0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
 };
 
+#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS 0xDF
+#define LATIN_SMALL_LETTER_SHARP_S 0x59
+#define MICRO_SIGN 0xA0
+
 EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (IBM-1047) to ASCII (iso-8859-1) */
  0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
@@ -333,6 +337,10 @@ EXTCONST unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (POSIX-BC) */
  0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xC0, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
 };
 
+#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS 0xDF
+#define LATIN_SMALL_LETTER_SHARP_S 0x59
+#define MICRO_SIGN 0xA0
+
 EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (POSIX-BC) to ASCII (ISO8859-1) */
  0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
@@ -373,6 +381,11 @@ EXTCONST unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-037) */
  0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF
 };
 
+
+#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS 0xDF
+#define LATIN_SMALL_LETTER_SHARP_S 0x59
+#define MICRO_SIGN 0xA0
+
 EXTCONST unsigned char PL_e2a[] = { /* EBCDIC (IBM-037) to ASCII (ISO8859-1) */
  0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
@@ -409,7 +422,7 @@ END_EXTERN_C
 
 /* Native to iso-8859-1 */
 #define NATIVE_TO_ASCII(ch)      PL_e2a[(U8)(ch)]
-#define NATIVE8_TO_UNI(ch)     NATIVE_TO_ASCII(ch)     /* synonym */
+#define NATIVE8_TO_UNI(ch)     NATIVE_TO_ASCII(ch)     /* a clearer synonym */
 #define ASCII_TO_NATIVE(ch)      PL_a2e[(U8)(ch)]
 /* Transform after encoding */
 #define NATIVE_TO_UTF(ch)        PL_e2utf[(U8)(ch)]