From: karl williamson Date: Mon, 9 Nov 2009 15:42:17 +0000 (-0700) Subject: More cleanup of utfebcdic.h and utf8.h X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d06134e53994ea13d6ce081c8d670ed0bd7802ee;p=p5sagit%2Fp5-mst-13.2.git More cleanup of utfebcdic.h and utf8.h Attached is a patch that removes from utfebcdic.h most definitions that are common to it and utf8.h, and moves them to the common area of utf8.h. The duplicate ones that are retained are each an integral part of a larger related set that does differ between the headers. Some of the definitions had started to drift, so this brings them back into line, with a lowered possibility of future drift. In particular, the ones for the 'lazy' macros did not behave quite as intended, especially in the EBCDIC case. The bugs were a small performance hit only, in that the macros were not quite as lazy as expected, and so loaded utf8_heavy.pl possibly unnecessarily. In examining these, I noted that the utf8.h definition of the start byte of a utf8 encoded string accepts invalid start bytes 0xC0 and 0xC1. These are invalid because they are for overlong encodings of ASCII code points. One is not supposed to allow these, and there have been security attacks, according to Wikipedia, against code that does. But I don't know all the ramifications for Perl of changing to exclude these, so I left it alone, but added a comment (and an item on my personal todo list to check into it). I made some comment clarifications, and removed some definitions marked as obsolete in utf8.h that are in fact no longer used. I added some synonyms for existing macros that more clearly reflect the use that I intend to put them to in future patches. 
From ba581aa4db767e5531ec0c0efdea5de4e9b09921 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 9 Nov 2009 08:38:24 -0700 Subject: [PATCH] Clean up utf headers Signed-off-by: H.Merijn Brand --- diff --git a/utf8.h b/utf8.h index e70559e..1c8e06b 100644 --- a/utf8.h +++ b/utf8.h @@ -27,7 +27,7 @@ #include "utfebcdic.h" -#else +#else /* ! EBCDIC */ START_EXTERN_C #ifdef DOINIT @@ -47,11 +47,9 @@ EXTCONST unsigned char PL_utf8skip[]; #endif END_EXTERN_C -#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)] /* Native character to iso-8859-1 */ #define NATIVE_TO_ASCII(ch) (ch) -#define NATIVE8_TO_UNI(ch) (ch) #define ASCII_TO_NATIVE(ch) (ch) /* Transform after encoding */ #define NATIVE_TO_UTF(ch) (ch) @@ -63,7 +61,7 @@ END_EXTERN_C #define NATIVE_TO_NEED(enc,ch) (ch) #define ASCII_TO_NEED(enc,ch) (ch) -/* As there are no translations avoid the function wrapper */ +/* As there are no translations, avoid the function wrapper */ #define utf8n_to_uvchr utf8n_to_uvuni #define uvchr_to_utf8 uvuni_to_utf8 @@ -111,8 +109,8 @@ encoded character. #define UNI_IS_INVARIANT(c) (((UV)c) < 0x80) -#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) -#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c)) +/* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the + * below might ought to be C2 */ #define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd)) #define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf)) #define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80) @@ -124,10 +122,6 @@ encoded character. 
#define UTF_CONTINUATION_MARK 0x80 #define UTF_ACCUMULATION_SHIFT 6 #define UTF_CONTINUATION_MASK ((U8)0x3f) -#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK)) - -#define UTF8_EIGHT_BIT_HI(c) ((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2)) -#define UTF8_EIGHT_BIT_LO(c) (((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK) #ifdef HAS_QUAD #define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \ @@ -147,24 +141,51 @@ encoded character. (uv) < 0x80000000 ? 6 : 7 ) #endif +#endif /* EBCDIC vs ASCII */ + +/* Rest of these are attributes of Unicode and perl's internals rather than the + * encoding, or happen to be the same in both ASCII and EBCDIC (at least at + * this level; the macros that some of these call may have different + * definitions in the two encodings */ + +#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* a clearer synonym */ + +#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK)) + +#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)] + +#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) +#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c)) + +#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF /* constrained by EBCDIC */ + +/* The macros in the next sets are used to generate the two utf8 or utfebcdic + * bytes from an ordinal that is known to fit into two bytes; it must be less + * than 0x3FF to work across both encodings. 
*/ +/* Nocast allows these to be used in the case label of a switch statement */ +#define UTF8_TWO_BYTE_HI_nocast(c) UTF_TO_NATIVE(((c)>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2)) +#define UTF8_TWO_BYTE_LO_nocast(c) UTF_TO_NATIVE(((c)&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK) + +#define UTF8_TWO_BYTE_HI(c) ((U8) (UTF8_TWO_BYTE_HI_nocast(c))) +#define UTF8_TWO_BYTE_LO(c) ((U8) (UTF8_TWO_BYTE_LO_nocast(c))) + +/* This name is used when the source is a single byte */ +#define UTF8_EIGHT_BIT_HI(c) UTF8_TWO_BYTE_HI((U8)(c)) +#define UTF8_EIGHT_BIT_LO(c) UTF8_TWO_BYTE_LO((U8)(c)) + /* * Note: we try to be careful never to call the isXXX_utf8() functions - * unless we're pretty sure we've seen the beginning of a UTF-8 character - * (that is, the two high bits are set). Otherwise we risk loading in the - * heavy-duty swash_init and swash_fetch routines unnecessarily. + * unless we're pretty sure we've seen the beginning of a UTF-8 or UTFEBCDIC + * character. Otherwise we risk loading in the heavy-duty swash_init and + * swash_fetch routines unnecessarily. */ -#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \ +#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \ ? isIDFIRST(*(p)) \ : isIDFIRST_utf8((const U8*)p)) -#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \ +#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \ ? isALNUM(*(p)) \ : isALNUM_utf8((const U8*)p)) - -#endif /* EBCDIC vs ASCII */ - -/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */ - #define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1) #define isALNUM_lazy(p) isALNUM_lazy_if(p,1) @@ -176,16 +197,11 @@ encoded character. * as a way to encode non-negative integers in a binary format. */ #define UTF8_MAXLEN UTF8_MAXBYTES -#define UTF8_MAXLEN_UCLC 3 /* Obsolete, do not use. */ -#define UTF8_MAXLEN_UCLC_MULT 39 /* Obsolete, do not use. 
*/ -#define UTF8_MAXLEN_FOLD 3 /* Obsolete, do not use. */ -#define UTF8_MAXLEN_FOLD_MULT 39 /* Obsolete, do not use. */ - /* The maximum number of UTF-8 bytes a single Unicode character can * uppercase/lowercase/fold into; this number depends on the Unicode * version. An example of maximal expansion is the U+03B0 which * uppercases to U+03C5 U+0308 U+0301. The Unicode databases that - * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and + * tell these things are UnicodeData.txt, CaseFolding.txt, and * SpecialCasing.txt. */ #define UTF8_MAXBYTES_CASE 6 diff --git a/utfebcdic.h b/utfebcdic.h index 8a6176c..c3fe603 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -12,13 +12,14 @@ * * To summarize, the way it works is: * To convert an EBCDIC character to UTF-EBCDIC: - * 1) convert to Unicode. The table in this file that does this is for + * 1) convert to Unicode. The table in this file that does this for * EBCDIC bytes is PL_e2a (with inverse PLa2e). The 'a' stands for * ASCIIish, meaning latin1. - * 2) convert that to a utf8-like string called I8 with variant characters - * occupying multiple bytes. This step is similar to the utf8-creating - * step from Unicode, but the details are different. There is a chart - * about the bit patterns in a comment later in this file. But + * 2) convert that to a utf8-like string called I8 (I stands for + * intermediate) with variant characters occupying multiple bytes. This + * step is similar to the utf8-creating step from Unicode, but the details + * are different. This transformation is called UTF8-Mod. There is a + * chart about the bit patterns in a comment later in this file. But * essentially here are the differences: * UTF8 I8 * invariant byte starts with 0 starts with 0 or 100 @@ -29,18 +30,19 @@ * in I8, far beyond the current Unicode standard's * max, as shown in the comment later in this file.) * 3) Use the table published in tr16 to convert each byte from step 2 into - * final UTF-EBCDIC. 
The table in this file is PL_utf2e, and its inverse - * is PL_e2utf. They are constructed so that all EBCDIC invariants remain - * invariant, but no others do. For example, the ordinal value of 'A' is - * 193 in EBCDIC, and also is 193 in UTF-EBCDIC. Step 1) converts it to - * 65, Step 2 leaves it at 65, and Step 3 converts it back to 193. As an - * example of how a variant character works, take LATIN SMALL LETTER Y - * WITH DIAERESIS, which is typicially 0xDF in EBCDIC. Step 1 converts it - * to the Unicode value, 0xFF. Step 2 converts that to two bytes = - * 11000111 10111111 = C7 BF, and Step 3 converts those to 0x8B 0x73. The - * table is constructed so that the first bytes of a variant will always - * have its upper bit set (at least in the encodings that Perl recognizes, - * and probably all). + * final UTF-EBCDIC. That table is reproduced in this file as PL_utf2e, + * and its inverse is PL_e2utf. They are constructed so that all EBCDIC + * invariants remain invariant, but no others do. For example, the + * ordinal value of 'A' is 193 in EBCDIC, and also is 193 in UTF-EBCDIC. + * Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3 converts + * it back to 193. As an example of how a variant character works, take + * LATIN SMALL LETTER Y WITH DIAERESIS, which is typicially 0xDF in + * EBCDIC. Step 1 converts it to the Unicode value, 0xFF. Step 2 + * converts that to two bytes = 11000111 10111111 = C7 BF, and Step 3 + * converts those to 0x8B 0x73. The table is constructed so that the + * first byte of the final form of a variant will always have its upper + * bit set (at least in the encodings that Perl recognizes, and probably + * all). But note that the upper bit of some invariants is also 1. * * If you're starting from Unicode, skip step 1. For UTF-EBCDIC to straight * EBCDIC, reverse the steps. @@ -60,8 +62,8 @@ * There are actually 3 slightly different UTF-EBCDIC encodings in this file, * one for each of the code pages recognized by Perl. 
That means that there * are actually three different sets of tables, one for each code page. (If - * Perl is compiled on platforms using other EBCDIC code pages, it may not - * compile, or silently mistake it for one of the three.) + * Perl is compiled on platforms using another EBCDIC code page, it may not + * compile, or Perl may silently mistake it for one of the three.) * * EBCDIC characters above 0xFF are the same as Unicode in Perl's * implementation of all 3 encodings, so for those Step 1 is trivial. @@ -150,7 +152,7 @@ unsigned char PL_utf8skip[] = { * remains 'A' */ #if '^' == 95 /* if defined(__MVS__) || defined(??) (VM/ESA?) 1047 */ -EXTCONST unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (IBM-1047) */ +EXTCONST unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-1047) */ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, @@ -169,7 +171,7 @@ EXTCONST unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (IBM-1047) */ 0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE }; -EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to UTF-8-mod */ +EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to I8 */ 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, @@ -190,7 +192,7 @@ EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to UTF-8-mod */ #endif /* 1047 */ #if '^' == 106 /* if defined(_OSD_POSIX) POSIX-BC */ -unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (POSIX-BC) */ +unsigned char PL_utf2e[] = { /* I8 to EBCDIC 
(POSIX-BC) */ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, @@ -209,7 +211,7 @@ unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (POSIX-BC) */ 0xDC, 0xC0, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xDD, 0xFC, 0xE0, 0xFE }; -unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to UTF-8-mod */ +unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to I8 */ 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, @@ -230,7 +232,7 @@ unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to UTF-8-mod */ #endif /* POSIX-BC */ #if '^' == 176 /* if defined(??) (OS/400?) 
037 */ -unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (IBM-037) */ +unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-037) */ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, @@ -249,7 +251,7 @@ unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (IBM-037) */ 0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE }; -unsigned char PL_e2utf[] = { /* EBCDIC (IBM-037) to UTF-8-mod */ +unsigned char PL_e2utf[] = { /* EBCDIC (IBM-037) to I8 */ 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, @@ -416,13 +418,10 @@ EXTCONST unsigned char PL_a2e[]; END_EXTERN_C -#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)] - /* EBCDIC-happy ways of converting native code to UTF-8 */ /* Native to iso-8859-1 */ #define NATIVE_TO_ASCII(ch) PL_e2a[(U8)(ch)] -#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* a clearer synonym */ #define ASCII_TO_NATIVE(ch) PL_a2e[(U8)(ch)] /* Transform after encoding */ #define NATIVE_TO_UTF(ch) PL_e2utf[(U8)(ch)] @@ -435,21 +434,7 @@ END_EXTERN_C #define ASCII_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(ch) : ASCII_TO_NATIVE(ch)) /* - * Note: we should try and be careful never to call the isXXX_utf8() functions - * unless we're pretty sure we've seen the beginning of a UTF-EBCDIC character - * Otherwise we risk loading in the heavy-duty swash_init and swash_fetch - * routines unnecessarily. - */ - -#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || UTF8_IS_INVARIANT(*p))) \ - ? 
isIDFIRST(*(p)) \ - : isIDFIRST_utf8((const U8*)p)) -#define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || UTF8_IS_INVARIANT(*p))) \ - ? isALNUM(*(p)) \ - : isALNUM_utf8((const U8*)p)) - -/* - The following table is adapted from tr16, it shows UTF-8-mod encoding of Unicode code points. + The following table is adapted from tr16, it shows I8 encoding of Unicode code points. Unicode Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th byte U+0000..U+007F 000000000xxxxxxx 0xxxxxxx @@ -463,7 +448,7 @@ END_EXTERN_C U+400000..U+3FFFFFF 0uvvvvvwwwwwzzzzzyyyyyxxxxx 1111110u 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx U+4000000..U+7FFFFFFF 0tuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 1111111t 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx - Note: The UTF-8-Mod transformation is valid for UCS-4 values X'0' to + Note: The I8 transformation is valid for UCS-4 values X'0' to X'7FFFFFFF' (the full extent of ISO/IEC 10646 coding space). */ @@ -477,9 +462,7 @@ END_EXTERN_C #define UNI_IS_INVARIANT(c) ((c) < 0xA0) -/* UTF-EBCDIC sematic macros - transform back into UTF-8-Mod and then compare */ -#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c)) -#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) +/* UTF-EBCDIC semantic macros - transform back into I8 and then compare */ #define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0) #define UTF8_IS_CONTINUATION(c) ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0) #define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0) @@ -491,12 +474,6 @@ END_EXTERN_C #define UTF_CONTINUATION_MASK ((U8)0x1f) #define UTF_ACCUMULATION_SHIFT 5 -#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT)|(NATIVE_TO_UTF(new) & UTF_CONTINUATION_MASK)) - -/* UTF-EBCDIC encode a downgradeable value */ -#define UTF8_EIGHT_BIT_HI(c) UTF_TO_NATIVE((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2)) -#define UTF8_EIGHT_BIT_LO(c) 
UTF_TO_NATIVE(((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK) - /* * Local variables: * c-indentation-style: bsd