From: Nick Ing-Simmons Date: Fri, 16 Mar 2001 17:23:21 +0000 (+0000) Subject: EBCDIC Fixes. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=db42d1485c38c3442e7b62e63d45f5e5b9b66ee1;p=p5sagit%2Fp5-mst-13.2.git EBCDIC Fixes. p4raw-id: //depot/perlio@9180 --- diff --git a/perl.h b/perl.h index 27cd66d..8a17159 100644 --- a/perl.h +++ b/perl.h @@ -2519,7 +2519,7 @@ EXT int PL_sig_num[]; #ifdef DOINIT #ifdef EBCDIC #if '^' == 106 /* if defined(_OSD_POSIX) POSIX-BC */ -EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (POSIX-BC) */ +EXT unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (POSIX-BC) */ 0, 1, 2, 3, 55, 45, 46, 47, 22, 5, 21, 11, 12, 13, 14, 15, 16, 17, 18, 19, 60, 61, 50, 38, @@ -2553,7 +2553,7 @@ EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (POSIX-BC) */ 140, 73, 205, 206, 203, 207, 204, 225, 112, 192, 222, 219, 220, 141, 142, 223 }; -EXT unsigned char PL_a2e[] = { /* EBCDIC (POSIX-BC) to ASCII (ISO8859-1) */ +EXT unsigned char PL_e2a[] = { /* EBCDIC (POSIX-BC) to ASCII (ISO8859-1) */ 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 16, 17, 18, 19, 157, 10, 8, 135, @@ -2589,7 +2589,7 @@ EXT unsigned char PL_a2e[] = { /* EBCDIC (POSIX-BC) to ASCII (ISO8859-1) */ }; #endif /* POSIX-BC */ #if '^' == 176 /* if defined(??) (OS/400?) 037 */ -EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-037) */ +EXT unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-037) */ 0, 1, 2, 3, 55, 45, 46, 47, 22, 5, 37, 11, 12, 13, 14, 15, 16, 17, 18, 19, 60, 61, 50, 38, @@ -2623,7 +2623,7 @@ EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-037) */ 140, 73, 205, 206, 203, 207, 204, 225, 112, 221, 222, 219, 220, 141, 142, 223 }; -EXT unsigned char PL_a2e[] = { /* EBCDIC (IBM-037) to ASCII (ISO8859-1) */ +EXT unsigned char PL_e2a[] = { /* EBCDIC (IBM-037) to ASCII (ISO8859-1) */ 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 16, 17, 18, 19, 157, 133, 8, 135, @@ -2659,7 +2659,7 @@ EXT unsigned char PL_a2e[] = { /* EBCDIC (IBM-037) to ASCII (ISO8859-1) */ }; #endif /* 037 */ #if '^' == 95 /* if defined(__MVS__) || defined(??) (VM/ESA?) 1047 */ -EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-1047) */ +EXT unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-1047) */ 0, 1, 2, 3, 55, 45, 46, 47, 22, 5, 21, 11, 12, 13, 14, 15, 16, 17, 18, 19, 60, 61, 50, 38, @@ -2693,7 +2693,7 @@ EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-1047) */ 140, 73, 205, 206, 203, 207, 204, 225, 112, 221, 222, 219, 220, 141, 142, 223 }; -EXT unsigned char PL_a2e[] = { /* EBCDIC (IBM-1047) to ASCII (ISO8859-1) */ +EXT unsigned char PL_e2a[] = { /* EBCDIC (IBM-1047) to ASCII (ISO8859-1) */ 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 16, 17, 18, 19, 157, 10, 8, 135, diff --git a/sv.c b/sv.c index b96cc45..4d3181a 100644 --- a/sv.c +++ b/sv.c @@ -2954,7 +2954,7 @@ if all the bytes have hibit clear. STRLEN Perl_sv_utf8_upgrade(pTHX_ register SV *sv) { - char *s, *t, *e; + U8 *s, *t, *e; int hibit = 0; if (!sv) @@ -2966,25 +2966,24 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) if (SvUTF8(sv)) return SvCUR(sv); + if (SvREADONLY(sv) && SvFAKE(sv)) { + sv_force_normal(sv); + } + /* This function could be much more efficient if we had a FLAG in SVs * to signal if there are any hibit chars in the PV. * Given that there isn't make loop fast as possible */ - s = SvPVX(sv); - e = SvEND(sv); + s = (U8 *) SvPVX(sv); + e = (U8 *) SvEND(sv); t = s; while (t < e) { - if ((hibit = UTF8_IS_CONTINUED(*t++))) + if ((hibit = UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*t++)))) break; } - if (hibit) { STRLEN len; - if (SvREADONLY(sv) && SvFAKE(sv)) { - sv_force_normal(sv); - s = SvPVX(sv); - } len = SvCUR(sv) + 1; /* Plus the \0 */ SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len); SvCUR(sv) = len - 1; @@ -2992,6 +2991,12 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) Safefree(s); /* No longer using what was there before. */ SvLEN(sv) = len; /* No longer know the real size. */ } +#ifdef EBCDIC + else { + for (t = s; t < e; t++) + *t = NATIVE_TO_ASCII(*t); + } +#endif /* Mark as UTF-8 even if no hibit - saves scanning loop */ SvUTF8_on(sv); return SvCUR(sv); @@ -4755,8 +4760,7 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2) char *pv2; STRLEN cur2; I32 eq = 0; - bool pv1tmp = FALSE; - bool pv2tmp = FALSE; + char *tpv = Nullch; if (!sv1) { pv1 = ""; @@ -4775,31 +4779,33 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2) /* do not utf8ize the comparands as a side-effect */ if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) { bool is_utf8 = TRUE; - + /* UTF-8ness differs */ if (PL_hints & HINT_UTF8_DISTINCT) return FALSE; if (SvUTF8(sv1)) { + /* sv1 is the UTF-8 one , If is equal it must be downgrade-able */ char *pv = (char*)bytes_from_utf8((U8*)pv1, &cur1, &is_utf8); - - if ((pv1tmp = (pv != pv1))) - pv1 = pv; + if (pv != pv1) + pv1 = tpv = pv; } else { + /* sv2 is the UTF-8 one , If is equal it must be downgrade-able */ char *pv = (char *)bytes_from_utf8((U8*)pv2, &cur2, &is_utf8); - - if ((pv2tmp = (pv != pv2))) - pv2 = pv; + if (pv != pv2) + pv2 = tpv = pv; + } + if (is_utf8) { + /* Downgrade not possible - cannot be eq */ + return FALSE; } } if (cur1 == cur2) eq = memEQ(pv1, pv2, cur1); - if (pv1tmp) - Safefree(pv1); - if (pv2tmp) - Safefree(pv2); + if (tpv != Nullch) + Safefree(tpv); return eq; } diff --git a/toke.c b/toke.c index 0bc4a53..b802512 100644 --- a/toke.c +++ b/toke.c @@ -1271,11 +1271,11 @@ S_scan_const(pTHX_ char *start) if (isLOWER(min)) { for (i = min; i <= max; i++) if (isLOWER(i)) - *d++ = i; + *d++ = NATIVE_TO_NEED(has_utf8,i); } else { for (i = min; i <= max; i++) if (isUPPER(i)) - *d++ = i; + *d++ = NATIVE_TO_NEED(has_utf8,i); } } else @@ -1314,7 +1314,7 @@ S_scan_const(pTHX_ char *start) else if (*s == '(' && PL_lex_inpat && s[1] == '?') { if (s[2] == '#') { while (s < send && *s != ')') - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } else if (s[2] == '{' /* This should match regcomp.c */ || ((s[2] == 'p' || s[2] == '?') && s[3] == '{')) @@ -1337,7 +1337,7 @@ S_scan_const(pTHX_ char *start) yyerror("Sequence (?{...}) not terminated or not {}-balanced"); } while (s < regparse) - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } } @@ -1345,7 +1345,7 @@ S_scan_const(pTHX_ char *start) else if (*s == '#' && PL_lex_inpat && ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) { while (s+1 < send && *s != '\n') - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } /* check for embedded arrays @@ -1371,8 +1371,8 @@ S_scan_const(pTHX_ char *start) /* some backslashes we leave behind */ if (*leaveit && *s && strchr(leaveit, *s)) { - *d++ = '\\'; - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,'\\'); + *d++ = NATIVE_TO_NEED(has_utf8,*s++); continue; } @@ -1448,13 +1448,13 @@ S_scan_const(pTHX_ char *start) NUM_ESCAPE_INSERT: /* Insert oct or hex escaped character. * There will always enough room in sv since such - * escapes will be longer than any UT-F8 sequence + * escapes will be longer than any UTF-8 sequence * they can end up as. */ /* We need to map to chars to ASCII before doing the tests to cover EBCDIC */ - if (NATIVE_TO_ASCII(uv) > 127) { + if (NATIVE_TO_UNI(uv) > 127) { if (!has_utf8 && uv > 255) { /* Might need to recode whatever we have * accumulated so far if it contains any @@ -1465,28 +1465,23 @@ S_scan_const(pTHX_ char *start) */ int hicount = 0; char *c; - for (c = SvPVX(sv); c < d; c++) { - if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*c))) + if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*c))) { hicount++; + } } - if (hicount) { - char *old_pvx = SvPVX(sv); - char *src, *dst; - - d = SvGROW(sv, - SvLEN(sv) + hicount + 1) + - (d - old_pvx); - - src = d - 1; - d += hicount; - dst = d - 1; - - while (src < dst) { + if (hicount || NATIVE_TO_ASCII('A') != 'A') { + STRLEN offset = d - SvPVX(sv); + U8 *src, *dst; + d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset; + src = (U8 *)d - 1; + dst = src+hicount; + d += hicount; + while (src >= (U8 *)SvPVX(sv)) { U8 ch = NATIVE_TO_ASCII(*src); if (UTF8_IS_CONTINUED(ch)) { - *dst-- = UTF8_EIGHT_BIT_LO(ch); - *dst-- = UTF8_EIGHT_BIT_HI(ch); + *dst-- = UTF8_EIGHT_BIT_LO(ch); + *dst-- = UTF8_EIGHT_BIT_HI(ch); } else { *dst-- = ch; @@ -1512,7 +1507,7 @@ S_scan_const(pTHX_ char *start) } } else { - *d++ = (char)uv; + *d++ = NATIVE_TO_NEED(has_utf8,uv); } continue; @@ -1572,31 +1567,31 @@ S_scan_const(pTHX_ char *start) if (isLOWER(c)) c = toUPPER(c); #endif - *d++ = toCTRL(c); + *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c)); } continue; /* printf-style backslashes, formfeeds, newlines, etc */ case 'b': - *d++ = '\b'; + *d++ = NATIVE_TO_NEED(has_utf8,'\b'); break; case 'n': - *d++ = '\n'; + *d++ = NATIVE_TO_NEED(has_utf8,'\n'); break; case 'r': - *d++ = '\r'; + *d++ = NATIVE_TO_NEED(has_utf8,'\r'); break; case 'f': - *d++ = '\f'; + *d++ = NATIVE_TO_NEED(has_utf8,'\f'); break; case 't': - *d++ = '\t'; + *d++ = NATIVE_TO_NEED(has_utf8,'\t'); break; case 'e': - *d++ = ASCII_TO_NATIVE('\033'); + *d++ = ASCII_TO_NEED(has_utf8,'\033'); break; case 'a': - *d++ = ASCII_TO_NATIVE('\007'); + *d++ = ASCII_TO_NEED(has_utf8,'\007'); break; } /* end switch */ @@ -1605,6 +1600,8 @@ S_scan_const(pTHX_ char *start) } /* end if (backslash) */ default_action: +#ifndef EBCDIC + /* The 'has_utf8' here is very dubious */ if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*s)) && (this_utf8 || has_utf8)) { STRLEN len = (STRLEN) -1; UV uv; @@ -1630,8 +1627,8 @@ S_scan_const(pTHX_ char *start) } continue; } - - *d++ = *s++; +#endif + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } /* while loop to process each character */ /* terminate the string and set up the sv */ diff --git a/utf8.c b/utf8.c index 7302bb7..227dcba 100644 --- a/utf8.c +++ b/utf8.c @@ -668,9 +668,9 @@ is unchanged. Do nothing if C points to 0. Sets C to U8 * Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8) { - U8 *send; U8 *d; U8 *start = s; + U8 *send; I32 count = 0; if (!*is_utf8) @@ -679,28 +679,30 @@ Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8) /* ensure valid UTF8 and chars < 256 before converting string */ for (send = s + *len; s < send;) { U8 c = *s++; - if (!UTF8_IS_ASCII(c)) { - if (UTF8_IS_CONTINUATION(c) || s >= send || - !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c)) + if (!UTF8_IS_ASCII(c)) { + if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send && + (c = *s++) && UTF8_IS_CONTINUATION(c)) + count++; + else return start; - s++, count++; - } + } } *is_utf8 = 0; +#ifndef EBCDIC + /* Can use as-is if no high chars */ if (!count) return start; +#endif Newz(801, d, (*len) - count + 1, U8); s = start; start = d; while (s < send) { U8 c = *s++; - - if (UTF8_IS_ASCII(c)) - *d++ = c; - else - *d++ = UTF8_ACCUMULATE(c, *s++); + if (!UTF8_IS_ASCII(c)) + c = UTF8_ACCUMULATE(c, *s++); + *d++ = ASCII_TO_NATIVE(c); } *d = '\0'; *len = d - start; @@ -729,11 +731,10 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len) dst = d; while (s < send) { - if (UTF8_IS_ASCII(*s)) - *d++ = *s++; + UV uv = NATIVE_TO_ASCII(*s++); + if (UTF8_IS_ASCII(uv)) + *d++ = uv; else { - UV uv = *s++; - *d++ = UTF8_EIGHT_BIT_HI(uv); *d++ = UTF8_EIGHT_BIT_LO(uv); } diff --git a/utf8.h b/utf8.h index 160e5d2..5c920c9 100644 --- a/utf8.h +++ b/utf8.h @@ -88,7 +88,7 @@ END_EXTERN_C #define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd)) #define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf)) #define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80) -#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) != 0xc0) +#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0) #define UTF8_CONTINUATION_MASK ((U8)0x3f) #define UTF8_ACCUMULATION_SHIFT 6 @@ -139,14 +139,18 @@ END_EXTERN_C /* EBCDIC-happy ways of converting native code to UTF8 */ #ifdef EBCDIC -#define NATIVE_TO_ASCII(ch) PL_e2a[(ch)] -#define ASCII_TO_NATIVE(ch) PL_a2e[(ch)] -#define UNI_TO_NATIVE(ch) (((ch) > 0x100) ? (ch) : (UV) PL_a2e[(ch)]) -#define NATIVE_TO_UNI(ch) (((ch) > 0x100) ? (ch) : (UV) PL_e2a[(ch)]) +#define NATIVE_TO_ASCII(ch) PL_e2a[(ch)&255] +#define ASCII_TO_NATIVE(ch) PL_a2e[(ch)&255] +#define NATIVE_TO_UNI(ch) (((ch) > 255) ? (ch) : (UV) PL_e2a[(ch)]) +#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : (UV) PL_a2e[(ch)]) +#define NATIVE_TO_NEED(enc,ch) ((enc) ? NATIVE_TO_ASCII(ch) : (ch)) +#define ASCII_TO_NEED(enc,ch) ((enc) ? (ch) : ASCII_TO_NATIVE(ch)) #else -#define NATIVE_TO_ASCII(ch) (ch) -#define ASCII_TO_NATIVE(ch) (ch) -#define UNI_TO_NATIVE(ch) (ch) -#define NATIVE_TO_UNI(ch) (ch) +#define NATIVE_TO_ASCII(ch) (ch) +#define ASCII_TO_NATIVE(ch) (ch) +#define UNI_TO_NATIVE(ch) (ch) +#define NATIVE_TO_UNI(ch) (ch) +#define NATIVE_TO_NEED(enc,ch) (ch) +#define ASCII_TO_NEED(enc,ch) (ch) #endif