#ifdef DOINIT
#ifdef EBCDIC
#if '^' == 106 /* if defined(_OSD_POSIX) POSIX-BC */
-EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (POSIX-BC) */
+EXT unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (POSIX-BC) */
0, 1, 2, 3, 55, 45, 46, 47,
22, 5, 21, 11, 12, 13, 14, 15,
16, 17, 18, 19, 60, 61, 50, 38,
140, 73, 205, 206, 203, 207, 204, 225,
112, 192, 222, 219, 220, 141, 142, 223
};
-EXT unsigned char PL_a2e[] = { /* EBCDIC (POSIX-BC) to ASCII (ISO8859-1) */
+EXT unsigned char PL_e2a[] = { /* EBCDIC (POSIX-BC) to ASCII (ISO8859-1) */
0, 1, 2, 3, 156, 9, 134, 127,
151, 141, 142, 11, 12, 13, 14, 15,
16, 17, 18, 19, 157, 10, 8, 135,
};
#endif /* POSIX-BC */
#if '^' == 176 /* if defined(??) (OS/400?) 037 */
-EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-037) */
+EXT unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-037) */
0, 1, 2, 3, 55, 45, 46, 47,
22, 5, 37, 11, 12, 13, 14, 15,
16, 17, 18, 19, 60, 61, 50, 38,
140, 73, 205, 206, 203, 207, 204, 225,
112, 221, 222, 219, 220, 141, 142, 223
};
-EXT unsigned char PL_a2e[] = { /* EBCDIC (IBM-037) to ASCII (ISO8859-1) */
+EXT unsigned char PL_e2a[] = { /* EBCDIC (IBM-037) to ASCII (ISO8859-1) */
0, 1, 2, 3, 156, 9, 134, 127,
151, 141, 142, 11, 12, 13, 14, 15,
16, 17, 18, 19, 157, 133, 8, 135,
};
#endif /* 037 */
#if '^' == 95 /* if defined(__MVS__) || defined(??) (VM/ESA?) 1047 */
-EXT unsigned char PL_e2a[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-1047) */
+EXT unsigned char PL_a2e[] = { /* ASCII (ISO8859-1) to EBCDIC (IBM-1047) */
0, 1, 2, 3, 55, 45, 46, 47,
22, 5, 21, 11, 12, 13, 14, 15,
16, 17, 18, 19, 60, 61, 50, 38,
140, 73, 205, 206, 203, 207, 204, 225,
112, 221, 222, 219, 220, 141, 142, 223
};
-EXT unsigned char PL_a2e[] = { /* EBCDIC (IBM-1047) to ASCII (ISO8859-1) */
+EXT unsigned char PL_e2a[] = { /* EBCDIC (IBM-1047) to ASCII (ISO8859-1) */
0, 1, 2, 3, 156, 9, 134, 127,
151, 141, 142, 11, 12, 13, 14, 15,
16, 17, 18, 19, 157, 10, 8, 135,
STRLEN
Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
{
- char *s, *t, *e;
+ U8 *s, *t, *e;
int hibit = 0;
if (!sv)
if (SvUTF8(sv))
return SvCUR(sv);
+ if (SvREADONLY(sv) && SvFAKE(sv)) {
+ sv_force_normal(sv);
+ }
+
/* This function could be much more efficient if we had a FLAG in SVs
* to signal if there are any hibit chars in the PV.
* Given that there isn't make loop fast as possible
*/
- s = SvPVX(sv);
- e = SvEND(sv);
+ s = (U8 *) SvPVX(sv);
+ e = (U8 *) SvEND(sv);
t = s;
while (t < e) {
- if ((hibit = UTF8_IS_CONTINUED(*t++)))
+ if ((hibit = UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*t++))))
break;
}
-
if (hibit) {
STRLEN len;
- if (SvREADONLY(sv) && SvFAKE(sv)) {
- sv_force_normal(sv);
- s = SvPVX(sv);
- }
len = SvCUR(sv) + 1; /* Plus the \0 */
SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len);
SvCUR(sv) = len - 1;
Safefree(s); /* No longer using what was there before. */
SvLEN(sv) = len; /* No longer know the real size. */
}
+#ifdef EBCDIC
+ else {
+ for (t = s; t < e; t++)
+ *t = NATIVE_TO_ASCII(*t);
+ }
+#endif
/* Mark as UTF-8 even if no hibit - saves scanning loop */
SvUTF8_on(sv);
return SvCUR(sv);
char *pv2;
STRLEN cur2;
I32 eq = 0;
- bool pv1tmp = FALSE;
- bool pv2tmp = FALSE;
+ char *tpv = Nullch;
if (!sv1) {
pv1 = "";
/* do not utf8ize the comparands as a side-effect */
if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) {
bool is_utf8 = TRUE;
-
+ /* UTF-8ness differs */
if (PL_hints & HINT_UTF8_DISTINCT)
return FALSE;
if (SvUTF8(sv1)) {
+ /* sv1 is the UTF-8 one , If is equal it must be downgrade-able */
char *pv = (char*)bytes_from_utf8((U8*)pv1, &cur1, &is_utf8);
-
- if ((pv1tmp = (pv != pv1)))
- pv1 = pv;
+ if (pv != pv1)
+ pv1 = tpv = pv;
}
else {
+ /* sv2 is the UTF-8 one , If is equal it must be downgrade-able */
char *pv = (char *)bytes_from_utf8((U8*)pv2, &cur2, &is_utf8);
-
- if ((pv2tmp = (pv != pv2)))
- pv2 = pv;
+ if (pv != pv2)
+ pv2 = tpv = pv;
+ }
+ if (is_utf8) {
+ /* Downgrade not possible - cannot be eq */
+ return FALSE;
}
}
if (cur1 == cur2)
eq = memEQ(pv1, pv2, cur1);
- if (pv1tmp)
- Safefree(pv1);
- if (pv2tmp)
- Safefree(pv2);
+ if (tpv != Nullch)
+ Safefree(tpv);
return eq;
}
if (isLOWER(min)) {
for (i = min; i <= max; i++)
if (isLOWER(i))
- *d++ = i;
+ *d++ = NATIVE_TO_NEED(has_utf8,i);
} else {
for (i = min; i <= max; i++)
if (isUPPER(i))
- *d++ = i;
+ *d++ = NATIVE_TO_NEED(has_utf8,i);
}
}
else
else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
if (s[2] == '#') {
while (s < send && *s != ')')
- *d++ = *s++;
+ *d++ = NATIVE_TO_NEED(has_utf8,*s++);
}
else if (s[2] == '{' /* This should match regcomp.c */
|| ((s[2] == 'p' || s[2] == '?') && s[3] == '{'))
yyerror("Sequence (?{...}) not terminated or not {}-balanced");
}
while (s < regparse)
- *d++ = *s++;
+ *d++ = NATIVE_TO_NEED(has_utf8,*s++);
}
}
else if (*s == '#' && PL_lex_inpat &&
((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
while (s+1 < send && *s != '\n')
- *d++ = *s++;
+ *d++ = NATIVE_TO_NEED(has_utf8,*s++);
}
/* check for embedded arrays
/* some backslashes we leave behind */
if (*leaveit && *s && strchr(leaveit, *s)) {
- *d++ = '\\';
- *d++ = *s++;
+ *d++ = NATIVE_TO_NEED(has_utf8,'\\');
+ *d++ = NATIVE_TO_NEED(has_utf8,*s++);
continue;
}
NUM_ESCAPE_INSERT:
/* Insert oct or hex escaped character.
* There will always enough room in sv since such
- * escapes will be longer than any UT-F8 sequence
+ * escapes will be longer than any UTF-8 sequence
* they can end up as. */
/* We need to map to chars to ASCII before doing the tests
to cover EBCDIC
*/
- if (NATIVE_TO_ASCII(uv) > 127) {
+ if (NATIVE_TO_UNI(uv) > 127) {
if (!has_utf8 && uv > 255) {
/* Might need to recode whatever we have
* accumulated so far if it contains any
*/
int hicount = 0;
char *c;
-
for (c = SvPVX(sv); c < d; c++) {
- if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*c)))
+ if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*c))) {
hicount++;
+ }
}
- if (hicount) {
- char *old_pvx = SvPVX(sv);
- char *src, *dst;
-
- d = SvGROW(sv,
- SvLEN(sv) + hicount + 1) +
- (d - old_pvx);
-
- src = d - 1;
- d += hicount;
- dst = d - 1;
-
- while (src < dst) {
+ if (hicount || NATIVE_TO_ASCII('A') != 'A') {
+ STRLEN offset = d - SvPVX(sv);
+ U8 *src, *dst;
+ d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset;
+ src = (U8 *)d - 1;
+ dst = src+hicount;
+ d += hicount;
+ while (src >= (U8 *)SvPVX(sv)) {
U8 ch = NATIVE_TO_ASCII(*src);
if (UTF8_IS_CONTINUED(ch)) {
- *dst-- = UTF8_EIGHT_BIT_LO(ch);
- *dst-- = UTF8_EIGHT_BIT_HI(ch);
+ *dst-- = UTF8_EIGHT_BIT_LO(ch);
+ *dst-- = UTF8_EIGHT_BIT_HI(ch);
}
else {
*dst-- = ch;
}
}
else {
- *d++ = (char)uv;
+ *d++ = NATIVE_TO_NEED(has_utf8,uv);
}
continue;
if (isLOWER(c))
c = toUPPER(c);
#endif
- *d++ = toCTRL(c);
+ *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c));
}
continue;
/* printf-style backslashes, formfeeds, newlines, etc */
case 'b':
- *d++ = '\b';
+ *d++ = NATIVE_TO_NEED(has_utf8,'\b');
break;
case 'n':
- *d++ = '\n';
+ *d++ = NATIVE_TO_NEED(has_utf8,'\n');
break;
case 'r':
- *d++ = '\r';
+ *d++ = NATIVE_TO_NEED(has_utf8,'\r');
break;
case 'f':
- *d++ = '\f';
+ *d++ = NATIVE_TO_NEED(has_utf8,'\f');
break;
case 't':
- *d++ = '\t';
+ *d++ = NATIVE_TO_NEED(has_utf8,'\t');
break;
case 'e':
- *d++ = ASCII_TO_NATIVE('\033');
+ *d++ = ASCII_TO_NEED(has_utf8,'\033');
break;
case 'a':
- *d++ = ASCII_TO_NATIVE('\007');
+ *d++ = ASCII_TO_NEED(has_utf8,'\007');
break;
} /* end switch */
} /* end if (backslash) */
default_action:
+#ifndef EBCDIC
+ /* The 'has_utf8' here is very dubious */
if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*s)) && (this_utf8 || has_utf8)) {
STRLEN len = (STRLEN) -1;
UV uv;
}
continue;
}
-
- *d++ = *s++;
+#endif
+ *d++ = NATIVE_TO_NEED(has_utf8,*s++);
} /* while loop to process each character */
/* terminate the string and set up the sv */
U8 *
Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
{
- U8 *send;
U8 *d;
U8 *start = s;
+ U8 *send;
I32 count = 0;
if (!*is_utf8)
/* ensure valid UTF8 and chars < 256 before converting string */
for (send = s + *len; s < send;) {
U8 c = *s++;
- if (!UTF8_IS_ASCII(c)) {
- if (UTF8_IS_CONTINUATION(c) || s >= send ||
- !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c))
+ if (!UTF8_IS_ASCII(c)) {
+ if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
+ (c = *s++) && UTF8_IS_CONTINUATION(c))
+ count++;
+ else
return start;
- s++, count++;
- }
+ }
}
*is_utf8 = 0;
+#ifndef EBCDIC
+ /* Can use as-is if no high chars */
if (!count)
return start;
+#endif
Newz(801, d, (*len) - count + 1, U8);
s = start; start = d;
while (s < send) {
U8 c = *s++;
-
- if (UTF8_IS_ASCII(c))
- *d++ = c;
- else
- *d++ = UTF8_ACCUMULATE(c, *s++);
+ if (!UTF8_IS_ASCII(c))
+ c = UTF8_ACCUMULATE(c, *s++);
+ *d++ = ASCII_TO_NATIVE(c);
}
*d = '\0';
*len = d - start;
dst = d;
while (s < send) {
- if (UTF8_IS_ASCII(*s))
- *d++ = *s++;
+ UV uv = NATIVE_TO_ASCII(*s++);
+ if (UTF8_IS_ASCII(uv))
+ *d++ = uv;
else {
- UV uv = *s++;
-
*d++ = UTF8_EIGHT_BIT_HI(uv);
*d++ = UTF8_EIGHT_BIT_LO(uv);
}
#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
-#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) != 0xc0)
+#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)
#define UTF8_CONTINUATION_MASK ((U8)0x3f)
#define UTF8_ACCUMULATION_SHIFT 6
/* EBCDIC-happy ways of converting native code to UTF8 */
#ifdef EBCDIC
-#define NATIVE_TO_ASCII(ch) PL_e2a[(ch)]
-#define ASCII_TO_NATIVE(ch) PL_a2e[(ch)]
-#define UNI_TO_NATIVE(ch) (((ch) > 0x100) ? (ch) : (UV) PL_a2e[(ch)])
-#define NATIVE_TO_UNI(ch) (((ch) > 0x100) ? (ch) : (UV) PL_e2a[(ch)])
+#define NATIVE_TO_ASCII(ch) PL_e2a[(ch)&255]
+#define ASCII_TO_NATIVE(ch) PL_a2e[(ch)&255]
+#define NATIVE_TO_UNI(ch) (((ch) > 255) ? (ch) : (UV) PL_e2a[(ch)])
+#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : (UV) PL_a2e[(ch)])
+#define NATIVE_TO_NEED(enc,ch) ((enc) ? NATIVE_TO_ASCII(ch) : (ch))
+#define ASCII_TO_NEED(enc,ch) ((enc) ? (ch) : ASCII_TO_NATIVE(ch))
#else
-#define NATIVE_TO_ASCII(ch) (ch)
-#define ASCII_TO_NATIVE(ch) (ch)
-#define UNI_TO_NATIVE(ch) (ch)
-#define NATIVE_TO_UNI(ch) (ch)
+#define NATIVE_TO_ASCII(ch) (ch)
+#define ASCII_TO_NATIVE(ch) (ch)
+#define UNI_TO_NATIVE(ch) (ch)
+#define NATIVE_TO_UNI(ch) (ch)
+#define NATIVE_TO_NEED(enc,ch) (ch)
+#define ASCII_TO_NEED(enc,ch) (ch)
#endif