PerlIO fixups for Win32:

[p5sagit/p5-mst-13.2.git] / toke.c
diff --git a/toke.c b/toke.c

index 0bc4a53..00fe0b5 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -36,8 +36,12 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen);
 #define XFAKEBRACK 128
 #define XENUMMASK 127
 
-/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/
-#define UTF (PL_hints & HINT_UTF8)
+#ifdef EBCDIC
+/* For now 'use utf8' does not affect tokenizer on EBCDIC */
+#define UTF (PL_linestr && DO_UTF8(PL_linestr))
+#else
+#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
+#endif
 
 /* In variables name $^X, these are the legal values for X.
  * 1999-02-27 mjd-perl-patch@plover.com */
@@ -917,8 +921,11 @@ S_tokeq(pTHX_ SV *sv)
     if (s == send)
        goto finish;
     d = s;
-    if ( PL_hints & HINT_NEW_STRING )
+    if ( PL_hints & HINT_NEW_STRING ) {
        pv = sv_2mortal(newSVpvn(SvPVX(pv), len));
+       if (SvUTF8(sv))
+           SvUTF8_on(pv);
+    }
     while (s < send) {
        if (*s == '\\') {
            if (s + 1 < send && (s[1] == '\\'))
@@ -1215,22 +1222,22 @@ S_scan_const(pTHX_ char *start)
     register char *d = SvPVX(sv);              /* destination for copies */
     bool dorange = FALSE;                      /* are we in a translit range? */
     bool didrange = FALSE;                     /* did we just finish a range? */
-    bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr));
-                                               /* the constant is UTF8 */
+    I32  has_utf8 = FALSE;                     /* Output constant is UTF8 */
+    I32  this_utf8 = UTF;                      /* The source string is assumed to be UTF8 */
     UV uv;
 
-    I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
-       ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
-       : UTF;
-    I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
-       ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ?
-                                               OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF))
-       : UTF;
     const char *leaveit =      /* set of acceptably-backslashed characters */
        PL_lex_inpat
            ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
            : "";
 
+    if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+       /* If we are doing a trans and we know we want UTF8 set expectation */
+       has_utf8   = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
+       this_utf8  = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+    }
+
+
     while (s < send || dorange) {
         /* get transliterations out of the way (they're most literal) */
        if (PL_lex_inwhat == OP_TRANS) {
@@ -1240,17 +1247,18 @@ S_scan_const(pTHX_ char *start)
                I32 min;                        /* first character in range */
                I32 max;                        /* last character in range */
 
-               if (utf) {
+               if (has_utf8) {
                    char *c = (char*)utf8_hop((U8*)d, -1);
                    char *e = d++;
                    while (e-- > c)
                        *(e + 1) = *e;
-                   *c = (char)0xff;
+                   *c = UTF_TO_NATIVE(0xff);
                    /* mark the range as done, and continue */
                    dorange = FALSE;
                    didrange = TRUE;
                    continue;
                }
+
                i = d - SvPVX(sv);              /* remember current offset */
                SvGROW(sv, SvLEN(sv) + 256);    /* never more than 256 chars in a range */
                d = SvPVX(sv) + i;              /* refresh d after realloc */
@@ -1271,11 +1279,11 @@ S_scan_const(pTHX_ char *start)
                    if (isLOWER(min)) {
                        for (i = min; i <= max; i++)
                            if (isLOWER(i))
-                               *d++ = i;
+                               *d++ = NATIVE_TO_NEED(has_utf8,i);
                    } else {
                        for (i = min; i <= max; i++)
                            if (isUPPER(i))
-                               *d++ = i;
+                               *d++ = NATIVE_TO_NEED(has_utf8,i);
                    }
                }
                else
@@ -1294,8 +1302,8 @@ S_scan_const(pTHX_ char *start)
                if (didrange) {
                    Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
                }
-               if (utf) {
-                   *d++ = (char)0xff;  /* use illegal utf8 byte--see pmtrans */
+               if (has_utf8) {
+                   *d++ = UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
                    s++;
                    continue;
                }
@@ -1314,7 +1322,7 @@ S_scan_const(pTHX_ char *start)
        else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
            if (s[2] == '#') {
                while (s < send && *s != ')')
-                   *d++ = *s++;
+                   *d++ = NATIVE_TO_NEED(has_utf8,*s++);
            }
            else if (s[2] == '{' /* This should match regcomp.c */
                     || ((s[2] == 'p' || s[2] == '?') && s[3] == '{'))
@@ -1337,7 +1345,7 @@ S_scan_const(pTHX_ char *start)
                    yyerror("Sequence (?{...}) not terminated or not {}-balanced");
                }
                while (s < regparse)
-                   *d++ = *s++;
+                   *d++ = NATIVE_TO_NEED(has_utf8,*s++);
            }
        }
 
@@ -1345,7 +1353,7 @@ S_scan_const(pTHX_ char *start)
        else if (*s == '#' && PL_lex_inpat &&
          ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
            while (s+1 < send && *s != '\n')
-               *d++ = *s++;
+               *d++ = NATIVE_TO_NEED(has_utf8,*s++);
        }
 
        /* check for embedded arrays
@@ -1365,14 +1373,16 @@ S_scan_const(pTHX_ char *start)
                break;          /* in regexp, $ might be tail anchor */
        }
 
+       /* End of else if chain - OP_TRANS rejoin rest */
+
        /* backslashes */
        if (*s == '\\' && s+1 < send) {
            s++;
 
            /* some backslashes we leave behind */
            if (*leaveit && *s && strchr(leaveit, *s)) {
-               *d++ = '\\';
-               *d++ = *s++;
+               *d++ = NATIVE_TO_NEED(has_utf8,'\\');
+               *d++ = NATIVE_TO_NEED(has_utf8,*s++);
                continue;
            }
 
@@ -1448,13 +1458,13 @@ S_scan_const(pTHX_ char *start)
              NUM_ESCAPE_INSERT:
                /* Insert oct or hex escaped character.
                 * There will always enough room in sv since such
-                * escapes will be longer than any UT-F8 sequence
+                * escapes will be longer than any UTF-8 sequence
                 * they can end up as. */
                
                /* We need to map to chars to ASCII before doing the tests
                   to cover EBCDIC
                */
-               if (NATIVE_TO_ASCII(uv) > 127) {
+               if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) {
                    if (!has_utf8 && uv > 255) {
                        /* Might need to recode whatever we have
                         * accumulated so far if it contains any
@@ -1464,32 +1474,27 @@ S_scan_const(pTHX_ char *start)
                         *  this rescan? --jhi)
                         */
                        int hicount = 0;
-                       char *c;
-
-                       for (c = SvPVX(sv); c < d; c++) {
-                           if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*c)))
+                       U8 *c;
+                       for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) {
+                           if (!NATIVE_IS_INVARIANT(*c)) {
                                hicount++;
+                           }
                        }
                        if (hicount) {
-                           char *old_pvx = SvPVX(sv);
-                           char *src, *dst;
-                       
-                           d = SvGROW(sv,
-                                      SvLEN(sv) + hicount + 1) +
-                                        (d - old_pvx);
-
-                           src = d - 1;
-                           d += hicount;
-                           dst = d - 1;
-
-                           while (src < dst) {
-                               U8 ch = NATIVE_TO_ASCII(*src);
-                               if (UTF8_IS_CONTINUED(ch)) {
-                                   *dst-- = UTF8_EIGHT_BIT_LO(ch);
-                                   *dst-- = UTF8_EIGHT_BIT_HI(ch);
+                           STRLEN offset = d - SvPVX(sv);
+                           U8 *src, *dst;
+                           d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset;
+                           src = (U8 *)d - 1;
+                           dst = src+hicount;
+                           d  += hicount;
+                           while (src >= (U8 *)SvPVX(sv)) {
+                               if (!NATIVE_IS_INVARIANT(*src)) {
+                                   U8 ch = NATIVE_TO_ASCII(*src);
+                                   *dst-- = UTF8_EIGHT_BIT_LO(ch);
+                                   *dst-- = UTF8_EIGHT_BIT_HI(ch);
                                }
                                else {
-                                   *dst-- = ch;
+                                   *dst-- = *src;
                                }
                                src--;
                            }
@@ -1504,7 +1509,6 @@ S_scan_const(pTHX_ char *start)
                            PL_sublex_info.sub_op->op_private |=
                                (PL_lex_repl ? OPpTRANS_FROM_UTF
                                             : OPpTRANS_TO_UTF);
-                           utf = TRUE;
                        }
                     }
                    else {
@@ -1512,7 +1516,7 @@ S_scan_const(pTHX_ char *start)
                    }
                }
                else {
-                   *d++ = (char)uv;
+                   *d++ = (char) uv;
                }
                continue;
 
@@ -1572,31 +1576,31 @@ S_scan_const(pTHX_ char *start)
                    if (isLOWER(c))
                        c = toUPPER(c);
 #endif
-                   *d++ = toCTRL(c);
+                   *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c));
                }
                continue;
 
            /* printf-style backslashes, formfeeds, newlines, etc */
            case 'b':
-               *d++ = '\b';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\b');
                break;
            case 'n':
-               *d++ = '\n';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\n');
                break;
            case 'r':
-               *d++ = '\r';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\r');
                break;
            case 'f':
-               *d++ = '\f';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\f');
                break;
            case 't':
-               *d++ = '\t';
+               *d++ = NATIVE_TO_NEED(has_utf8,'\t');
                break;
            case 'e':
-               *d++ = ASCII_TO_NATIVE('\033');
+               *d++ = ASCII_TO_NEED(has_utf8,'\033');
                break;
            case 'a':
-               *d++ = ASCII_TO_NATIVE('\007');
+               *d++ = ASCII_TO_NEED(has_utf8,'\007');
                break;
            } /* end switch */
 
@@ -1605,41 +1609,40 @@ S_scan_const(pTHX_ char *start)
        } /* end if (backslash) */
 
     default_action:
-       if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*s)) && (this_utf8 || has_utf8)) {
-           STRLEN len = (STRLEN) -1;
-           UV uv;
-           if (this_utf8) {
-               uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0);
-           }
-           if (len == (STRLEN)-1) {
-               /* Illegal UTF8 (a high-bit byte), make it valid. */
-               char *old_pvx = SvPVX(sv);
-               /* need space for one extra char (NOTE: SvCUR() not set here) */
-               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
-               d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++);
-           }
-           else {
-               while (len--)
-                   *d++ = *s++;
-           }
-           has_utf8 = TRUE;
-          if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
-              PL_sublex_info.sub_op->op_private |=
-                  (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
-              utf = TRUE;
-          }
-           continue;
-       }
-
-       *d++ = *s++;
+       /* If we started with encoded form, or already know we want it
+          and then encode the next character */
+       if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) {
+           STRLEN len  = 1;
+           UV uv       = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
+           STRLEN need = UNISKIP(NATIVE_TO_UNI(uv));
+           s += len;
+           if (need > len) {
+               /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */
+               STRLEN off = d - SvPVX(sv);
+               d = SvGROW(sv, SvLEN(sv) + (need-len)) + off;
+           }
+           d = (char*)uvchr_to_utf8((U8*)d, uv);
+           has_utf8 = TRUE;
+       }
+       else {
+           *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+       }
     } /* while loop to process each character */
 
     /* terminate the string and set up the sv */
     *d = '\0';
     SvCUR_set(sv, d - SvPVX(sv));
+    if (SvCUR(sv) >= SvLEN(sv))
+      Perl_croak(aTHX_ "panic:constant overflowed allocated space");
+
     SvPOK_on(sv);
-    if (has_utf8)
+    if (has_utf8) {
        SvUTF8_on(sv);
+       if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+               PL_sublex_info.sub_op->op_private |=
+                   (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+       }
+    }
 
     /* shrink the sv if we allocated more than we used */
     if (SvCUR(sv) + 5 < SvLEN(sv)) {
@@ -3726,7 +3729,7 @@ Perl_yylex(pTHX)
            missingterm((char*)0);
        yylval.ival = OP_CONST;
        for (d = SvPV(PL_lex_stuff, len); len; len--, d++) {
-           if (*d == '$' || *d == '@' || *d == '\\' || UTF8_IS_CONTINUED(*d)) {
+           if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) {
                yylval.ival = OP_STRINGIFY;
                break;
            }
@@ -6698,7 +6701,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 
     /* after skipping whitespace, the next character is the terminator */
     term = *s;
-    if (UTF8_IS_CONTINUED(term) && UTF)
+    if (!UTF8_IS_INVARIANT((U8)term) && UTF)
        has_utf8 = TRUE;
 
     /* mark where we are */
@@ -6745,7 +6748,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
                   have found the terminator */
                else if (*s == term)
                    break;
-               else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF)
+               else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
                    has_utf8 = TRUE;
                *to = *s;
            }
@@ -6774,7 +6777,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
                    break;
                else if (*s == PL_multi_open)
                    brackets++;
-               else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF)
+               else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
                    has_utf8 = TRUE;
                *to = *s;
            }
@@ -7232,7 +7235,7 @@ vstring:
            while (isDIGIT(*pos) || *pos == '_')
                pos++;
            if (!isALPHA(*pos)) {
-               UV rev, revmax = 0;
+               UV rev;
                U8 tmpbuf[UTF8_MAXLEN+1];
                U8 *tmpend;
                s++;                            /* get past 'v' */
@@ -7262,9 +7265,9 @@ vstring:
                    }
                    /* Append native character for the rev point */
                    tmpend = uvchr_to_utf8(tmpbuf, rev);
-                   if (rev > revmax)
-                       revmax = rev;
                    sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
+                   if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev)))
+                       SvUTF8_on(sv);
                    if (*pos == '.' && isDIGIT(pos[1]))
                        s = ++pos;
                    else {
@@ -7274,14 +7277,8 @@ vstring:
                    while (isDIGIT(*pos) || *pos == '_')
                        pos++;
                }
-
                SvPOK_on(sv);
                SvREADONLY_on(sv);
-               /* if (revmax > 127) { */
-                   SvUTF8_on(sv); /*
-                   if (revmax < 256)
-                     sv_utf8_downgrade(sv, TRUE);
-               } */
            }
        }
        break;