A bug introduced in #8217 (the undefined variable in the
[p5sagit/p5-mst-13.2.git] / toke.c
diff --git a/toke.c b/toke.c
index 8b5f7f4..398253c 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1,6 +1,6 @@
 /*    toke.c
  *
- *    Copyright (c) 1991-2000, Larry Wall
+ *    Copyright (c) 1991-2001, Larry Wall
  *
  *    You may distribute under the terms of either the GNU General Public
  *    License or the Artistic License, as specified in the README file.
@@ -79,7 +79,7 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen);
 #  endif
 YYSTYPE* yylval_pointer[YYMAXLEVEL];
 int* yychar_pointer[YYMAXLEVEL];
-int yyactlevel = 0;
+int yyactlevel = -1;
 #  undef yylval
 #  undef yychar
 #  define yylval (*yylval_pointer[yyactlevel])
@@ -1045,8 +1045,11 @@ STATIC I32
 S_sublex_done(pTHX)
 {
     if (!PL_lex_starts++) {
+       SV *sv = newSVpvn("",0);
+       if (SvUTF8(PL_linestr))
+           SvUTF8_on(sv);
        PL_expect = XOPERATOR;
-       yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpvn("",0));
+       yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
        return THING;
     }
 
@@ -1173,7 +1176,8 @@ S_scan_const(pTHX_ char *start)
     register char *d = SvPVX(sv);              /* destination for copies */
     bool dorange = FALSE;                      /* are we in a translit range? */
     bool didrange = FALSE;                     /* did we just finish a range? */
-    bool has_utf8 = FALSE;                     /* embedded \x{} */
+    bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr));
+                                               /* the constant is UTF8 */
     UV uv;
 
     I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
@@ -1311,33 +1315,8 @@ S_scan_const(pTHX_ char *start)
                break;          /* in regexp, $ might be tail anchor */
        }
 
-       /* (now in tr/// code again) */
-
-       if (*s & 0x80 && (this_utf8 || has_utf8)) {
-           STRLEN len = (STRLEN) -1;
-           UV uv;
-           if (this_utf8) {
-               uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
-           }
-           if (len == (STRLEN)-1) {
-               /* Illegal UTF8 (a high-bit byte), make it valid. */
-               char *old_pvx = SvPVX(sv);
-               /* need space for one extra char (NOTE: SvCUR() not set here) */
-               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
-               d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
-           }
-           else {
-               while (len--)
-                   *d++ = *s++;
-           }
-           has_utf8 = TRUE;
-           continue;
-       }
-
        /* backslashes */
        if (*s == '\\' && s+1 < send) {
-           bool to_be_utf8 = FALSE;
-
            s++;
 
            /* some backslashes we leave behind */
@@ -1406,7 +1385,6 @@ S_scan_const(pTHX_ char *start)
                    else {
                        STRLEN len = 1;         /* allow underscores */
                        uv = (UV)scan_hex(s + 1, e - s - 1, &len);
-                       to_be_utf8 = TRUE;
                    }
                    s = e + 1;
                }
@@ -1420,34 +1398,47 @@ S_scan_const(pTHX_ char *start)
 
              NUM_ESCAPE_INSERT:
                /* Insert oct or hex escaped character.
-                * There will always enough room in sv since such escapes will
-                * be longer than any utf8 sequence they can end up as
-                */
+                * There will always be enough room in sv since such
+                * escapes will be longer than any UTF-8 sequence
+                * they can end up as. */
+
+               /* This spot is wrong for EBCDIC.  Characters like
+                * the lowercase letters and digits are >127 in EBCDIC,
+                * so here they would need to be mapped to the Unicode
+                * repertoire.   --jhi */
+               
                if (uv > 127) {
-                   if (!has_utf8 && (to_be_utf8 || uv > 255)) {
-                       /* might need to recode whatever we have accumulated so far
-                        * if it contains any hibit chars
+                   if (!has_utf8 && uv > 255) {
+                       /* Might need to recode whatever we have
+                        * accumulated so far if it contains any
+                        * hibit chars.
+                        *
+                        * (Can't we keep track of that and avoid
+                        *  this rescan? --jhi)
                         */
                        int hicount = 0;
                        char *c;
+
                        for (c = SvPVX(sv); c < d; c++) {
-                           if (*c & 0x80)
+                           if (UTF8_IS_CONTINUED(*c))
                                hicount++;
                        }
                        if (hicount) {
                            char *old_pvx = SvPVX(sv);
                            char *src, *dst;
-                           d = SvGROW(sv, SvCUR(sv) + hicount + 1) + (d - old_pvx);
+                         
+                           d = SvGROW(sv,
+                                      SvCUR(sv) + hicount + 1) +
+                                        (d - old_pvx);
 
                            src = d - 1;
                            d += hicount;
                            dst = d - 1;
 
                            while (src < dst) {
-                               if (*src & 0x80) {
-                                   dst--;
-                                   uv_to_utf8((U8*)dst, (U8)*src--);
-                                   dst--;
+                               if (UTF8_IS_CONTINUED(*src)) {
+                                   *dst-- = UTF8_EIGHT_BIT_LO(*src);
+                                   *dst-- = UTF8_EIGHT_BIT_HI(*src--);
                                }
                                else {
                                    *dst-- = *src--;
@@ -1456,7 +1447,7 @@ S_scan_const(pTHX_ char *start)
                         }
                     }
 
-                    if (to_be_utf8 || uv > 255) {
+                    if (has_utf8 || uv > 255) {
                        d = (char*)uv_to_utf8((U8*)d, uv);
                        has_utf8 = TRUE;
                     }
@@ -1568,6 +1559,29 @@ S_scan_const(pTHX_ char *start)
            continue;
        } /* end if (backslash) */
 
+       /* (now in tr/// code again) */
+
+       if (UTF8_IS_CONTINUED(*s) && (this_utf8 || has_utf8)) {
+           STRLEN len = (STRLEN) -1;
+           UV uv;
+           if (this_utf8) {
+               uv = utf8_to_uv((U8*)s, send - s, &len, 0);
+           }
+           if (len == (STRLEN)-1) {
+               /* Illegal UTF8 (a high-bit byte), make it valid. */
+               char *old_pvx = SvPVX(sv);
+               /* need space for one extra char (NOTE: SvCUR() not set here) */
+               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
+               d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+           }
+           else {
+               while (len--)
+                   *d++ = *s++;
+           }
+           has_utf8 = TRUE;
+           continue;
+       }
+
        *d++ = *s++;
     } /* while loop to process each character */
 
@@ -2068,15 +2082,16 @@ Perl_yylex_r(pTHX_ YYSTYPE *lvalp, int *lcharp)
 {
     int r;
 
+    yyactlevel++;
     yylval_pointer[yyactlevel] = lvalp;
     yychar_pointer[yyactlevel] = lcharp;
-    yyactlevel++;
     if (yyactlevel >= YYMAXLEVEL)
        Perl_croak(aTHX_ "panic: YYMAXLEVEL");
 
     r = Perl_yylex(aTHX);
 
-    yyactlevel--;
+    if (yyactlevel > 0)
+       yyactlevel--;
 
     return r;
 }
@@ -2085,13 +2100,8 @@ Perl_yylex_r(pTHX_ YYSTYPE *lvalp, int *lcharp)
 #ifdef __SC__
 #pragma segment Perl_yylex
 #endif
-
 int
-#ifdef USE_PURE_BISON
-Perl_yylex(pTHX_ YYSTYPE *lvalp, int *lcharp)
-#else
 Perl_yylex(pTHX)
-#endif
 {
     register char *s;
     register char *d;
@@ -2099,6 +2109,7 @@ Perl_yylex(pTHX)
     STRLEN len;
     GV *gv = Nullgv;
     GV **gvp = 0;
+    bool bof = FALSE;
 
     /* check if there's an identifier for us to look at */
     if (PL_pending_ident) {
@@ -2519,7 +2530,7 @@ Perl_yylex(pTHX)
            goto retry;
        }
        do {
-           bool bof = PL_rsfp ? TRUE : FALSE;
+           bof = PL_rsfp ? TRUE : FALSE;
            if (bof) {
 #ifdef PERLIO_IS_STDIO
 #  ifdef __GNU_LIBRARY__
@@ -2850,7 +2861,7 @@ Perl_yylex(pTHX)
            if (ftst) {
                PL_last_lop_op = ftst;
                DEBUG_T( { PerlIO_printf(Perl_debug_log,
-                        "### Saw file test %c\n", ftst);
+                        "### Saw file test %c\n", (int)ftst);
                } )
                FTST(ftst);
            }
@@ -2858,7 +2869,8 @@ Perl_yylex(pTHX)
                /* Assume it was a minus followed by a one-letter named
                 * subroutine call (or a -bareword), then. */
                DEBUG_T( { PerlIO_printf(Perl_debug_log,
-                        "### %c looked like a file test but was not\n", ftst);
+                       "### %c looked like a file test but was not\n",
+                       (int)ftst);
                } )
                s -= 2;
            }
@@ -3013,9 +3025,21 @@ Perl_yylex(pTHX)
                    PL_lex_stuff = Nullsv;
                }
                else {
-                   attrs = append_elem(OP_LIST, attrs,
-                                       newSVOP(OP_CONST, 0,
-                                               newSVpvn(s, len)));
+                   if (!PL_in_my && len == 6 && strnEQ(s, "lvalue", len))
+                       CvLVALUE_on(PL_compcv);
+                   else if (!PL_in_my && len == 6 && strnEQ(s, "locked", len))
+                       CvLOCKED_on(PL_compcv);
+                   else if (!PL_in_my && len == 6 && strnEQ(s, "method", len))
+                       CvMETHOD_on(PL_compcv);
+                   /* After we've set the flags, it could be argued that
+                      we don't need to do the attributes.pm-based setting
+                      process, and shouldn't bother appending recognized
+                      flags. To experiment with that, uncomment the
+                      following "else": */
+                   /* else */
+                       attrs = append_elem(OP_LIST, attrs,
+                                           newSVOP(OP_CONST, 0,
+                                                   newSVpvn(s, len)));
                }
                s = skipspace(d);
                if (*s == ':' && s[1] != ':')
@@ -3652,7 +3676,7 @@ Perl_yylex(pTHX)
            missingterm((char*)0);
        yylval.ival = OP_CONST;
        for (d = SvPV(PL_lex_stuff, len); len; len--, d++) {
-           if (*d == '$' || *d == '@' || *d == '\\' || *d & 0x80) {
+           if (*d == '$' || *d == '@' || *d == '\\' || UTF8_IS_CONTINUED(*d)) {
                yylval.ival = OP_STRINGIFY;
                break;
            }
@@ -4687,7 +4711,10 @@ Perl_yylex(pTHX)
            TOKEN('(');
 
        case KEY_qq:
+       case KEY_qu:
            s = scan_str(s,FALSE,FALSE);
+           if (tmp == KEY_qu && is_utf8_string((U8*)s, SvCUR(PL_lex_stuff)))
+               SvUTF8_on(PL_lex_stuff);
            if (!s)
                missingterm((char*)0);
            yylval.ival = OP_STRINGIFY;
@@ -5524,6 +5551,7 @@ Perl_keyword(pTHX_ register char *d, I32 len)
            if (strEQ(d,"q"))                   return KEY_q;
            if (strEQ(d,"qr"))                  return KEY_qr;
            if (strEQ(d,"qq"))                  return KEY_qq;
+           if (strEQ(d,"qu"))                  return KEY_qu;
            if (strEQ(d,"qw"))                  return KEY_qw;
            if (strEQ(d,"qx"))                  return KEY_qx;
        }
@@ -5923,9 +5951,9 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag
            *d++ = *s++;
            *d++ = *s++;
        }
-       else if (UTF && *(U8*)s >= 0xc0 && isALNUM_utf8((U8*)s)) {
+       else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) {
            char *t = s + UTF8SKIP(s);
-           while (*t & 0x80 && is_utf8_mark((U8*)t))
+           while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t))
                t += UTF8SKIP(t);
            if (d + (t - s) > e)
                Perl_croak(aTHX_ ident_too_long);
@@ -5975,9 +6003,9 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
                *d++ = *s++;
                *d++ = *s++;
            }
-           else if (UTF && *(U8*)s >= 0xc0 && isALNUM_utf8((U8*)s)) {
+           else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) {
                char *t = s + UTF8SKIP(s);
-               while (*t & 0x80 && is_utf8_mark((U8*)t))
+               while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t))
                    t += UTF8SKIP(t);
                if (d + (t - s) > e)
                    Perl_croak(aTHX_ ident_too_long);
@@ -6030,7 +6058,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
                e = s;
                while ((e < send && isALNUM_lazy_if(e,UTF)) || *e == ':') {
                    e += UTF8SKIP(e);
-                   while (e < send && *e & 0x80 && is_utf8_mark((U8*)e))
+                   while (e < send && UTF8_IS_CONTINUED(*e) && is_utf8_mark((U8*)e))
                        e += UTF8SKIP(e);
                }
                Copy(s, d, e - s, char);
@@ -6647,7 +6675,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
 
     /* after skipping whitespace, the next character is the terminator */
     term = *s;
-    if ((term & 0x80) && UTF)
+    if (UTF8_IS_CONTINUED(term) && UTF)
        has_utf8 = TRUE;
 
     /* mark where we are */
@@ -6694,7 +6722,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
                   have found the terminator */
                else if (*s == term)
                    break;
-               else if (!has_utf8 && (*s & 0x80) && UTF)
+               else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF)
                    has_utf8 = TRUE;
                *to = *s;
            }
@@ -6723,7 +6751,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
                    break;
                else if (*s == PL_multi_open)
                    brackets++;
-               else if (!has_utf8 && (*s & 0x80) && UTF)
+               else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF)
                    has_utf8 = TRUE;
                *to = *s;
            }
@@ -7180,10 +7208,9 @@ vstring:
            while (isDIGIT(*pos) || *pos == '_')
                pos++;
            if (!isALPHA(*pos)) {
-               UV rev;
+               UV rev, revmax = 0;
                U8 tmpbuf[UTF8_MAXLEN+1];
                U8 *tmpend;
-               bool utf8 = FALSE;
                s++;                            /* get past 'v' */
 
                sv = NEWSV(92,5);
@@ -7210,7 +7237,8 @@ vstring:
                        }
                    }
                    tmpend = uv_to_utf8(tmpbuf, rev);
-                   utf8 = utf8 || rev > 127;
+                   if (rev > revmax)
+                       revmax = rev;
                    sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
                    if (*pos == '.' && isDIGIT(pos[1]))
                        s = ++pos;
@@ -7224,9 +7252,9 @@ vstring:
 
                SvPOK_on(sv);
                SvREADONLY_on(sv);
-               if (utf8) {
+               if (revmax > 127) {
                    SvUTF8_on(sv);
-                   if (!UTF||IN_BYTE)
+                   if (revmax < 256)
                      sv_utf8_downgrade(sv, TRUE);
                }
            }