X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=toke.c;h=424249fded01a2952a0d0b3406e29cc6683ed241;hb=9d50d3997b5daf2ecaed7d0cafc86ab4121491bc;hp=0d4fc1d018888008b6ac24cbc30ef95f02672e80;hpb=9041c2e396c8c7de7680a2007dc341a9f65be0d0;p=p5sagit%2Fp5-mst-13.2.git diff --git a/toke.c b/toke.c index 0d4fc1d..424249f 100644 --- a/toke.c +++ b/toke.c @@ -36,8 +36,12 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen); #define XFAKEBRACK 128 #define XENUMMASK 127 -/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/ -#define UTF (PL_hints & HINT_UTF8) +#ifdef EBCDIC +/* For now 'use utf8' does not affect tokenizer on EBCDIC */ +#define UTF (PL_linestr && DO_UTF8(PL_linestr)) +#else +#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8)) +#endif /* In variables name $^X, these are the legal values for X. * 1999-02-27 mjd-perl-patch@plover.com */ @@ -177,13 +181,15 @@ int yyactlevel = -1; /* grandfather return to old style */ #define OLDLOP(f) return(yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP) +#ifdef DEBUGGING + STATIC void S_tokereport(pTHX_ char *thing, char* s, I32 rv) { - SV *report; DEBUG_T({ - report = newSVpv(thing, 0); - Perl_sv_catpvf(aTHX_ report, ":line %i:%i:", CopLINE(PL_curcop), rv); + SV* report = newSVpv(thing, 0); + Perl_sv_catpvf(aTHX_ report, ":line %d:%"IVdf":", CopLINE(PL_curcop), + (IV)rv); if (s - PL_bufptr > 0) sv_catpvn(report, PL_bufptr, s - PL_bufptr); @@ -192,9 +198,11 @@ S_tokereport(pTHX_ char *thing, char* s, I32 rv) sv_catpv(report, PL_tokenbuf); } PerlIO_printf(Perl_debug_log, "### %s\n", SvPV_nolen(report)); - }) + }); } +#endif + /* * S_ao * @@ -533,7 +541,7 @@ S_skipspace(pTHX_ register char *s) for (;;) { STRLEN prevlen; SSize_t oldprevlen, oldoldprevlen; - SSize_t oldloplen, oldunilen; + SSize_t oldloplen = 0, oldunilen = 0; while (s < PL_bufend && isSPACE(*s)) { if (*s++ == '\n' && PL_in_eval && !PL_rsfp) incline(s); @@ -917,8 +925,11 @@ S_tokeq(pTHX_ SV *sv) if (s == send) goto finish; d = s; - if ( PL_hints & HINT_NEW_STRING ) + if ( PL_hints & HINT_NEW_STRING ) { pv = sv_2mortal(newSVpvn(SvPVX(pv), len)); + if (SvUTF8(sv)) + SvUTF8_on(pv); + } while (s < send) { if (*s == '\\') { if (s + 1 < send && (s[1] == '\\')) @@ -1035,6 +1046,7 @@ S_sublex_push(pTHX) SAVEI32(PL_lex_inwhat); SAVECOPLINE(PL_curcop); SAVEPPTR(PL_bufptr); + SAVEPPTR(PL_bufend); SAVEPPTR(PL_oldbufptr); SAVEPPTR(PL_oldoldbufptr); SAVEPPTR(PL_last_lop); @@ -1215,22 +1227,22 @@ S_scan_const(pTHX_ char *start) register char *d = SvPVX(sv); /* destination for copies */ bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ - bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr)); - /* the constant is UTF8 */ + I32 has_utf8 = FALSE; /* Output constant is UTF8 */ + I32 this_utf8 = UTF; /* The source string is assumed to be UTF8 */ UV uv; - I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) - : UTF; - I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ? - OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF)) - : UTF; const char *leaveit = /* set of acceptably-backslashed characters */ PL_lex_inpat ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" : ""; + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + /* If we are doing a trans and we know we want UTF8 set expectation */ + has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF); + this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + + while (s < send || dorange) { /* get transliterations out of the way (they're most literal) */ if (PL_lex_inwhat == OP_TRANS) { @@ -1240,17 +1252,18 @@ S_scan_const(pTHX_ char *start) I32 min; /* first character in range */ I32 max; /* last character in range */ - if (utf) { + if (has_utf8) { char *c = (char*)utf8_hop((U8*)d, -1); char *e = d++; while (e-- > c) *(e + 1) = *e; - *c = (char)0xff; + *c = UTF_TO_NATIVE(0xff); /* mark the range as done, and continue */ dorange = FALSE; didrange = TRUE; continue; } + i = d - SvPVX(sv); /* remember current offset */ SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */ d = SvPVX(sv) + i; /* refresh d after realloc */ @@ -1265,17 +1278,17 @@ S_scan_const(pTHX_ char *start) (char)min, (char)max); } -#ifndef ASCIIish +#ifdef EBCDIC if ((isLOWER(min) && isLOWER(max)) || (isUPPER(min) && isUPPER(max))) { if (isLOWER(min)) { for (i = min; i <= max; i++) if (isLOWER(i)) - *d++ = i; + *d++ = NATIVE_TO_NEED(has_utf8,i); } else { for (i = min; i <= max; i++) if (isUPPER(i)) - *d++ = i; + *d++ = NATIVE_TO_NEED(has_utf8,i); } } else @@ -1294,8 +1307,8 @@ S_scan_const(pTHX_ char *start) if (didrange) { Perl_croak(aTHX_ "Ambiguous range in transliteration operator"); } - if (utf) { - *d++ = (char)0xff; /* use illegal utf8 byte--see pmtrans */ + if (has_utf8) { + *d++ = UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */ s++; continue; } @@ -1314,7 +1327,7 @@ S_scan_const(pTHX_ char *start) else if (*s == '(' && PL_lex_inpat && s[1] == '?') { if (s[2] == '#') { while (s < send && *s != ')') - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } else if (s[2] == '{' /* This should match regcomp.c */ || ((s[2] == 'p' || s[2] == '?') && s[3] == '{')) @@ -1337,7 +1350,7 @@ S_scan_const(pTHX_ char *start) yyerror("Sequence (?{...}) not terminated or not {}-balanced"); } while (s < regparse) - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } } @@ -1345,7 +1358,7 @@ S_scan_const(pTHX_ char *start) else if (*s == '#' && PL_lex_inpat && ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) { while (s+1 < send && *s != '\n') - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } /* check for embedded arrays @@ -1365,14 +1378,16 @@ S_scan_const(pTHX_ char *start) break; /* in regexp, $ might be tail anchor */ } + /* End of else if chain - OP_TRANS rejoin rest */ + /* backslashes */ if (*s == '\\' && s+1 < send) { s++; /* some backslashes we leave behind */ if (*leaveit && *s && strchr(leaveit, *s)) { - *d++ = '\\'; - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,'\\'); + *d++ = NATIVE_TO_NEED(has_utf8,*s++); continue; } @@ -1427,14 +1442,14 @@ S_scan_const(pTHX_ char *start) ++s; if (*s == '{') { char* e = strchr(s, '}'); + STRLEN len = 1; /* allow underscores */ + if (!e) { yyerror("Missing right brace on \\x{}"); - e = s; - } - else { - STRLEN len = 1; /* allow underscores */ - uv = (UV)scan_hex(s + 1, e - s - 1, &len); + ++s; + continue; } + uv = (UV)scan_hex(s + 1, e - s - 1, &len); s = e + 1; } else { @@ -1448,15 +1463,13 @@ S_scan_const(pTHX_ char *start) NUM_ESCAPE_INSERT: /* Insert oct or hex escaped character. * There will always enough room in sv since such - * escapes will be longer than any UT-F8 sequence + * escapes will be longer than any UTF-8 sequence * they can end up as. */ - - /* This spot is wrong for EBCDIC. Characters like - * the lowercase letters and digits are >127 in EBCDIC, - * so here they would need to be mapped to the Unicode - * repertoire. --jhi */ - if (uv > 127) { + /* We need to map to chars to ASCII before doing the tests + to cover EBCDIC + */ + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) { if (!has_utf8 && uv > 255) { /* Might need to recode whatever we have * accumulated so far if it contains any @@ -1465,33 +1478,30 @@ S_scan_const(pTHX_ char *start) * (Can't we keep track of that and avoid * this rescan? --jhi) */ - int hicount = 0; - char *c; - - for (c = SvPVX(sv); c < d; c++) { - if (UTF8_IS_CONTINUED(*c)) + int hicount = 0; + U8 *c; + for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) { + if (!NATIVE_IS_INVARIANT(*c)) { hicount++; + } } if (hicount) { - char *old_pvx = SvPVX(sv); - char *src, *dst; - - d = SvGROW(sv, - SvLEN(sv) + hicount + 1) + - (d - old_pvx); - - src = d - 1; - d += hicount; - dst = d - 1; - - while (src < dst) { - if (UTF8_IS_CONTINUED(*src)) { - *dst-- = UTF8_EIGHT_BIT_LO(*src); - *dst-- = UTF8_EIGHT_BIT_HI(*src--); + STRLEN offset = d - SvPVX(sv); + U8 *src, *dst; + d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset; + src = (U8 *)d - 1; + dst = src+hicount; + d += hicount; + while (src >= (U8 *)SvPVX(sv)) { + if (!NATIVE_IS_INVARIANT(*src)) { + U8 ch = NATIVE_TO_ASCII(*src); + *dst-- = UTF8_EIGHT_BIT_LO(ch); + *dst-- = UTF8_EIGHT_BIT_HI(ch); } else { - *dst-- = *src--; + *dst-- = *src; } + src--; } } } @@ -1504,7 +1514,6 @@ S_scan_const(pTHX_ char *start) PL_sublex_info.sub_op->op_private |= (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; } } else { @@ -1512,7 +1521,7 @@ S_scan_const(pTHX_ char *start) } } else { - *d++ = (char)uv; + *d++ = (char) uv; } continue; @@ -1566,51 +1575,38 @@ S_scan_const(pTHX_ char *start) /* \c is a control character */ case 'c': s++; -#ifdef EBCDIC - *d = *s++; - if (isLOWER(*d)) - *d = toUPPER(*d); - *d = toCTRL(*d); - d++; -#else { U8 c = *s++; - *d++ = toCTRL(c); - } +#ifdef EBCDIC + if (isLOWER(c)) + c = toUPPER(c); #endif + *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c)); + } continue; /* printf-style backslashes, formfeeds, newlines, etc */ case 'b': - *d++ = '\b'; + *d++ = NATIVE_TO_NEED(has_utf8,'\b'); break; case 'n': - *d++ = '\n'; + *d++ = NATIVE_TO_NEED(has_utf8,'\n'); break; case 'r': - *d++ = '\r'; + *d++ = NATIVE_TO_NEED(has_utf8,'\r'); break; case 'f': - *d++ = '\f'; + *d++ = NATIVE_TO_NEED(has_utf8,'\f'); break; case 't': - *d++ = '\t'; + *d++ = NATIVE_TO_NEED(has_utf8,'\t'); break; -#ifdef EBCDIC - case 'e': - *d++ = '\047'; /* CP 1047 */ - break; - case 'a': - *d++ = '\057'; /* CP 1047 */ - break; -#else case 'e': - *d++ = '\033'; + *d++ = ASCII_TO_NEED(has_utf8,'\033'); break; case 'a': - *d++ = '\007'; + *d++ = ASCII_TO_NEED(has_utf8,'\007'); break; -#endif } /* end switch */ s++; @@ -1618,41 +1614,40 @@ S_scan_const(pTHX_ char *start) } /* end if (backslash) */ default_action: - if (UTF8_IS_CONTINUED(*s) && (this_utf8 || has_utf8)) { - STRLEN len = (STRLEN) -1; - UV uv; - if (this_utf8) { - uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0); - } - if (len == (STRLEN)-1) { - /* Illegal UTF8 (a high-bit byte), make it valid. */ - char *old_pvx = SvPVX(sv); - /* need space for one extra char (NOTE: SvCUR() not set here) */ - d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); - d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++); - } - else { - while (len--) - *d++ = *s++; - } - has_utf8 = TRUE; - if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { - PL_sublex_info.sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; - } - continue; - } - - *d++ = *s++; + /* If we started with encoded form, or already know we want it + and then encode the next character */ + if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) { + STRLEN len = 1; + UV uv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s); + STRLEN need = UNISKIP(NATIVE_TO_UNI(uv)); + s += len; + if (need > len) { + /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */ + STRLEN off = d - SvPVX(sv); + d = SvGROW(sv, SvLEN(sv) + (need-len)) + off; + } + d = (char*)uvchr_to_utf8((U8*)d, uv); + has_utf8 = TRUE; + } + else { + *d++ = NATIVE_TO_NEED(has_utf8,*s++); + } } /* while loop to process each character */ /* terminate the string and set up the sv */ *d = '\0'; SvCUR_set(sv, d - SvPVX(sv)); + if (SvCUR(sv) >= SvLEN(sv)) + Perl_croak(aTHX_ "panic: constant overflowed allocated space"); + SvPOK_on(sv); - if (has_utf8) + if (has_utf8) { SvUTF8_on(sv); + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + PL_sublex_info.sub_op->op_private |= + (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + } /* shrink the sv if we allocated more than we used */ if (SvCUR(sv) + 5 < SvLEN(sv)) { @@ -2177,7 +2172,7 @@ Perl_yylex(pTHX) PL_pending_ident = 0; DEBUG_T({ PerlIO_printf(Perl_debug_log, - "### Tokener saw identifier '%s'\n", PL_tokenbuf); }) + "### Tokener saw identifier '%s'\n", PL_tokenbuf); }); /* if we're in a my(), we can't allow dynamics here. $foo'bar has already been turned into $foo::bar, so @@ -2318,7 +2313,7 @@ Perl_yylex(pTHX) } DEBUG_T({ PerlIO_printf(Perl_debug_log, "### Next token after '%s' was known, type %"IVdf"\n", PL_bufptr, - (IV)PL_nexttype[PL_nexttoke]); }) + (IV)PL_nexttype[PL_nexttoke]); }); return(PL_nexttype[PL_nexttoke]); @@ -2352,7 +2347,7 @@ Perl_yylex(pTHX) } else { DEBUG_T({ PerlIO_printf(Perl_debug_log, - "### Saw case modifier at '%s'\n", PL_bufptr); }) + "### Saw case modifier at '%s'\n", PL_bufptr); }); s = PL_bufptr + 1; if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3)) tmp = *s, *s = s[2], s[2] = tmp; /* misordered... */ @@ -2404,7 +2399,7 @@ Perl_yylex(pTHX) if (PL_bufptr == PL_bufend) return sublex_done(); DEBUG_T({ PerlIO_printf(Perl_debug_log, - "### Interpolated variable at '%s'\n", PL_bufptr); }) + "### Interpolated variable at '%s'\n", PL_bufptr); }); PL_expect = XTERM; PL_lex_dojoin = (*PL_bufptr == '@'); PL_lex_state = LEX_INTERPNORMAL; @@ -2504,7 +2499,7 @@ Perl_yylex(pTHX) DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Tokener expecting %s at %s\n", exp_name[PL_expect], s); - } ) + } ); retry: switch (*s) { @@ -2523,7 +2518,7 @@ Perl_yylex(pTHX) yyerror("Missing right curly or square bracket"); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Tokener got EOF\n"); - } ) + } ); TOKEN(0); } if (s++ < PL_bufend) @@ -2856,6 +2851,8 @@ Perl_yylex(pTHX) s++; if (s < d) s++; + else if (s > d) /* Found by Ilya: feed random input to Perl. */ + Perl_croak(aTHX_ "panic: input overflow"); incline(s); if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) { PL_bufptr = s; @@ -2883,7 +2880,7 @@ Perl_yylex(pTHX) s = force_word(PL_bufptr,WORD,FALSE,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw unary minus before =>, forcing word '%s'\n", s); - } ) + } ); OPERATOR('-'); /* unary minus */ } PL_last_uni = PL_oldbufptr; @@ -2928,7 +2925,7 @@ Perl_yylex(pTHX) PL_last_lop_op = ftst; DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw file test %c\n", (int)ftst); - } ) + } ); FTST(ftst); } else { @@ -2937,7 +2934,7 @@ Perl_yylex(pTHX) DEBUG_T( { PerlIO_printf(Perl_debug_log, "### %c looked like a file test but was not\n", (int)ftst); - } ) + } ); s -= 2; } } @@ -3234,8 +3231,16 @@ Perl_yylex(pTHX) else PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR; s = skipspace(s); - if (*s == '}') + if (*s == '}') { + if (PL_expect == XREF && PL_lex_state == LEX_INTERPNORMAL) { + PL_expect = XTERM; + /* This hack is to get the ${} in the message. */ + PL_bufptr = s+1; + yyerror("syntax error"); + break; + } OPERATOR(HASHBRACK); + } /* This hack serves to disambiguate a pair of curlies * as being a block or an anon hash. Normally, expectation * determines that, but in cases where we're not in a @@ -3697,7 +3702,7 @@ Perl_yylex(pTHX) s = scan_num(s, &yylval); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw number in '%s'\n", s); - } ) + } ); if (PL_expect == XOPERATOR) no_op("Number",s); TERM(THING); @@ -3706,7 +3711,7 @@ Perl_yylex(pTHX) s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw string before '%s'\n", s); - } ) + } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { PL_expect = XTERM; @@ -3725,7 +3730,7 @@ Perl_yylex(pTHX) s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw string before '%s'\n", s); - } ) + } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { PL_expect = XTERM; @@ -3739,7 +3744,7 @@ Perl_yylex(pTHX) missingterm((char*)0); yylval.ival = OP_CONST; for (d = SvPV(PL_lex_stuff, len); len; len--, d++) { - if (*d == '$' || *d == '@' || *d == '\\' || UTF8_IS_CONTINUED(*d)) { + if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) { yylval.ival = OP_STRINGIFY; break; } @@ -3750,7 +3755,7 @@ Perl_yylex(pTHX) s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw backtick string before '%s'\n", s); - } ) + } ); if (PL_expect == XOPERATOR) no_op("Backticks",s); if (!s) @@ -3780,7 +3785,7 @@ Perl_yylex(pTHX) TERM(THING); } /* avoid v123abc() or $h{v1}, allow C */ - else if (!isALPHA(*start) && (PL_expect == XTERM || PL_expect == XREF)) { + else if (!isALPHA(*start) && (PL_expect == XTERM || PL_expect == XREF || PL_expect == XSTATE)) { char c = *start; GV *gv; *start = '\0'; @@ -3866,7 +3871,7 @@ Perl_yylex(pTHX) CLINE; yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpv(PL_tokenbuf,0)); yylval.opval->op_private = OPpCONST_BARE; - if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len)) + if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len)) SvUTF8_on(((SVOP*)yylval.opval)->op_sv); TERM(WORD); } @@ -4027,7 +4032,7 @@ Perl_yylex(pTHX) if (*s == '=' && s[1] == '>') { CLINE; sv_setpv(((SVOP*)yylval.opval)->op_sv, PL_tokenbuf); - if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len)) + if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len)) SvUTF8_on(((SVOP*)yylval.opval)->op_sv); TERM(WORD); } @@ -4191,13 +4196,19 @@ Perl_yylex(pTHX) (void)PerlIO_seek(PL_rsfp, 0L, 0); } if (PerlLIO_setmode(PerlIO_fileno(PL_rsfp), O_TEXT) != -1) { +#ifdef PERLIO_IS_STDIO /* really? */ +# if defined(__BORLANDC__) + /* XXX see note in do_binmode() */ + ((FILE*)PL_rsfp)->flags &= ~_F_BIN; +# endif +#endif if (loc > 0) PerlIO_seek(PL_rsfp, loc, 0); } } #endif #ifdef PERLIO_LAYERS - if (UTF && !IN_BYTE) + if (UTF && !IN_BYTES) PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8"); #endif PL_rsfp = Nullfp; @@ -4986,7 +4997,7 @@ Perl_yylex(pTHX) really_sub: { char tmpbuf[sizeof PL_tokenbuf]; - SSize_t tboffset; + SSize_t tboffset = 0; expectation attrful; bool have_name, have_proto; int key = tmp; @@ -5630,7 +5641,7 @@ Perl_keyword(pTHX_ register char *d, I32 len) if (strEQ(d,"rindex")) return -KEY_rindex; break; case 7: - if (strEQ(d,"require")) return -KEY_require; + if (strEQ(d,"require")) return KEY_require; if (strEQ(d,"reverse")) return -KEY_reverse; if (strEQ(d,"readdir")) return -KEY_readdir; break; @@ -6525,7 +6536,7 @@ retval: Renew(SvPVX(tmpstr), SvLEN(tmpstr), char); } SvREFCNT_dec(herewas); - if (UTF && !IN_BYTE && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) + if (UTF && !IN_BYTES && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) SvUTF8_on(tmpstr); PL_lex_stuff = tmpstr; yylval.ival = op_type; @@ -6711,7 +6722,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* after skipping whitespace, the next character is the terminator */ term = *s; - if (UTF8_IS_CONTINUED(term) && UTF) + if (!UTF8_IS_INVARIANT((U8)term) && UTF) has_utf8 = TRUE; /* mark where we are */ @@ -6758,7 +6769,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) have found the terminator */ else if (*s == term) break; - else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6787,7 +6798,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) break; else if (*s == PL_multi_open) brackets++; - else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6879,11 +6890,11 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) Read a number in any of the formats that Perl accepts: - 0(x[0-7A-F]+)|([0-7]+)|(b[01]) - [\d_]+(\.[\d_]*)?[Ee](\d+) - - Underbars (_) are allowed in decimal numbers. If -w is on, - underbars before a decimal point must be at three digit intervals. + \d(_?\d)*(\.(\d(_?\d)*)?)?[Ee][\+\-]?(\d(_?\d)*) 12 12.34 12. + \.\d(_?\d)*[Ee][\+\-]?(\d(_?\d)*) .34 + 0b[01](_?[01])* + 0[0-7](_?[0-7])* + 0x[0-9A-Fa-f](_?[0-9A-Fa-f])* Like most scan_ routines, it uses the PL_tokenbuf buffer to hold the thing it reads. @@ -6953,8 +6964,17 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) else if (s[1] == '.' || s[1] == 'e' || s[1] == 'E') goto decimal; /* so it must be octal */ - else + else { shift = 3; + s++; + } + + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } base = bases[shift]; Base = Bases[shift]; @@ -6972,9 +6992,12 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) default: goto out; - /* _ are ignored */ + /* _ are ignored -- but warned about if consecutive */ case '_': - s++; + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; break; /* 8 and 9 are not octal */ @@ -7041,6 +7064,13 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) the number. */ out: + + /* final misplaced underbar check */ + if (s[-1] == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); + } + sv = NEWSV(92,0); if (overflowed) { if (ckWARN(WARN_PORTABLE) && n > 4294967295.0) @@ -7080,9 +7110,10 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) if -w is on */ if (*s == '_') { - if (ckWARN(WARN_SYNTAX) && lastub && s - lastub != 3) - Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); - lastub = ++s; + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; } else { /* check for end of fixed-length buffer */ @@ -7094,7 +7125,7 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) } /* final misplaced underbar check */ - if (lastub && s - lastub != 3) { + if (lastub && s == lastub + 1) { if (ckWARN(WARN_SYNTAX)) Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); } @@ -7107,16 +7138,34 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) floatit = TRUE; *d++ = *s++; - /* copy, ignoring underbars, until we run out of - digits. Note: no misplaced underbar checks! + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s; + } + + /* copy, ignoring underbars, until we run out of digits. */ for (; isDIGIT(*s) || *s == '_'; s++) { /* fixed length buffer check */ if (d >= e) Perl_croak(aTHX_ number_too_long); - if (*s != '_') + if (*s == '_') { + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s; + } + else *d++ = *s; } + /* fractional part ending in underbar? */ + if (s[-1] == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + } if (*s == '.' && isDIGIT(s[1])) { /* oops, it's really a v-string, but without the "v" */ s = start - 1; @@ -7125,22 +7174,48 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) } /* read exponent part, if present */ - if (*s && strchr("eE",*s) && strchr("+-0123456789",s[1])) { + if (*s && strchr("eE",*s) && strchr("+-0123456789_", s[1])) { floatit = TRUE; s++; /* regardless of whether user said 3E5 or 3e5, use lower 'e' */ *d++ = 'e'; /* At least some Mach atof()s don't grok 'E' */ + /* stray preinitial _ */ + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } + /* allow positive or negative exponent */ if (*s == '+' || *s == '-') *d++ = *s++; - /* read digits of exponent (no underbars :-) */ - while (isDIGIT(*s)) { - if (d >= e) - Perl_croak(aTHX_ number_too_long); - *d++ = *s++; + /* stray initial _ */ + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } + + /* read digits of exponent */ + while (isDIGIT(*s) || *s == '_') { + if (isDIGIT(*s)) { + if (d >= e) + Perl_croak(aTHX_ number_too_long); + *d++ = *s++; + } + else { + if (ckWARN(WARN_SYNTAX) && + ((lastub && s == lastub + 1) || + (!isDIGIT(s[1]) && s[1] != '_'))) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } } } @@ -7159,8 +7234,8 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) */ if (!floatit) { - IV iv; - UV uv; + IV iv = 0; + UV uv = 0; errno = 0; if (*PL_tokenbuf == '-') iv = Strtol(PL_tokenbuf, (char**)NULL, 10); @@ -7245,7 +7320,7 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; if (!isALPHA(*pos)) { - UV rev, revmax = 0; + UV rev; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tmpend; s++; /* get past 'v' */ @@ -7275,9 +7350,9 @@ vstring: } /* Append native character for the rev point */ tmpend = uvchr_to_utf8(tmpbuf, rev); - if (rev > revmax) - revmax = rev; sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev))) + SvUTF8_on(sv); if (*pos == '.' && isDIGIT(pos[1])) s = ++pos; else { @@ -7287,14 +7362,8 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; } - SvPOK_on(sv); SvREADONLY_on(sv); - /* if (revmax > 127) { */ - SvUTF8_on(sv); /* - if (revmax < 256) - sv_utf8_downgrade(sv, TRUE); - } */ } } break;