X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=toke.c;h=fca0f73bb8e5a2417f23407a42f490eb0fd2eb38;hb=4a280ebedb23042ec7ac637d8bfb46817322de6c;hp=3652c11809c8c4e377497bbe0070351d74a2c5b4;hpb=be4731d2ab91c4f6213bf88a0084f6128a0db383;p=p5sagit%2Fp5-mst-13.2.git diff --git a/toke.c b/toke.c index 3652c11..fca0f73 100644 --- a/toke.c +++ b/toke.c @@ -36,8 +36,12 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen); #define XFAKEBRACK 128 #define XENUMMASK 127 -/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/ -#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || PL_hints & HINT_UTF8) +#ifdef EBCDIC +/* For now 'use utf8' does not affect tokenizer on EBCDIC */ +#define UTF (PL_linestr && DO_UTF8(PL_linestr)) +#else +#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8)) +#endif /* In variables name $^X, these are the legal values for X. * 1999-02-27 mjd-perl-patch@plover.com */ @@ -177,13 +181,15 @@ int yyactlevel = -1; /* grandfather return to old style */ #define OLDLOP(f) return(yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP) +#ifdef DEBUGGING + STATIC void S_tokereport(pTHX_ char *thing, char* s, I32 rv) { - SV *report; DEBUG_T({ - report = newSVpv(thing, 0); - Perl_sv_catpvf(aTHX_ report, ":line %i:%i:", CopLINE(PL_curcop), rv); + SV* report = newSVpv(thing, 0); + Perl_sv_catpvf(aTHX_ report, ":line %d:%"IVdf":", CopLINE(PL_curcop), + (IV)rv); if (s - PL_bufptr > 0) sv_catpvn(report, PL_bufptr, s - PL_bufptr); @@ -192,9 +198,11 @@ S_tokereport(pTHX_ char *thing, char* s, I32 rv) sv_catpv(report, PL_tokenbuf); } PerlIO_printf(Perl_debug_log, "### %s\n", SvPV_nolen(report)); - }) + }); } +#endif + /* * S_ao * @@ -533,7 +541,7 @@ S_skipspace(pTHX_ register char *s) for (;;) { STRLEN prevlen; SSize_t oldprevlen, oldoldprevlen; - SSize_t oldloplen, oldunilen; + SSize_t oldloplen = 0, oldunilen = 0; while (s < PL_bufend && isSPACE(*s)) { if (*s++ == '\n' && PL_in_eval && !PL_rsfp) incline(s); @@ -1038,6 +1046,7 @@ S_sublex_push(pTHX) SAVEI32(PL_lex_inwhat); SAVECOPLINE(PL_curcop); SAVEPPTR(PL_bufptr); + SAVEPPTR(PL_bufend); SAVEPPTR(PL_oldbufptr); SAVEPPTR(PL_oldoldbufptr); SAVEPPTR(PL_last_lop); @@ -1218,22 +1227,22 @@ S_scan_const(pTHX_ char *start) register char *d = SvPVX(sv); /* destination for copies */ bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ - bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr)); - /* the constant is UTF8 */ + I32 has_utf8 = FALSE; /* Output constant is UTF8 */ + I32 this_utf8 = UTF; /* The source string is assumed to be UTF8 */ UV uv; - I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) - : UTF; - I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ? - OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF)) - : UTF; const char *leaveit = /* set of acceptably-backslashed characters */ PL_lex_inpat ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" : ""; + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + /* If we are doing a trans and we know we want UTF8 set expectation */ + has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF); + this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + + while (s < send || dorange) { /* get transliterations out of the way (they're most literal) */ if (PL_lex_inwhat == OP_TRANS) { @@ -1243,17 +1252,18 @@ S_scan_const(pTHX_ char *start) I32 min; /* first character in range */ I32 max; /* last character in range */ - if (utf) { + if (has_utf8) { char *c = (char*)utf8_hop((U8*)d, -1); char *e = d++; while (e-- > c) *(e + 1) = *e; - *c = (char)0xff; + *c = UTF_TO_NATIVE(0xff); /* mark the range as done, and continue */ dorange = FALSE; didrange = TRUE; continue; } + i = d - SvPVX(sv); /* remember current offset */ SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */ d = SvPVX(sv) + i; /* refresh d after realloc */ @@ -1297,8 +1307,8 @@ S_scan_const(pTHX_ char *start) if (didrange) { Perl_croak(aTHX_ "Ambiguous range in transliteration operator"); } - if (utf) { - *d++ = (char)0xff; /* use illegal utf8 byte--see pmtrans */ + if (has_utf8) { + *d++ = UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */ s++; continue; } @@ -1368,6 +1378,8 @@ S_scan_const(pTHX_ char *start) break; /* in regexp, $ might be tail anchor */ } + /* End of else if chain - OP_TRANS rejoin rest */ + /* backslashes */ if (*s == '\\' && s+1 < send) { s++; @@ -1430,14 +1442,14 @@ S_scan_const(pTHX_ char *start) ++s; if (*s == '{') { char* e = strchr(s, '}'); + STRLEN len = 1; /* allow underscores */ + if (!e) { yyerror("Missing right brace on \\x{}"); - e = s; - } - else { - STRLEN len = 1; /* allow underscores */ - uv = (UV)scan_hex(s + 1, e - s - 1, &len); + ++s; + continue; } + uv = (UV)scan_hex(s + 1, e - s - 1, &len); s = e + 1; } else { @@ -1457,7 +1469,7 @@ S_scan_const(pTHX_ char *start) /* We need to map to chars to ASCII before doing the tests to cover EBCDIC */ - if (NATIVE_TO_UNI(uv) > 127) { + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) { if (!has_utf8 && uv > 255) { /* Might need to recode whatever we have * accumulated so far if it contains any @@ -1467,13 +1479,13 @@ S_scan_const(pTHX_ char *start) * this rescan? --jhi) */ int hicount = 0; - char *c; - for (c = SvPVX(sv); c < d; c++) { - if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*c))) { + U8 *c; + for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) { + if (!NATIVE_IS_INVARIANT(*c)) { hicount++; } } - if (hicount || NATIVE_TO_ASCII('A') != 'A') { + if (hicount) { STRLEN offset = d - SvPVX(sv); U8 *src, *dst; d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset; @@ -1481,13 +1493,13 @@ S_scan_const(pTHX_ char *start) dst = src+hicount; d += hicount; while (src >= (U8 *)SvPVX(sv)) { - U8 ch = NATIVE_TO_ASCII(*src); - if (UTF8_IS_CONTINUED(ch)) { + if (!NATIVE_IS_INVARIANT(*src)) { + U8 ch = NATIVE_TO_ASCII(*src); *dst-- = UTF8_EIGHT_BIT_LO(ch); *dst-- = UTF8_EIGHT_BIT_HI(ch); } else { - *dst-- = ch; + *dst-- = *src; } src--; } @@ -1502,7 +1514,6 @@ S_scan_const(pTHX_ char *start) PL_sublex_info.sub_op->op_private |= (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; } } else { @@ -1510,7 +1521,7 @@ S_scan_const(pTHX_ char *start) } } else { - *d++ = NATIVE_TO_NEED(has_utf8,uv); + *d++ = (char) uv; } continue; @@ -1603,43 +1614,40 @@ S_scan_const(pTHX_ char *start) } /* end if (backslash) */ default_action: -#ifndef EBCDIC - /* The 'has_utf8' here is very dubious */ - if (UTF8_IS_CONTINUED(NATIVE_TO_ASCII(*s)) && (this_utf8 || has_utf8)) { - STRLEN len = (STRLEN) -1; - UV uv; - if (this_utf8) { - uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0); - } - if (len == (STRLEN)-1) { - /* Illegal UTF8 (a high-bit byte), make it valid. */ - char *old_pvx = SvPVX(sv); - /* need space for one extra char (NOTE: SvCUR() not set here) */ - d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); - d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++); - } - else { - while (len--) - *d++ = *s++; - } - has_utf8 = TRUE; - if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { - PL_sublex_info.sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; - } - continue; - } -#endif - *d++ = NATIVE_TO_NEED(has_utf8,*s++); + /* If we started with encoded form, or already know we want it + and then encode the next character */ + if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) { + STRLEN len = 1; + UV uv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s); + STRLEN need = UNISKIP(NATIVE_TO_UNI(uv)); + s += len; + if (need > len) { + /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */ + STRLEN off = d - SvPVX(sv); + d = SvGROW(sv, SvLEN(sv) + (need-len)) + off; + } + d = (char*)uvchr_to_utf8((U8*)d, uv); + has_utf8 = TRUE; + } + else { + *d++ = NATIVE_TO_NEED(has_utf8,*s++); + } } /* while loop to process each character */ /* terminate the string and set up the sv */ *d = '\0'; SvCUR_set(sv, d - SvPVX(sv)); + if (SvCUR(sv) >= SvLEN(sv)) + Perl_croak(aTHX_ "panic: constant overflowed allocated space"); + SvPOK_on(sv); - if (has_utf8) + if (has_utf8) { SvUTF8_on(sv); + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + PL_sublex_info.sub_op->op_private |= + (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + } /* shrink the sv if we allocated more than we used */ if (SvCUR(sv) + 5 < SvLEN(sv)) { @@ -2164,7 +2172,7 @@ Perl_yylex(pTHX) PL_pending_ident = 0; DEBUG_T({ PerlIO_printf(Perl_debug_log, - "### Tokener saw identifier '%s'\n", PL_tokenbuf); }) + "### Tokener saw identifier '%s'\n", PL_tokenbuf); }); /* if we're in a my(), we can't allow dynamics here. $foo'bar has already been turned into $foo::bar, so @@ -2305,7 +2313,7 @@ Perl_yylex(pTHX) } DEBUG_T({ PerlIO_printf(Perl_debug_log, "### Next token after '%s' was known, type %"IVdf"\n", PL_bufptr, - (IV)PL_nexttype[PL_nexttoke]); }) + (IV)PL_nexttype[PL_nexttoke]); }); return(PL_nexttype[PL_nexttoke]); @@ -2339,7 +2347,7 @@ Perl_yylex(pTHX) } else { DEBUG_T({ PerlIO_printf(Perl_debug_log, - "### Saw case modifier at '%s'\n", PL_bufptr); }) + "### Saw case modifier at '%s'\n", PL_bufptr); }); s = PL_bufptr + 1; if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3)) tmp = *s, *s = s[2], s[2] = tmp; /* misordered... */ @@ -2391,7 +2399,7 @@ Perl_yylex(pTHX) if (PL_bufptr == PL_bufend) return sublex_done(); DEBUG_T({ PerlIO_printf(Perl_debug_log, - "### Interpolated variable at '%s'\n", PL_bufptr); }) + "### Interpolated variable at '%s'\n", PL_bufptr); }); PL_expect = XTERM; PL_lex_dojoin = (*PL_bufptr == '@'); PL_lex_state = LEX_INTERPNORMAL; @@ -2491,7 +2499,7 @@ Perl_yylex(pTHX) DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Tokener expecting %s at %s\n", exp_name[PL_expect], s); - } ) + } ); retry: switch (*s) { @@ -2510,7 +2518,7 @@ Perl_yylex(pTHX) yyerror("Missing right curly or square bracket"); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Tokener got EOF\n"); - } ) + } ); TOKEN(0); } if (s++ < PL_bufend) @@ -2843,6 +2851,8 @@ Perl_yylex(pTHX) s++; if (s < d) s++; + else if (s > d) /* Found by Ilya: feed random input to Perl. */ + Perl_croak(aTHX_ "panic: input overflow"); incline(s); if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) { PL_bufptr = s; @@ -2870,7 +2880,7 @@ Perl_yylex(pTHX) s = force_word(PL_bufptr,WORD,FALSE,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw unary minus before =>, forcing word '%s'\n", s); - } ) + } ); OPERATOR('-'); /* unary minus */ } PL_last_uni = PL_oldbufptr; @@ -2915,7 +2925,7 @@ Perl_yylex(pTHX) PL_last_lop_op = ftst; DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw file test %c\n", (int)ftst); - } ) + } ); FTST(ftst); } else { @@ -2924,7 +2934,7 @@ Perl_yylex(pTHX) DEBUG_T( { PerlIO_printf(Perl_debug_log, "### %c looked like a file test but was not\n", (int)ftst); - } ) + } ); s -= 2; } } @@ -3221,8 +3231,16 @@ Perl_yylex(pTHX) else PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR; s = skipspace(s); - if (*s == '}') + if (*s == '}') { + if (PL_expect == XREF && PL_lex_state == LEX_INTERPNORMAL) { + PL_expect = XTERM; + /* This hack is to get the ${} in the message. */ + PL_bufptr = s+1; + yyerror("syntax error"); + break; + } OPERATOR(HASHBRACK); + } /* This hack serves to disambiguate a pair of curlies * as being a block or an anon hash. Normally, expectation * determines that, but in cases where we're not in a @@ -3684,7 +3702,7 @@ Perl_yylex(pTHX) s = scan_num(s, &yylval); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw number in '%s'\n", s); - } ) + } ); if (PL_expect == XOPERATOR) no_op("Number",s); TERM(THING); @@ -3693,7 +3711,7 @@ Perl_yylex(pTHX) s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw string before '%s'\n", s); - } ) + } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { PL_expect = XTERM; @@ -3712,7 +3730,7 @@ Perl_yylex(pTHX) s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw string before '%s'\n", s); - } ) + } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { PL_expect = XTERM; @@ -3726,7 +3744,7 @@ Perl_yylex(pTHX) missingterm((char*)0); yylval.ival = OP_CONST; for (d = SvPV(PL_lex_stuff, len); len; len--, d++) { - if (*d == '$' || *d == '@' || *d == '\\' || UTF8_IS_CONTINUED(*d)) { + if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) { yylval.ival = OP_STRINGIFY; break; } @@ -3737,7 +3755,7 @@ Perl_yylex(pTHX) s = scan_str(s,FALSE,FALSE); DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Saw backtick string before '%s'\n", s); - } ) + } ); if (PL_expect == XOPERATOR) no_op("Backticks",s); if (!s) @@ -3767,7 +3785,7 @@ Perl_yylex(pTHX) TERM(THING); } /* avoid v123abc() or $h{v1}, allow C */ - else if (!isALPHA(*start) && (PL_expect == XTERM || PL_expect == XREF)) { + else if (!isALPHA(*start) && (PL_expect == XTERM || PL_expect == XREF || PL_expect == XSTATE)) { char c = *start; GV *gv; *start = '\0'; @@ -3853,7 +3871,7 @@ Perl_yylex(pTHX) CLINE; yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpv(PL_tokenbuf,0)); yylval.opval->op_private = OPpCONST_BARE; - if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len)) + if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len)) SvUTF8_on(((SVOP*)yylval.opval)->op_sv); TERM(WORD); } @@ -4014,7 +4032,7 @@ Perl_yylex(pTHX) if (*s == '=' && s[1] == '>') { CLINE; sv_setpv(((SVOP*)yylval.opval)->op_sv, PL_tokenbuf); - if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len)) + if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len)) SvUTF8_on(((SVOP*)yylval.opval)->op_sv); TERM(WORD); } @@ -4177,14 +4195,24 @@ Perl_yylex(pTHX) loc = PerlIO_tell(PL_rsfp); (void)PerlIO_seek(PL_rsfp, 0L, 0); } +#ifdef NETWARE + if (PerlLIO_setmode(PL_rsfp, O_TEXT) != -1) { +#else if (PerlLIO_setmode(PerlIO_fileno(PL_rsfp), O_TEXT) != -1) { +#endif /* NETWARE */ +#ifdef PERLIO_IS_STDIO /* really? */ +# if defined(__BORLANDC__) + /* XXX see note in do_binmode() */ + ((FILE*)PL_rsfp)->flags &= ~_F_BIN; +# endif +#endif if (loc > 0) PerlIO_seek(PL_rsfp, loc, 0); } } #endif #ifdef PERLIO_LAYERS - if (UTF && !IN_BYTE) + if (UTF && !IN_BYTES) PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8"); #endif PL_rsfp = Nullfp; @@ -4973,7 +5001,7 @@ Perl_yylex(pTHX) really_sub: { char tmpbuf[sizeof PL_tokenbuf]; - SSize_t tboffset; + SSize_t tboffset = 0; expectation attrful; bool have_name, have_proto; int key = tmp; @@ -5617,7 +5645,7 @@ Perl_keyword(pTHX_ register char *d, I32 len) if (strEQ(d,"rindex")) return -KEY_rindex; break; case 7: - if (strEQ(d,"require")) return -KEY_require; + if (strEQ(d,"require")) return KEY_require; if (strEQ(d,"reverse")) return -KEY_reverse; if (strEQ(d,"readdir")) return -KEY_readdir; break; @@ -6512,7 +6540,7 @@ retval: Renew(SvPVX(tmpstr), SvLEN(tmpstr), char); } SvREFCNT_dec(herewas); - if (UTF && !IN_BYTE && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) + if (UTF && !IN_BYTES && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) SvUTF8_on(tmpstr); PL_lex_stuff = tmpstr; yylval.ival = op_type; @@ -6698,7 +6726,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* after skipping whitespace, the next character is the terminator */ term = *s; - if (UTF8_IS_CONTINUED(term) && UTF) + if (!UTF8_IS_INVARIANT((U8)term) && UTF) has_utf8 = TRUE; /* mark where we are */ @@ -6745,7 +6773,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) have found the terminator */ else if (*s == term) break; - else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6774,7 +6802,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) break; else if (*s == PL_multi_open) brackets++; - else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6866,11 +6894,11 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) Read a number in any of the formats that Perl accepts: - 0(x[0-7A-F]+)|([0-7]+)|(b[01]) - [\d_]+(\.[\d_]*)?[Ee](\d+) - - Underbars (_) are allowed in decimal numbers. If -w is on, - underbars before a decimal point must be at three digit intervals. + \d(_?\d)*(\.(\d(_?\d)*)?)?[Ee][\+\-]?(\d(_?\d)*) 12 12.34 12. + \.\d(_?\d)*[Ee][\+\-]?(\d(_?\d)*) .34 + 0b[01](_?[01])* + 0[0-7](_?[0-7])* + 0x[0-9A-Fa-f](_?[0-9A-Fa-f])* Like most scan_ routines, it uses the PL_tokenbuf buffer to hold the thing it reads. @@ -6940,8 +6968,17 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) else if (s[1] == '.' || s[1] == 'e' || s[1] == 'E') goto decimal; /* so it must be octal */ - else + else { shift = 3; + s++; + } + + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } base = bases[shift]; Base = Bases[shift]; @@ -6959,9 +6996,12 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) default: goto out; - /* _ are ignored */ + /* _ are ignored -- but warned about if consecutive */ case '_': - s++; + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; break; /* 8 and 9 are not octal */ @@ -7028,6 +7068,13 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) the number. */ out: + + /* final misplaced underbar check */ + if (s[-1] == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); + } + sv = NEWSV(92,0); if (overflowed) { if (ckWARN(WARN_PORTABLE) && n > 4294967295.0) @@ -7067,9 +7114,10 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) if -w is on */ if (*s == '_') { - if (ckWARN(WARN_SYNTAX) && lastub && s - lastub != 3) - Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); - lastub = ++s; + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; } else { /* check for end of fixed-length buffer */ @@ -7081,7 +7129,7 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) } /* final misplaced underbar check */ - if (lastub && s - lastub != 3) { + if (lastub && s == lastub + 1) { if (ckWARN(WARN_SYNTAX)) Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); } @@ -7094,16 +7142,34 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) floatit = TRUE; *d++ = *s++; - /* copy, ignoring underbars, until we run out of - digits. Note: no misplaced underbar checks! + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s; + } + + /* copy, ignoring underbars, until we run out of digits. */ for (; isDIGIT(*s) || *s == '_'; s++) { /* fixed length buffer check */ if (d >= e) Perl_croak(aTHX_ number_too_long); - if (*s != '_') + if (*s == '_') { + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s; + } + else *d++ = *s; } + /* fractional part ending in underbar? */ + if (s[-1] == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + } if (*s == '.' && isDIGIT(s[1])) { /* oops, it's really a v-string, but without the "v" */ s = start - 1; @@ -7112,22 +7178,48 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) } /* read exponent part, if present */ - if (*s && strchr("eE",*s) && strchr("+-0123456789",s[1])) { + if (*s && strchr("eE",*s) && strchr("+-0123456789_", s[1])) { floatit = TRUE; s++; /* regardless of whether user said 3E5 or 3e5, use lower 'e' */ *d++ = 'e'; /* At least some Mach atof()s don't grok 'E' */ + /* stray preinitial _ */ + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } + /* allow positive or negative exponent */ if (*s == '+' || *s == '-') *d++ = *s++; - /* read digits of exponent (no underbars :-) */ - while (isDIGIT(*s)) { - if (d >= e) - Perl_croak(aTHX_ number_too_long); - *d++ = *s++; + /* stray initial _ */ + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } + + /* read digits of exponent */ + while (isDIGIT(*s) || *s == '_') { + if (isDIGIT(*s)) { + if (d >= e) + Perl_croak(aTHX_ number_too_long); + *d++ = *s++; + } + else { + if (ckWARN(WARN_SYNTAX) && + ((lastub && s == lastub + 1) || + (!isDIGIT(s[1]) && s[1] != '_'))) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } } } @@ -7146,8 +7238,8 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) */ if (!floatit) { - IV iv; - UV uv; + IV iv = 0; + UV uv = 0; errno = 0; if (*PL_tokenbuf == '-') iv = Strtol(PL_tokenbuf, (char**)NULL, 10); @@ -7232,7 +7324,7 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; if (!isALPHA(*pos)) { - UV rev, revmax = 0; + UV rev; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tmpend; s++; /* get past 'v' */ @@ -7262,9 +7354,9 @@ vstring: } /* Append native character for the rev point */ tmpend = uvchr_to_utf8(tmpbuf, rev); - if (rev > revmax) - revmax = rev; sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev))) + SvUTF8_on(sv); if (*pos == '.' && isDIGIT(pos[1])) s = ++pos; else { @@ -7274,14 +7366,8 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; } - SvPOK_on(sv); SvREADONLY_on(sv); - /* if (revmax > 127) { */ - SvUTF8_on(sv); /* - if (revmax < 256) - sv_utf8_downgrade(sv, TRUE); - } */ } } break;