X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=toke.c;h=347e86fb20368d203d1be7b64ccb5409302ae15a;hb=96e821d57e0ce06bdcf42060deb9cea0b0589546;hp=0d4fc1d018888008b6ac24cbc30ef95f02672e80;hpb=9041c2e396c8c7de7680a2007dc341a9f65be0d0;p=p5sagit%2Fp5-mst-13.2.git diff --git a/toke.c b/toke.c index 0d4fc1d..347e86f 100644 --- a/toke.c +++ b/toke.c @@ -36,8 +36,12 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen); #define XFAKEBRACK 128 #define XENUMMASK 127 -/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/ -#define UTF (PL_hints & HINT_UTF8) +#ifdef EBCDIC +/* For now 'use utf8' does not affect tokenizer on EBCDIC */ +#define UTF (PL_linestr && DO_UTF8(PL_linestr)) +#else +#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8)) +#endif /* In variables name $^X, these are the legal values for X. * 1999-02-27 mjd-perl-patch@plover.com */ @@ -183,7 +187,8 @@ S_tokereport(pTHX_ char *thing, char* s, I32 rv) SV *report; DEBUG_T({ report = newSVpv(thing, 0); - Perl_sv_catpvf(aTHX_ report, ":line %i:%i:", CopLINE(PL_curcop), rv); + Perl_sv_catpvf(aTHX_ report, ":line %d:%"IVdf":", CopLINE(PL_curcop), + (IV)rv); if (s - PL_bufptr > 0) sv_catpvn(report, PL_bufptr, s - PL_bufptr); @@ -917,8 +922,11 @@ S_tokeq(pTHX_ SV *sv) if (s == send) goto finish; d = s; - if ( PL_hints & HINT_NEW_STRING ) + if ( PL_hints & HINT_NEW_STRING ) { pv = sv_2mortal(newSVpvn(SvPVX(pv), len)); + if (SvUTF8(sv)) + SvUTF8_on(pv); + } while (s < send) { if (*s == '\\') { if (s + 1 < send && (s[1] == '\\')) @@ -1215,22 +1223,22 @@ S_scan_const(pTHX_ char *start) register char *d = SvPVX(sv); /* destination for copies */ bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ - bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr)); - /* the constant is UTF8 */ + I32 has_utf8 = FALSE; /* Output constant is UTF8 */ + I32 this_utf8 = UTF; /* The source string is assumed to be UTF8 */ UV uv; - I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) - : UTF; - I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) - ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ? - OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF)) - : UTF; const char *leaveit = /* set of acceptably-backslashed characters */ PL_lex_inpat ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" : ""; + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + /* If we are doing a trans and we know we want UTF8 set expectation */ + has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF); + this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + + while (s < send || dorange) { /* get transliterations out of the way (they're most literal) */ if (PL_lex_inwhat == OP_TRANS) { @@ -1240,17 +1248,18 @@ S_scan_const(pTHX_ char *start) I32 min; /* first character in range */ I32 max; /* last character in range */ - if (utf) { + if (has_utf8) { char *c = (char*)utf8_hop((U8*)d, -1); char *e = d++; while (e-- > c) *(e + 1) = *e; - *c = (char)0xff; + *c = UTF_TO_NATIVE(0xff); /* mark the range as done, and continue */ dorange = FALSE; didrange = TRUE; continue; } + i = d - SvPVX(sv); /* remember current offset */ SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */ d = SvPVX(sv) + i; /* refresh d after realloc */ @@ -1265,17 +1274,17 @@ S_scan_const(pTHX_ char *start) (char)min, (char)max); } -#ifndef ASCIIish +#ifdef EBCDIC if ((isLOWER(min) && isLOWER(max)) || (isUPPER(min) && isUPPER(max))) { if (isLOWER(min)) { for (i = min; i <= max; i++) if (isLOWER(i)) - *d++ = i; + *d++ = NATIVE_TO_NEED(has_utf8,i); } else { for (i = min; i <= max; i++) if (isUPPER(i)) - *d++ = i; + *d++ = NATIVE_TO_NEED(has_utf8,i); } } else @@ -1294,8 +1303,8 @@ S_scan_const(pTHX_ char *start) if (didrange) { Perl_croak(aTHX_ "Ambiguous range in transliteration operator"); } - if (utf) { - *d++ = (char)0xff; /* use illegal utf8 byte--see pmtrans */ + if (has_utf8) { + *d++ = UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */ s++; continue; } @@ -1314,7 +1323,7 @@ S_scan_const(pTHX_ char *start) else if (*s == '(' && PL_lex_inpat && s[1] == '?') { if (s[2] == '#') { while (s < send && *s != ')') - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } else if (s[2] == '{' /* This should match regcomp.c */ || ((s[2] == 'p' || s[2] == '?') && s[3] == '{')) @@ -1337,7 +1346,7 @@ S_scan_const(pTHX_ char *start) yyerror("Sequence (?{...}) not terminated or not {}-balanced"); } while (s < regparse) - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } } @@ -1345,7 +1354,7 @@ S_scan_const(pTHX_ char *start) else if (*s == '#' && PL_lex_inpat && ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) { while (s+1 < send && *s != '\n') - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,*s++); } /* check for embedded arrays @@ -1365,14 +1374,16 @@ S_scan_const(pTHX_ char *start) break; /* in regexp, $ might be tail anchor */ } + /* End of else if chain - OP_TRANS rejoin rest */ + /* backslashes */ if (*s == '\\' && s+1 < send) { s++; /* some backslashes we leave behind */ if (*leaveit && *s && strchr(leaveit, *s)) { - *d++ = '\\'; - *d++ = *s++; + *d++ = NATIVE_TO_NEED(has_utf8,'\\'); + *d++ = NATIVE_TO_NEED(has_utf8,*s++); continue; } @@ -1448,15 +1459,13 @@ S_scan_const(pTHX_ char *start) NUM_ESCAPE_INSERT: /* Insert oct or hex escaped character. * There will always enough room in sv since such - * escapes will be longer than any UT-F8 sequence + * escapes will be longer than any UTF-8 sequence * they can end up as. */ - - /* This spot is wrong for EBCDIC. Characters like - * the lowercase letters and digits are >127 in EBCDIC, - * so here they would need to be mapped to the Unicode - * repertoire. --jhi */ - if (uv > 127) { + /* We need to map to chars to ASCII before doing the tests + to cover EBCDIC + */ + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) { if (!has_utf8 && uv > 255) { /* Might need to recode whatever we have * accumulated so far if it contains any @@ -1465,33 +1474,30 @@ S_scan_const(pTHX_ char *start) * (Can't we keep track of that and avoid * this rescan? --jhi) */ - int hicount = 0; - char *c; - - for (c = SvPVX(sv); c < d; c++) { - if (UTF8_IS_CONTINUED(*c)) + int hicount = 0; + U8 *c; + for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) { + if (!NATIVE_IS_INVARIANT(*c)) { hicount++; + } } if (hicount) { - char *old_pvx = SvPVX(sv); - char *src, *dst; - - d = SvGROW(sv, - SvLEN(sv) + hicount + 1) + - (d - old_pvx); - - src = d - 1; - d += hicount; - dst = d - 1; - - while (src < dst) { - if (UTF8_IS_CONTINUED(*src)) { - *dst-- = UTF8_EIGHT_BIT_LO(*src); - *dst-- = UTF8_EIGHT_BIT_HI(*src--); + STRLEN offset = d - SvPVX(sv); + U8 *src, *dst; + d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset; + src = (U8 *)d - 1; + dst = src+hicount; + d += hicount; + while (src >= (U8 *)SvPVX(sv)) { + if (!NATIVE_IS_INVARIANT(*src)) { + U8 ch = NATIVE_TO_ASCII(*src); + *dst-- = UTF8_EIGHT_BIT_LO(ch); + *dst-- = UTF8_EIGHT_BIT_HI(ch); } else { - *dst-- = *src--; + *dst-- = *src; } + src--; } } } @@ -1504,7 +1510,6 @@ S_scan_const(pTHX_ char *start) PL_sublex_info.sub_op->op_private |= (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; } } else { @@ -1512,7 +1517,7 @@ S_scan_const(pTHX_ char *start) } } else { - *d++ = (char)uv; + *d++ = (char) uv; } continue; @@ -1566,51 +1571,38 @@ S_scan_const(pTHX_ char *start) /* \c is a control character */ case 'c': s++; -#ifdef EBCDIC - *d = *s++; - if (isLOWER(*d)) - *d = toUPPER(*d); - *d = toCTRL(*d); - d++; -#else { U8 c = *s++; - *d++ = toCTRL(c); - } +#ifdef EBCDIC + if (isLOWER(c)) + c = toUPPER(c); #endif + *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c)); + } continue; /* printf-style backslashes, formfeeds, newlines, etc */ case 'b': - *d++ = '\b'; + *d++ = NATIVE_TO_NEED(has_utf8,'\b'); break; case 'n': - *d++ = '\n'; + *d++ = NATIVE_TO_NEED(has_utf8,'\n'); break; case 'r': - *d++ = '\r'; + *d++ = NATIVE_TO_NEED(has_utf8,'\r'); break; case 'f': - *d++ = '\f'; + *d++ = NATIVE_TO_NEED(has_utf8,'\f'); break; case 't': - *d++ = '\t'; - break; -#ifdef EBCDIC - case 'e': - *d++ = '\047'; /* CP 1047 */ + *d++ = NATIVE_TO_NEED(has_utf8,'\t'); break; - case 'a': - *d++ = '\057'; /* CP 1047 */ - break; -#else case 'e': - *d++ = '\033'; + *d++ = ASCII_TO_NEED(has_utf8,'\033'); break; case 'a': - *d++ = '\007'; + *d++ = ASCII_TO_NEED(has_utf8,'\007'); break; -#endif } /* end switch */ s++; @@ -1618,41 +1610,40 @@ S_scan_const(pTHX_ char *start) } /* end if (backslash) */ default_action: - if (UTF8_IS_CONTINUED(*s) && (this_utf8 || has_utf8)) { - STRLEN len = (STRLEN) -1; - UV uv; - if (this_utf8) { - uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0); - } - if (len == (STRLEN)-1) { - /* Illegal UTF8 (a high-bit byte), make it valid. */ - char *old_pvx = SvPVX(sv); - /* need space for one extra char (NOTE: SvCUR() not set here) */ - d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); - d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++); - } - else { - while (len--) - *d++ = *s++; - } - has_utf8 = TRUE; - if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { - PL_sublex_info.sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - utf = TRUE; - } - continue; - } - - *d++ = *s++; + /* If we started with encoded form, or already know we want it + and then encode the next character */ + if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) { + STRLEN len = 1; + UV uv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s); + STRLEN need = UNISKIP(NATIVE_TO_UNI(uv)); + s += len; + if (need > len) { + /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */ + STRLEN off = d - SvPVX(sv); + d = SvGROW(sv, SvLEN(sv) + (need-len)) + off; + } + d = (char*)uvchr_to_utf8((U8*)d, uv); + has_utf8 = TRUE; + } + else { + *d++ = NATIVE_TO_NEED(has_utf8,*s++); + } } /* while loop to process each character */ /* terminate the string and set up the sv */ *d = '\0'; SvCUR_set(sv, d - SvPVX(sv)); + if (SvCUR(sv) >= SvLEN(sv)) + Perl_croak(aTHX_ "panic:constant overflowed allocated space"); + SvPOK_on(sv); - if (has_utf8) + if (has_utf8) { SvUTF8_on(sv); + if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { + PL_sublex_info.sub_op->op_private |= + (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); + } + } /* shrink the sv if we allocated more than we used */ if (SvCUR(sv) + 5 < SvLEN(sv)) { @@ -2856,6 +2847,8 @@ Perl_yylex(pTHX) s++; if (s < d) s++; + else if (s > d) /* Found by Ilya: feed random input to Perl. */ + Perl_croak(aTHX_ "panic: input overflow"); incline(s); if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) { PL_bufptr = s; @@ -3739,7 +3732,7 @@ Perl_yylex(pTHX) missingterm((char*)0); yylval.ival = OP_CONST; for (d = SvPV(PL_lex_stuff, len); len; len--, d++) { - if (*d == '$' || *d == '@' || *d == '\\' || UTF8_IS_CONTINUED(*d)) { + if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) { yylval.ival = OP_STRINGIFY; break; } @@ -4191,6 +4184,12 @@ Perl_yylex(pTHX) (void)PerlIO_seek(PL_rsfp, 0L, 0); } if (PerlLIO_setmode(PerlIO_fileno(PL_rsfp), O_TEXT) != -1) { +#ifdef PERLIO_IS_STDIO /* really? */ +# if defined(__BORLANDC__) + /* XXX see note in do_binmode() */ + ((FILE*)PL_rsfp)->flags &= ~_F_BIN; +# endif +#endif if (loc > 0) PerlIO_seek(PL_rsfp, loc, 0); } @@ -6711,7 +6710,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* after skipping whitespace, the next character is the terminator */ term = *s; - if (UTF8_IS_CONTINUED(term) && UTF) + if (!UTF8_IS_INVARIANT((U8)term) && UTF) has_utf8 = TRUE; /* mark where we are */ @@ -6758,7 +6757,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) have found the terminator */ else if (*s == term) break; - else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6787,7 +6786,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) break; else if (*s == PL_multi_open) brackets++; - else if (!has_utf8 && UTF8_IS_CONTINUED(*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; } @@ -6879,11 +6878,11 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) Read a number in any of the formats that Perl accepts: - 0(x[0-7A-F]+)|([0-7]+)|(b[01]) - [\d_]+(\.[\d_]*)?[Ee](\d+) - - Underbars (_) are allowed in decimal numbers. If -w is on, - underbars before a decimal point must be at three digit intervals. + \d(_?\d)*(\.(\d(_?\d)*)?)?[Ee](\d+) 12 12.34 12. + \.\d(_?\d)*[Ee](\d+) .34 + 0b[01](_?[01])* + 0[0-7](_?[0-7])* + 0x[0-9A-Fa-f](_?[0-9A-Fa-f])* Like most scan_ routines, it uses the PL_tokenbuf buffer to hold the thing it reads. @@ -6953,8 +6952,17 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) else if (s[1] == '.' || s[1] == 'e' || s[1] == 'E') goto decimal; /* so it must be octal */ - else + else { shift = 3; + s++; + } + + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; + } base = bases[shift]; Base = Bases[shift]; @@ -6972,9 +6980,12 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) default: goto out; - /* _ are ignored */ + /* _ are ignored -- but warned about if consecutive */ case '_': - s++; + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; break; /* 8 and 9 are not octal */ @@ -7041,6 +7052,13 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) the number. */ out: + + /* final misplaced underbar check */ + if (s[-1] == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); + } + sv = NEWSV(92,0); if (overflowed) { if (ckWARN(WARN_PORTABLE) && n > 4294967295.0) @@ -7080,9 +7098,10 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) if -w is on */ if (*s == '_') { - if (ckWARN(WARN_SYNTAX) && lastub && s - lastub != 3) - Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); - lastub = ++s; + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s++; } else { /* check for end of fixed-length buffer */ @@ -7094,7 +7113,7 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) } /* final misplaced underbar check */ - if (lastub && s - lastub != 3) { + if (lastub && s == lastub + 1) { if (ckWARN(WARN_SYNTAX)) Perl_warner(aTHX_ WARN_SYNTAX, "Misplaced _ in number"); } @@ -7107,16 +7126,34 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) floatit = TRUE; *d++ = *s++; - /* copy, ignoring underbars, until we run out of - digits. Note: no misplaced underbar checks! + if (*s == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s; + } + + /* copy, ignoring underbars, until we run out of digits. */ for (; isDIGIT(*s) || *s == '_'; s++) { /* fixed length buffer check */ if (d >= e) Perl_croak(aTHX_ number_too_long); - if (*s != '_') + if (*s == '_') { + if (ckWARN(WARN_SYNTAX) && lastub && s == lastub + 1) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + lastub = s; + } + else *d++ = *s; } + /* fractional part ending in underbar? */ + if (s[-1] == '_') { + if (ckWARN(WARN_SYNTAX)) + Perl_warner(aTHX_ WARN_SYNTAX, + "Misplaced _ in number"); + } if (*s == '.' && isDIGIT(s[1])) { /* oops, it's really a v-string, but without the "v" */ s = start - 1; @@ -7245,7 +7282,7 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; if (!isALPHA(*pos)) { - UV rev, revmax = 0; + UV rev; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tmpend; s++; /* get past 'v' */ @@ -7275,9 +7312,9 @@ vstring: } /* Append native character for the rev point */ tmpend = uvchr_to_utf8(tmpbuf, rev); - if (rev > revmax) - revmax = rev; sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev))) + SvUTF8_on(sv); if (*pos == '.' && isDIGIT(pos[1])) s = ++pos; else { @@ -7287,14 +7324,8 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; } - SvPOK_on(sv); SvREADONLY_on(sv); - /* if (revmax > 127) { */ - SvUTF8_on(sv); /* - if (revmax < 256) - sv_utf8_downgrade(sv, TRUE); - } */ } } break;