X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=toke.c;h=544a42950b0584fb43cf398ecf92a01793d1ee29;hb=43d2322dfe15a2c5bb24cfb950929e9c6bfb5cdd;hp=c60a929161cc14f70b7d64c9b6b1b6c0a8f214ae;hpb=93189314521460c01625b05f7cfa81ac855affa9;p=p5sagit%2Fp5-mst-13.2.git diff --git a/toke.c b/toke.c index c60a929..544a429 100644 --- a/toke.c +++ b/toke.c @@ -1,6 +1,7 @@ /* toke.c * - * Copyright (c) 1991-2002, Larry Wall + * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + * 2000, 2001, 2002, 2003, 2004, by Larry Wall and others * * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. @@ -22,8 +23,8 @@ #define PERL_IN_TOKE_C #include "perl.h" -#define yychar PL_yychar -#define yylval PL_yylval +#define yychar (*PL_yycharp) +#define yylval (*PL_yylvalp) static char ident_too_long[] = "Identifier too long"; static char c_without_g[] = "Use of /c modifier is meaningless without /g"; @@ -78,22 +79,6 @@ static I32 utf16rev_textfilter(pTHX_ int idx, SV *sv, int maxlen); #undef ff_next #endif -#ifdef USE_PURE_BISON -# ifndef YYMAXLEVEL -# define YYMAXLEVEL 100 -# endif -YYSTYPE* yylval_pointer[YYMAXLEVEL]; -int* yychar_pointer[YYMAXLEVEL]; -int yyactlevel = -1; -# undef yylval -# undef yychar -# define yylval (*yylval_pointer[yyactlevel]) -# define yychar (*yychar_pointer[yyactlevel]) -# define PERL_YYLEX_PARAM yylval_pointer[yyactlevel],yychar_pointer[yyactlevel] -# undef yylex -# define yylex() Perl_yylex_r(aTHX_ yylval_pointer[yyactlevel],yychar_pointer[yyactlevel]) -#endif - #include "keywords.h" /* CLINE is a macro that ensures PL_copline has a sane value */ @@ -255,18 +240,23 @@ S_no_op(pTHX_ char *what, char *s) else PL_bufptr = s; yywarn(Perl_form(aTHX_ "%s found where operator expected", what)); - if (is_first) - Perl_warn(aTHX_ "\t(Missing semicolon on previous line?)\n"); - else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) { - char *t; - for (t = PL_oldoldbufptr; *t && (isALNUM_lazy_if(t,UTF) || *t == ':'); t++) ; - if (t < PL_bufptr && isSPACE(*t)) - Perl_warn(aTHX_ "\t(Do you need to predeclare %.*s?)\n", - t - PL_oldoldbufptr, PL_oldoldbufptr); - } - else { - assert(s >= oldbp); - Perl_warn(aTHX_ "\t(Missing operator before %.*s?)\n", s - oldbp, oldbp); + if (ckWARN_d(WARN_SYNTAX)) { + if (is_first) + Perl_warner(aTHX_ packWARN(WARN_SYNTAX), + "\t(Missing semicolon on previous line?)\n"); + else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) { + char *t; + for (t = PL_oldoldbufptr; *t && (isALNUM_lazy_if(t,UTF) || *t == ':'); t++) ; + if (t < PL_bufptr && isSPACE(*t)) + Perl_warner(aTHX_ packWARN(WARN_SYNTAX), + "\t(Do you need to predeclare %.*s?)\n", + t - PL_oldoldbufptr, PL_oldoldbufptr); + } + else { + assert(s >= oldbp); + Perl_warner(aTHX_ packWARN(WARN_SYNTAX), + "\t(Missing operator before %.*s?)\n", s - oldbp, oldbp); + } } PL_bufptr = oldbp; } @@ -424,8 +414,8 @@ Perl_lex_start(pTHX_ SV *line) SAVEPPTR(PL_last_uni); SAVEPPTR(PL_linestart); SAVESPTR(PL_linestr); - SAVEPPTR(PL_lex_brackstack); - SAVEPPTR(PL_lex_casestack); + SAVEGENERICPV(PL_lex_brackstack); + SAVEGENERICPV(PL_lex_casestack); SAVEDESTRUCTOR_X(restore_rsfp, PL_rsfp); SAVESPTR(PL_lex_stuff); SAVEI32(PL_lex_defer); @@ -440,8 +430,6 @@ Perl_lex_start(pTHX_ SV *line) PL_lex_brackets = 0; New(899, PL_lex_brackstack, 120, char); New(899, PL_lex_casestack, 12, char); - SAVEFREEPV(PL_lex_brackstack); - SAVEFREEPV(PL_lex_casestack); PL_lex_casemods = 0; *PL_lex_casestack = '\0'; PL_lex_dojoin = 0; @@ -683,7 +671,7 @@ S_check_uni(pTHX) char ch = *s; *s = '\0'; Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS), - "Warning: Use of \"%s\" without parens is ambiguous", + "Warning: Use of \"%s\" without parentheses is ambiguous", PL_last_uni); *s = ch; } @@ -787,6 +775,8 @@ S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow } PL_nextval[PL_nexttoke].opval = (OP*)newSVOP(OP_CONST,0, newSVpv(PL_tokenbuf,0)); PL_nextval[PL_nexttoke].opval->op_private |= OPpCONST_BARE; + if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len)) + SvUTF8_on(((SVOP*)PL_nextval[PL_nexttoke].opval)->op_sv); force_next(token); } return s; @@ -1052,8 +1042,8 @@ S_sublex_push(pTHX) SAVEPPTR(PL_last_uni); SAVEPPTR(PL_linestart); SAVESPTR(PL_linestr); - SAVEPPTR(PL_lex_brackstack); - SAVEPPTR(PL_lex_casestack); + SAVEGENERICPV(PL_lex_brackstack); + SAVEGENERICPV(PL_lex_casestack); PL_linestr = PL_lex_stuff; PL_lex_stuff = Nullsv; @@ -1068,8 +1058,6 @@ S_sublex_push(pTHX) PL_lex_brackets = 0; New(899, PL_lex_brackstack, 120, char); New(899, PL_lex_casestack, 12, char); - SAVEFREEPV(PL_lex_brackstack); - SAVEFREEPV(PL_lex_casestack); PL_lex_casemods = 0; *PL_lex_casestack = '\0'; PL_lex_starts = 0; @@ -1232,7 +1220,7 @@ S_scan_const(pTHX_ char *start) const char *leaveit = /* set of acceptably-backslashed characters */ PL_lex_inpat - ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" + ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxz0123456789[{]} \t\n\r\f\v#" : ""; if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { @@ -1273,7 +1261,7 @@ S_scan_const(pTHX_ char *start) if (min > max) { Perl_croak(aTHX_ - "Invalid [] range \"%c-%c\" in transliteration operator", + "Invalid range \"%c-%c\" in transliteration operator", (char)min, (char)max); } @@ -1325,7 +1313,7 @@ S_scan_const(pTHX_ char *start) except for the last char, which will be done separately. */ else if (*s == '(' && PL_lex_inpat && s[1] == '?') { if (s[2] == '#') { - while (s < send && *s != ')') + while (s+1 < send && *s != ')') *d++ = NATIVE_TO_NEED(has_utf8,*s++); } else if (s[2] == '{' /* This should match regcomp.c */ @@ -1344,10 +1332,8 @@ S_scan_const(pTHX_ char *start) count--; regparse++; } - if (*regparse != ')') { + if (*regparse != ')') regparse--; /* Leave one char for continuation. */ - yyerror("Sequence (?{...}) not terminated or not {}-balanced"); - } while (s < regparse) *d++ = NATIVE_TO_NEED(has_utf8,*s++); } @@ -1611,7 +1597,7 @@ S_scan_const(pTHX_ char *start) /* \c is a control character */ case 'c': s++; - { + if (s < send) { U8 c = *s++; #ifdef EBCDIC if (isLOWER(c)) @@ -1619,6 +1605,9 @@ S_scan_const(pTHX_ char *start) #endif *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c)); } + else { + yyerror("Missing control char name in \\c"); + } continue; /* printf-style backslashes, formfeeds, newlines, etc */ @@ -1674,17 +1663,18 @@ S_scan_const(pTHX_ char *start) *d = '\0'; SvCUR_set(sv, d - SvPVX(sv)); if (SvCUR(sv) >= SvLEN(sv)) - Perl_croak(aTHX_ "panic: constant overflowed allocated space"); + Perl_croak(aTHX_ "panic: constant overflowed allocated space"); SvPOK_on(sv); if (PL_encoding && !has_utf8) { - sv_recode_to_utf8(sv, PL_encoding); - has_utf8 = TRUE; + sv_recode_to_utf8(sv, PL_encoding); + if (SvUTF8(sv)) + has_utf8 = TRUE; } if (has_utf8) { SvUTF8_on(sv); if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { - PL_sublex_info.sub_op->op_private |= + PL_sublex_info.sub_op->op_private |= (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); } } @@ -2141,7 +2131,7 @@ S_find_in_my_stash(pTHX_ char *pkgname, I32 len) #ifdef DEBUGGING static char* exp_name[] = { "OPERATOR", "TERM", "REF", "STATE", "BLOCK", "ATTRBLOCK", - "ATTRTERM", "TERMBLOCK" + "ATTRTERM", "TERMBLOCK", "TERMORDORDOR" }; #endif @@ -2170,26 +2160,6 @@ S_find_in_my_stash(pTHX_ char *pkgname, I32 len) if we already built the token before, use it. */ -#ifdef USE_PURE_BISON -int -Perl_yylex_r(pTHX_ YYSTYPE *lvalp, int *lcharp) -{ - int r; - - yyactlevel++; - yylval_pointer[yyactlevel] = lvalp; - yychar_pointer[yyactlevel] = lcharp; - if (yyactlevel >= YYMAXLEVEL) - Perl_croak(aTHX_ "panic: YYMAXLEVEL"); - - r = Perl_yylex(aTHX); - - if (yyactlevel > 0) - yyactlevel--; - - return r; -} -#endif #ifdef __SC__ #pragma segment Perl_yylex @@ -2204,6 +2174,7 @@ Perl_yylex(pTHX) GV *gv = Nullgv; GV **gvp = 0; bool bof = FALSE; + I32 orig_keyword = 0; /* check if there's an identifier for us to look at */ if (PL_pending_ident) @@ -2265,39 +2236,40 @@ Perl_yylex(pTHX) DEBUG_T({ PerlIO_printf(Perl_debug_log, "### Saw case modifier at '%s'\n", PL_bufptr); }); s = PL_bufptr + 1; - if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3)) - tmp = *s, *s = s[2], s[2] = (char)tmp; /* misordered... */ - if (strchr("LU", *s) && - (strchr(PL_lex_casestack, 'L') || strchr(PL_lex_casestack, 'U'))) - { - PL_lex_casestack[--PL_lex_casemods] = '\0'; - return ')'; + if (s[1] == '\\' && s[2] == 'E') { + PL_bufptr = s + 3; + PL_lex_state = LEX_INTERPCONCAT; + return yylex(); } - if (PL_lex_casemods > 10) { - char* newlb = Renew(PL_lex_casestack, PL_lex_casemods + 2, char); - if (newlb != PL_lex_casestack) { - SAVEFREEPV(newlb); - PL_lex_casestack = newlb; + else { + if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3)) + tmp = *s, *s = s[2], s[2] = (char)tmp; /* misordered... */ + if (strchr("LU", *s) && + (strchr(PL_lex_casestack, 'L') || strchr(PL_lex_casestack, 'U'))) { + PL_lex_casestack[--PL_lex_casemods] = '\0'; + return ')'; } + if (PL_lex_casemods > 10) + Renew(PL_lex_casestack, PL_lex_casemods + 2, char); + PL_lex_casestack[PL_lex_casemods++] = *s; + PL_lex_casestack[PL_lex_casemods] = '\0'; + PL_lex_state = LEX_INTERPCONCAT; + PL_nextval[PL_nexttoke].ival = 0; + force_next('('); + if (*s == 'l') + PL_nextval[PL_nexttoke].ival = OP_LCFIRST; + else if (*s == 'u') + PL_nextval[PL_nexttoke].ival = OP_UCFIRST; + else if (*s == 'L') + PL_nextval[PL_nexttoke].ival = OP_LC; + else if (*s == 'U') + PL_nextval[PL_nexttoke].ival = OP_UC; + else if (*s == 'Q') + PL_nextval[PL_nexttoke].ival = OP_QUOTEMETA; + else + Perl_croak(aTHX_ "panic: yylex"); + PL_bufptr = s + 1; } - PL_lex_casestack[PL_lex_casemods++] = *s; - PL_lex_casestack[PL_lex_casemods] = '\0'; - PL_lex_state = LEX_INTERPCONCAT; - PL_nextval[PL_nexttoke].ival = 0; - force_next('('); - if (*s == 'l') - PL_nextval[PL_nexttoke].ival = OP_LCFIRST; - else if (*s == 'u') - PL_nextval[PL_nexttoke].ival = OP_UCFIRST; - else if (*s == 'L') - PL_nextval[PL_nexttoke].ival = OP_LC; - else if (*s == 'U') - PL_nextval[PL_nexttoke].ival = OP_UC; - else if (*s == 'Q') - PL_nextval[PL_nexttoke].ival = OP_QUOTEMETA; - else - Perl_croak(aTHX_ "panic: yylex"); - PL_bufptr = s + 1; force_next(FUNC); if (PL_lex_starts) { s = PL_bufptr; @@ -2322,13 +2294,7 @@ Perl_yylex(pTHX) if (PL_lex_dojoin) { PL_nextval[PL_nexttoke].ival = 0; force_next(','); -#ifdef USE_5005THREADS - PL_nextval[PL_nexttoke].opval = newOP(OP_THREADSV, 0); - PL_nextval[PL_nexttoke].opval->op_targ = find_threadsv("\""); - force_next(PRIVATEREF); -#else force_ident("\"", '$'); -#endif /* USE_5005THREADS */ PL_nextval[PL_nexttoke].ival = 0; force_next('$'); PL_nextval[PL_nexttoke].ival = 0; @@ -2430,8 +2396,12 @@ Perl_yylex(pTHX) if (!PL_rsfp) { PL_last_uni = 0; PL_last_lop = 0; - if (PL_lex_brackets) - yyerror("Missing right curly or square bracket"); + if (PL_lex_brackets) { + if (PL_lex_formbrack) + yyerror("Format not terminated"); + else + yyerror("Missing right curly or square bracket"); + } DEBUG_T( { PerlIO_printf(Perl_debug_log, "### Tokener got EOF\n"); } ); @@ -2527,8 +2497,13 @@ Perl_yylex(pTHX) sv_setpv(PL_linestr,""); TOKEN(';'); /* not infinite loop because rsfp is NULL now */ } - /* if it looks like the start of a BOM, check if it in fact is */ - else if (bof && (!*s || *(U8*)s == 0xEF || *(U8*)s >= 0xFE)) { + /* If it looks like the start of a BOM or raw UTF-16, + * check if it in fact is. */ + else if (bof && + (*s == 0 || + *(U8*)s == 0xEF || + *(U8*)s >= 0xFE || + s[1] == 0)) { #ifdef PERLIO_IS_STDIO # ifdef __GNU_LIBRARY__ # if __GNU_LIBRARY__ == 1 /* Linux glibc5 */ @@ -2707,7 +2682,9 @@ Perl_yylex(pTHX) else newargv = PL_origargv; newargv[0] = ipath; + PERL_FPU_PRE_EXEC PerlProc_execv(ipath, EXEC_ARGV_CAST(newargv)); + PERL_FPU_POST_EXEC Perl_croak(aTHX_ "Can't exec %s", ipath); } #endif @@ -2730,6 +2707,14 @@ Perl_yylex(pTHX) } d = moreswitches(d); } while (d); + if (PL_doswitches && !switches_done) { + int argc = PL_origargc; + char **argv = PL_origargv; + do { + argc--,argv++; + } while (argc && argv[0][0] == '-' && argv[0][1]); + init_argv_symbols(argc,argv); + } if ((PERLDB_LINE && !oldpdb) || ((PL_minus_n || PL_minus_p) && !(oldn || oldp))) /* if we have already added "LINE: while (<>) {", @@ -2868,10 +2853,10 @@ Perl_yylex(pTHX) /* Assume it was a minus followed by a one-letter named * subroutine call (or a -bareword), then. */ DEBUG_T( { PerlIO_printf(Perl_debug_log, - "### %c looked like a file test but was not\n", - (int)ftst); + "### '-%c' looked like a file test but was not\n", + tmp); } ); - s -= 2; + s = --PL_bufptr; } } tmp = *s++; @@ -2943,8 +2928,6 @@ Perl_yylex(pTHX) PL_tokenbuf[0] = '%'; s = scan_ident(s, PL_bufend, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, TRUE); if (!PL_tokenbuf[1]) { - if (s == PL_bufend) - yyerror("Final % should be \\% or %name"); PREREF('%'); } PL_pending_ident = '%'; @@ -3021,19 +3004,27 @@ Perl_yylex(pTHX) PL_lex_stuff = Nullsv; } else { + if (len == 6 && strnEQ(s, "unique", len)) { + if (PL_in_my == KEY_our) +#ifdef USE_ITHREADS + GvUNIQUE_on(cGVOPx_gv(yylval.opval)); +#else + ; /* skip to avoid loading attributes.pm */ +#endif + else + Perl_croak(aTHX_ "The 'unique' attribute may only be applied to 'our' variables"); + } + /* NOTE: any CV attrs applied here need to be part of the CVf_BUILTIN_ATTRS define in cv.h! */ - if (!PL_in_my && len == 6 && strnEQ(s, "lvalue", len)) + else if (!PL_in_my && len == 6 && strnEQ(s, "lvalue", len)) CvLVALUE_on(PL_compcv); else if (!PL_in_my && len == 6 && strnEQ(s, "locked", len)) CvLOCKED_on(PL_compcv); else if (!PL_in_my && len == 6 && strnEQ(s, "method", len)) CvMETHOD_on(PL_compcv); -#ifdef USE_ITHREADS - else if (PL_in_my == KEY_our && len == 6 && - strnEQ(s, "unique", len)) - GvUNIQUE_on(cGVOPx_gv(yylval.opval)); -#endif + else if (!PL_in_my && len == 9 && strnEQ(s, "assertion", len)) + CvASSERTION_on(PL_compcv); /* After we've set the flags, it could be argued that we don't need to do the attributes.pm-based setting process, and shouldn't bother appending recognized @@ -3090,6 +3081,7 @@ Perl_yylex(pTHX) PL_oldbufptr = PL_oldoldbufptr; /* allow print(STDOUT 123) */ else PL_expect = XTERM; + s = skipspace(s); TOKEN('('); case ';': CLINE; @@ -3118,11 +3110,7 @@ Perl_yylex(pTHX) leftbracket: s++; if (PL_lex_brackets > 100) { - char* newlb = Renew(PL_lex_brackstack, PL_lex_brackets + 1, char); - if (newlb != PL_lex_brackstack) { - SAVEFREEPV(newlb); - PL_lex_brackstack = newlb; - } + Renew(PL_lex_brackstack, PL_lex_brackets + 10, char); } switch (PL_expect) { case XTERM: @@ -3215,12 +3203,17 @@ Perl_yylex(pTHX) || ((*t == 'q' || *t == 'x') && ++t < PL_bufend && !isALNUM(*t)))) { + /* skip q//-like construct */ char *tmps; char open, close, term; I32 brackets = 1; while (t < PL_bufend && isSPACE(*t)) t++; + /* check for q => */ + if (t+1 < PL_bufend && t[0] == '=' && t[1] == '>') { + OPERATOR(HASHBRACK); + } term = *t; open = term; if (term && (tmps = strchr("([{< )]}> )]}>",term))) @@ -3233,7 +3226,7 @@ Perl_yylex(pTHX) else if (*t == open) break; } - else + else { for (t++; t < PL_bufend; t++) { if (*t == '\\' && t+1 < PL_bufend) t++; @@ -3242,8 +3235,13 @@ Perl_yylex(pTHX) else if (*t == open) brackets++; } + } + t++; } - t++; + else + /* skip plain q word */ + while (t < PL_bufend && isALNUM_lazy_if(t,UTF)) + t += UTF8SKIP(t); } else if (isALNUM_lazy_if(t,UTF)) { t += UTF8SKIP(t); @@ -3389,8 +3387,24 @@ Perl_yylex(pTHX) case '!': s++; tmp = *s++; - if (tmp == '=') + if (tmp == '=') { + /* was this !=~ where !~ was meant? + * warn on m:!=~\s+([/?]|[msy]\W|tr\W): */ + + if (*s == '~' && ckWARN(WARN_SYNTAX)) { + char *t = s+1; + + while (t < PL_bufend && isSPACE(*t)) + ++t; + + if (*t == '/' || *t == '?' || + ((*t == 'm' || *t == 's' || *t == 'y') && !isALNUM(t[1])) || + (*t == 't' && t[1] == 'r' && !isALNUM(t[2]))) + Perl_warner(aTHX_ packWARN(WARN_SYNTAX), + "!=~ should be !~"); + } Eop(OP_NE); + } if (tmp == '~') PMop(OP_NOT); s--; @@ -3543,9 +3557,7 @@ Perl_yylex(pTHX) } } else { - GV *gv = gv_fetchpv(tmpbuf, FALSE, SVt_PVCV); - if (gv && GvCVu(gv)) - PL_expect = XTERM; /* e.g. print $fh subr() */ + PL_expect = XTERM; /* e.g. print $fh subr() */ } } else if (isDIGIT(*s)) @@ -3569,8 +3581,6 @@ Perl_yylex(pTHX) PL_tokenbuf[0] = '@'; s = scan_ident(s, PL_bufend, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE); if (!PL_tokenbuf[1]) { - if (s == PL_bufend) - yyerror("Final @ should be \\@ or @name"); PREREF('@'); } if (PL_lex_state == LEX_NORMAL) @@ -3802,6 +3812,7 @@ Perl_yylex(pTHX) case 'z': case 'Z': keylookup: { + orig_keyword = 0; gv = Nullgv; gvp = 0; @@ -3866,6 +3877,7 @@ Perl_yylex(pTHX) } } if (ogv) { + orig_keyword = tmp; tmp = 0; /* overridden by import or by GLOBAL */ } else if (gv && !gvp @@ -4041,7 +4053,9 @@ Perl_yylex(pTHX) /* If followed by a bareword, see if it looks like indir obj. */ - if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp = intuit_method(s,gv))) + if (!orig_keyword + && (isIDFIRST_lazy_if(s,UTF) || *s == '$') + && (tmp = intuit_method(s,gv))) return tmp; /* Not a method, so call it a subroutine (if defined) */ @@ -4076,6 +4090,8 @@ Perl_yylex(pTHX) TERM(FUNC0SUB); if (strEQ(proto, "$")) OPERATOR(UNIOPSUB); + while (*proto == ';') + proto++; if (*proto == '&' && *s == '{') { sv_setpv(PL_subname, PL_curstash ? "__ANON__" : "__ANON__::__ANON__"); @@ -4190,8 +4206,29 @@ Perl_yylex(pTHX) } #endif #ifdef PERLIO_LAYERS - if (UTF && !IN_BYTES) - PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8"); + if (!IN_BYTES) { + if (UTF) + PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8"); + else if (PL_encoding) { + SV *name; + dSP; + ENTER; + SAVETMPS; + PUSHMARK(sp); + EXTEND(SP, 1); + XPUSHs(PL_encoding); + PUTBACK; + call_method("name", G_SCALAR); + SPAGAIN; + name = POPs; + PUTBACK; + PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, + Perl_form(aTHX_ ":encoding(%"SVf")", + name)); + FREETMPS; + LEAVE; + } + } #endif PL_rsfp = Nullfp; } @@ -4649,11 +4686,15 @@ Perl_yylex(pTHX) if (isIDFIRST_lazy_if(s,UTF)) { char *t; for (d = s; isALNUM_lazy_if(d,UTF); d++) ; - t = skipspace(d); - if (strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE)) + for (t=d; *t && isSPACE(*t); t++) ; + if ( *t && strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE) + /* [perl #16184] */ + && !(t[0] == '=' && t[1] == '>') + ) { Perl_warner(aTHX_ packWARN(WARN_PRECEDENCE), "Precedence problem: open %.*s should be open(%.*s)", - d-s,s, d-s,s); + d - s, s, d - s, s); + } } LOP(OP_OPEN,XTERM); @@ -5042,8 +5083,8 @@ Perl_yylex(pTHX) d[tmp] = '\0'; if (bad_proto && ckWARN(WARN_SYNTAX)) Perl_warner(aTHX_ packWARN(WARN_SYNTAX), - "Illegal character in prototype for %s : %s", - SvPVX(PL_subname), d); + "Illegal character in prototype for %"SVf" : %s", + PL_subname, d); SvCUR(PL_lex_stuff) = tmp; have_proto = TRUE; @@ -5054,6 +5095,12 @@ Perl_yylex(pTHX) if (*s == ':' && s[1] != ':') PL_expect = attrful; + else if (*s != '{' && key == KEY_sub) { + if (!have_name) + Perl_croak(aTHX_ "Illegal declaration of anonymous subroutine"); + else if (*s != ';') + Perl_croak(aTHX_ "Illegal declaration of subroutine %"SVf, PL_subname); + } if (have_proto) { PL_nextval[PL_nexttoke].opval = @@ -5237,7 +5284,7 @@ static int S_pending_ident(pTHX) { register char *d; - register I32 tmp; + register I32 tmp = 0; /* pit holds the identifier we read and pending_ident is reset */ char pit = PL_pending_ident; PL_pending_ident = 0; @@ -5257,14 +5304,14 @@ S_pending_ident(pTHX) yyerror(Perl_form(aTHX_ "No package name allowed for " "variable %s in \"our\"", PL_tokenbuf)); - tmp = pad_allocmy(PL_tokenbuf); + tmp = allocmy(PL_tokenbuf); } else { if (strchr(PL_tokenbuf,':')) yyerror(Perl_form(aTHX_ PL_no_myglob,PL_tokenbuf)); yylval.opval = newOP(OP_PADANY, 0); - yylval.opval->op_targ = pad_allocmy(PL_tokenbuf); + yylval.opval->op_targ = allocmy(PL_tokenbuf); return PRIVATEREF; } } @@ -5282,23 +5329,13 @@ S_pending_ident(pTHX) */ if (!strchr(PL_tokenbuf,':')) { -#ifdef USE_5005THREADS - /* Check for single character per-thread SVs */ - if (PL_tokenbuf[0] == '$' && PL_tokenbuf[2] == '\0' - && !isALPHA(PL_tokenbuf[1]) /* Rule out obvious non-threadsvs */ - && (tmp = find_threadsv(&PL_tokenbuf[1])) != NOT_IN_PAD) - { - yylval.opval = newOP(OP_THREADSV, 0); - yylval.opval->op_targ = tmp; - return PRIVATEREF; - } -#endif /* USE_5005THREADS */ - if ((tmp = pad_findmy(PL_tokenbuf)) != NOT_IN_PAD) { - SV *namesv = AvARRAY(PL_comppad_name)[tmp]; + if (!PL_in_my) + tmp = pad_findmy(PL_tokenbuf); + if (tmp != NOT_IN_PAD) { /* might be an "our" variable" */ - if (SvFLAGS(namesv) & SVpad_OUR) { + if (PAD_COMPNAME_FLAGS(tmp) & SVpad_OUR) { /* build ops for a bareword */ - SV *sym = newSVpv(HvNAME(GvSTASH(namesv)),0); + SV *sym = newSVpv(HvNAME(PAD_COMPNAME_OURSTASH(tmp)), 0); sv_catpvn(sym, "::", 2); sv_catpv(sym, PL_tokenbuf+1); yylval.opval = (OP*)newSVOP(OP_CONST, 0, sym); @@ -5489,7 +5526,9 @@ Perl_keyword(pTHX_ register char *d, I32 len) break; case 6: if (strEQ(d,"exists")) return KEY_exists; - if (strEQ(d,"elseif")) Perl_warn(aTHX_ "elseif should be elsif"); + if (strEQ(d,"elseif") && ckWARN_d(WARN_SYNTAX)) + Perl_warner(aTHX_ packWARN(WARN_SYNTAX), + "elseif should be elsif"); break; case 8: if (strEQ(d,"endgrent")) return -KEY_endgrent; @@ -6282,8 +6321,10 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des } if (*s == '}') { s++; - if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) + if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) { PL_lex_state = LEX_INTERPEND; + PL_expect = XREF; + } if (funny == '#') funny = '@'; if (PL_lex_state == LEX_NORMAL) { @@ -6470,7 +6511,8 @@ S_scan_trans(pTHX_ char *start) New(803, tbl, complement&&!del?258:256, short); o = newPVOP(OP_TRANS, 0, (char*)tbl); - o->op_private = del|squash|complement| + o->op_private &= ~OPpTRANS_ALL; + o->op_private |= del|squash|complement| (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)| (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF : 0); @@ -6590,7 +6632,7 @@ S_scan_heredoc(pTHX_ register char *s) sv_setpvn(tmpstr,d+1,s-d); s += len - 1; sv_catpvn(herewas,s,bufend-s); - (void)strcpy(bufptr,SvPVX(herewas)); + Copy(SvPVX(herewas),bufptr,SvCUR(herewas) + 1,char); s = olds; goto retval; @@ -6670,8 +6712,12 @@ retval: Renew(SvPVX(tmpstr), SvLEN(tmpstr), char); } SvREFCNT_dec(herewas); - if (UTF && !IN_BYTES && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) - SvUTF8_on(tmpstr); + if (!IN_BYTES) { + if (UTF && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) + SvUTF8_on(tmpstr); + else if (PL_encoding) + sv_recode_to_utf8(tmpstr, PL_encoding); + } PL_lex_stuff = tmpstr; yylval.ival = op_type; return s; @@ -6756,7 +6802,7 @@ S_scan_inputsymbol(pTHX_ char *start) /* turn <> into */ if (!len) - (void)strcpy(d,"ARGV"); + Copy("ARGV",d,5,char); /* Check whether readline() is overriden */ if (((gv_readline = gv_fetchpv("readline", FALSE, SVt_PVCV)) @@ -6777,9 +6823,9 @@ S_scan_inputsymbol(pTHX_ char *start) add symbol table ops */ if ((tmp = pad_findmy(d)) != NOT_IN_PAD) { - SV *namesv = AvARRAY(PL_comppad_name)[tmp]; - if (SvFLAGS(namesv) & SVpad_OUR) { - SV *sym = sv_2mortal(newSVpv(HvNAME(GvSTASH(namesv)),0)); + if (PAD_COMPNAME_FLAGS(tmp) & SVpad_OUR) { + SV *sym = sv_2mortal( + newSVpv(HvNAME(PAD_COMPNAME_OURSTASH(tmp)),0)); sv_catpvn(sym, "::", 2); sv_catpv(sym, d+1); d = SvPVX(sym); @@ -6890,6 +6936,10 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) register char *to; /* current position in the sv's data */ I32 brackets = 1; /* bracket nesting level */ bool has_utf8 = FALSE; /* is there any utf8 content? */ + I32 termcode; /* terminating char. code */ + U8 termstr[UTF8_MAXLEN]; /* terminating string */ + STRLEN termlen; /* length of terminating string */ + char *last = NULL; /* last position for nesting bracket */ /* skip space before the delimiter */ if (isSPACE(*s)) @@ -6900,8 +6950,16 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* after skipping whitespace, the next character is the terminator */ term = *s; - if (!UTF8_IS_INVARIANT((U8)term) && UTF) - has_utf8 = TRUE; + if (!UTF) { + termcode = termstr[0] = term; + termlen = 1; + } + else { + termcode = utf8_to_uvchr((U8*)s, &termlen); + Copy(s, termstr, termlen, U8); + if (!UTF8_IS_INVARIANT(term)) + has_utf8 = TRUE; + } /* mark where we are */ PL_multi_start = CopLINE(PL_curcop); @@ -6909,21 +6967,92 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* find corresponding closing delimiter */ if (term && (tmps = strchr("([{< )]}> )]}>",term))) - term = tmps[5]; + termcode = termstr[0] = term = tmps[5]; + PL_multi_close = term; /* create a new SV to hold the contents. 87 is leak category, I'm assuming. 79 is the SV's initial length. What a random number. */ sv = NEWSV(87,79); sv_upgrade(sv, SVt_PVIV); - SvIVX(sv) = term; + SvIVX(sv) = termcode; (void)SvPOK_only(sv); /* validate pointer */ /* move past delimiter and try to read a complete string */ if (keep_delims) - sv_catpvn(sv, s, 1); - s++; + sv_catpvn(sv, s, termlen); + s += termlen; for (;;) { + if (PL_encoding && !UTF) { + bool cont = TRUE; + + while (cont) { + int offset = s - SvPVX(PL_linestr); + bool found = sv_cat_decode(sv, PL_encoding, PL_linestr, + &offset, (char*)termstr, termlen); + char *ns = SvPVX(PL_linestr) + offset; + char *svlast = SvEND(sv) - 1; + + for (; s < ns; s++) { + if (*s == '\n' && !PL_rsfp) + CopLINE_inc(PL_curcop); + } + if (!found) + goto read_more_line; + else { + /* handle quoted delimiters */ + if (SvCUR(sv) > 1 && *(svlast-1) == '\\') { + char *t; + for (t = svlast-2; t >= SvPVX(sv) && *t == '\\';) + t--; + if ((svlast-1 - t) % 2) { + if (!keep_quoted) { + *(svlast-1) = term; + *svlast = '\0'; + SvCUR_set(sv, SvCUR(sv) - 1); + } + continue; + } + } + if (PL_multi_open == PL_multi_close) { + cont = FALSE; + } + else { + char *t, *w; + if (!last) + last = SvPVX(sv); + for (w = t = last; t < svlast; w++, t++) { + /* At here, all closes are "was quoted" one, + so we don't check PL_multi_close. */ + if (*t == '\\') { + if (!keep_quoted && *(t+1) == PL_multi_open) + t++; + else + *w++ = *t++; + } + else if (*t == PL_multi_open) + brackets++; + + *w = *t; + } + if (w < t) { + *w++ = term; + *w = '\0'; + SvCUR_set(sv, w - SvPVX(sv)); + } + last = w; + if (--brackets <= 0) + cont = FALSE; + } + } + } + if (!keep_delims) { + SvCUR_set(sv, SvCUR(sv) - 1); + *SvEND(sv) = '\0'; + } + break; + } + /* extend sv if need be */ SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1); /* set 'to' to the next character in the sv's string */ @@ -6945,8 +7074,12 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) } /* terminate when run out of buffer (the for() condition), or have found the terminator */ - else if (*s == term) - break; + else if (*s == term) { + if (termlen == 1) + break; + if (s+termlen <= PL_bufend && memEQ(s, (char*)termstr, termlen)) + break; + } else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) has_utf8 = TRUE; *to = *s; @@ -7008,6 +7141,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) to[-1] = '\n'; #endif + read_more_line: /* if we're out of file, or a read fails, bail and reset the current line marker so we can report where the unterminated string began */ @@ -7038,12 +7172,15 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) /* at this point, we have successfully read the delimited string */ - if (keep_delims) - sv_catpvn(sv, s, 1); - if (has_utf8) + if (!PL_encoding || UTF) { + if (keep_delims) + sv_catpvn(sv, s, termlen); + s += termlen; + } + if (has_utf8 || PL_encoding) SvUTF8_on(sv); + PL_multi_end = CopLINE(PL_curcop); - s++; /* if we allocated too much space, give some back */ if (SvCUR(sv) + 5 < SvLEN(sv)) { @@ -7120,6 +7257,7 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) UV u = 0; I32 shift; bool overflowed = FALSE; + bool just_zero = TRUE; /* just plain 0 or binary number? */ static NV nvshift[5] = { 1.0, 2.0, 4.0, 8.0, 16.0 }; static char* bases[5] = { "", "binary", "", "octal", "hexadecimal" }; @@ -7136,9 +7274,11 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) if (s[1] == 'x') { shift = 4; s += 2; + just_zero = FALSE; } else if (s[1] == 'b') { shift = 1; s += 2; + just_zero = FALSE; } /* check for a decimal in disguise */ else if (s[1] == '.' || s[1] == 'e' || s[1] == 'E') @@ -7210,6 +7350,7 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) */ digit: + just_zero = FALSE; if (!overflowed) { x = u << shift; /* make room for the digit */ @@ -7268,7 +7409,10 @@ Perl_scan_num(pTHX_ char *start, YYSTYPE* lvalp) #endif sv_setuv(sv, u); } - if (PL_hints & HINT_NEW_BINARY) + if (just_zero && (PL_hints & HINT_NEW_INTEGER)) + sv = new_constant(start, s - start, "integer", + sv, Nullsv, NULL); + else if (PL_hints & HINT_NEW_BINARY) sv = new_constant(start, s - start, "binary", sv, Nullsv, NULL); } break; @@ -7464,20 +7608,23 @@ S_scan_formline(pTHX_ register char *s) register char *t; SV *stuff = newSVpvn("",0); bool needargs = FALSE; + bool eofmt = FALSE; while (!needargs) { - if (*s == '.' || *s == /*{*/'}') { + if (*s == '.') { /*SUPPRESS 530*/ #ifdef PERL_STRICT_CR for (t = s+1;SPACE_OR_TAB(*t); t++) ; #else for (t = s+1;SPACE_OR_TAB(*t) || *t == '\r'; t++) ; #endif - if (*t == '\n' || t == PL_bufend) + if (*t == '\n' || t == PL_bufend) { + eofmt = TRUE; break; + } } if (PL_in_eval && !PL_rsfp) { - eol = strchr(s,'\n'); + eol = memchr(s,'\n',PL_bufend-s); if (!eol++) eol = PL_bufend; } @@ -7514,7 +7661,6 @@ S_scan_formline(pTHX_ register char *s) PL_last_lop = PL_last_uni = Nullch; if (!s) { s = PL_bufptr; - yyerror("Format not terminated"); break; } } @@ -7530,6 +7676,12 @@ S_scan_formline(pTHX_ register char *s) } else PL_lex_state = LEX_FORMLINE; + if (!IN_BYTES) { + if (UTF && is_utf8_string((U8*)SvPVX(stuff), SvCUR(stuff))) + SvUTF8_on(stuff); + else if (PL_encoding) + sv_recode_to_utf8(stuff, PL_encoding); + } PL_nextval[PL_nexttoke].opval = (OP*)newSVOP(OP_CONST, 0, stuff); force_next(THING); PL_nextval[PL_nexttoke].ival = OP_FORMLINE; @@ -7537,7 +7689,8 @@ S_scan_formline(pTHX_ register char *s) } else { SvREFCNT_dec(stuff); - PL_lex_formbrack = 0; + if (eofmt) + PL_lex_formbrack = 0; PL_bufptr = s; } return s; @@ -7557,52 +7710,22 @@ Perl_start_subparse(pTHX_ I32 is_format, U32 flags) { I32 oldsavestack_ix = PL_savestack_ix; CV* outsidecv = PL_compcv; - AV* comppadlist; if (PL_compcv) { assert(SvTYPE(PL_compcv) == SVt_PVCV); } SAVEI32(PL_subline); save_item(PL_subname); - SAVEI32(PL_padix); - SAVECOMPPAD(); - SAVESPTR(PL_comppad_name); SAVESPTR(PL_compcv); - SAVEI32(PL_comppad_name_fill); - SAVEI32(PL_min_intro_pending); - SAVEI32(PL_max_intro_pending); - SAVEI32(PL_pad_reset_pending); PL_compcv = (CV*)NEWSV(1104,0); sv_upgrade((SV *)PL_compcv, is_format ? SVt_PVFM : SVt_PVCV); CvFLAGS(PL_compcv) |= flags; - PL_comppad = newAV(); - av_push(PL_comppad, Nullsv); - PL_curpad = AvARRAY(PL_comppad); - PL_comppad_name = newAV(); - PL_comppad_name_fill = 0; - PL_min_intro_pending = 0; - PL_padix = 0; PL_subline = CopLINE(PL_curcop); -#ifdef USE_5005THREADS - av_store(PL_comppad_name, 0, newSVpvn("@_", 2)); - PL_curpad[0] = (SV*)newAV(); - SvPADMY_on(PL_curpad[0]); /* XXX Needed? */ -#endif /* USE_5005THREADS */ - - comppadlist = newAV(); - AvREAL_off(comppadlist); - av_store(comppadlist, 0, (SV*)PL_comppad_name); - av_store(comppadlist, 1, (SV*)PL_comppad); - - CvPADLIST(PL_compcv) = comppadlist; + CvPADLIST(PL_compcv) = pad_new(padnew_SAVE|padnew_SAVESUB); CvOUTSIDE(PL_compcv) = (CV*)SvREFCNT_inc(outsidecv); -#ifdef USE_5005THREADS - CvOWNER(PL_compcv) = 0; - New(666, CvMUTEXP(PL_compcv), 1, perl_mutex); - MUTEX_INIT(CvMUTEXP(PL_compcv)); -#endif /* USE_5005THREADS */ + CvOUTSIDE_SEQ(PL_compcv) = PL_cop_seqmax; return oldsavestack_ix; } @@ -7663,12 +7786,7 @@ Perl_yyerror(pTHX_ char *s) } else if (yychar > 255) where = "next token ???"; -#ifdef USE_PURE_BISON -/* GNU Bison sets the value -2 */ - else if (yychar == -2) { -#else - else if ((yychar & 127) == 127) { -#endif + else if (yychar == -2) { /* YYEMPTY */ if (PL_lex_state == LEX_NORMAL || (PL_lex_state == LEX_KNOWNEXT && PL_lex_defer == LEX_NORMAL)) where = "at end of line"; @@ -7700,8 +7818,8 @@ Perl_yyerror(pTHX_ char *s) (int)PL_multi_open,(int)PL_multi_close,(IV)PL_multi_start); PL_multi_end = 0; } - if (PL_in_eval & EVAL_WARNONLY) - Perl_warn(aTHX_ "%"SVf, msg); + if (PL_in_eval & EVAL_WARNONLY && ckWARN_d(WARN_SYNTAX)) + Perl_warner(aTHX_ packWARN(WARN_SYNTAX), "%"SVf, msg); else qerror(msg); if (PL_error_count >= 10) { @@ -7725,72 +7843,94 @@ S_swallow_bom(pTHX_ U8 *s) { STRLEN slen; slen = SvCUR(PL_linestr); - switch (*s) { + switch (s[0]) { case 0xFF: if (s[1] == 0xFE) { - /* UTF-16 little-endian */ + /* UTF-16 little-endian? (or UTF32-LE?) */ if (s[2] == 0 && s[3] == 0) /* UTF-32 little-endian */ - Perl_croak(aTHX_ "Unsupported script encoding"); + Perl_croak(aTHX_ "Unsupported script encoding UTF32-LE"); #ifndef PERL_NO_UTF16_FILTER - DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-LE script encoding\n")); + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF16-LE script encoding (BOM)\n"); s += 2; + utf16le: if (PL_bufend > (char*)s) { U8 *news; I32 newlen; filter_add(utf16rev_textfilter, NULL); New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - PL_bufend = (char*)utf16_to_utf8_reversed(s, news, - PL_bufend - (char*)s - 1, - &newlen); - Copy(news, s, newlen, U8); - SvCUR_set(PL_linestr, newlen); - PL_bufend = SvPVX(PL_linestr) + newlen; - news[newlen++] = '\0'; + PL_bufend = + (char*)utf16_to_utf8_reversed(s, news, + PL_bufend - (char*)s - 1, + &newlen); + sv_setpvn(PL_linestr, (const char*)news, newlen); Safefree(news); + SvUTF8_on(PL_linestr); + s = (U8*)SvPVX(PL_linestr); + PL_bufend = SvPVX(PL_linestr) + newlen; } #else - Perl_croak(aTHX_ "Unsupported script encoding"); + Perl_croak(aTHX_ "Unsupported script encoding UTF16-LE"); #endif } break; case 0xFE: - if (s[1] == 0xFF) { /* UTF-16 big-endian */ + if (s[1] == 0xFF) { /* UTF-16 big-endian? */ #ifndef PERL_NO_UTF16_FILTER - DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding\n")); + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (BOM)\n"); s += 2; + utf16be: if (PL_bufend > (char *)s) { U8 *news; I32 newlen; filter_add(utf16_textfilter, NULL); New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - PL_bufend = (char*)utf16_to_utf8(s, news, - PL_bufend - (char*)s, - &newlen); - Copy(news, s, newlen, U8); - SvCUR_set(PL_linestr, newlen); - PL_bufend = SvPVX(PL_linestr) + newlen; - news[newlen++] = '\0'; + PL_bufend = + (char*)utf16_to_utf8(s, news, + PL_bufend - (char*)s, + &newlen); + sv_setpvn(PL_linestr, (const char*)news, newlen); Safefree(news); + SvUTF8_on(PL_linestr); + s = (U8*)SvPVX(PL_linestr); + PL_bufend = SvPVX(PL_linestr) + newlen; } #else - Perl_croak(aTHX_ "Unsupported script encoding"); + Perl_croak(aTHX_ "Unsupported script encoding UTF16-BE"); #endif } break; case 0xEF: if (slen > 2 && s[1] == 0xBB && s[2] == 0xBF) { - DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-8 script encoding\n")); + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-8 script encoding (BOM)\n"); s += 3; /* UTF-8 */ } break; case 0: - if (slen > 3 && s[1] == 0 && /* UTF-32 big-endian */ - s[2] == 0xFE && s[3] == 0xFF) - { - Perl_croak(aTHX_ "Unsupported script encoding"); + if (slen > 3) { + if (s[1] == 0) { + if (s[2] == 0xFE && s[3] == 0xFF) { + /* UTF-32 big-endian */ + Perl_croak(aTHX_ "Unsupported script encoding UTF32-BE"); + } + } + else if (s[2] == 0 && s[3] != 0) { + /* Leading bytes + * 00 xx 00 xx + * are a good indicator of UTF-16BE. */ + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n"); + goto utf16be; + } } + default: + if (slen > 3 && s[1] == 0 && s[2] != 0 && s[3] == 0) { + /* Leading bytes + * xx 00 xx 00 + * are a good indicator of UTF-16LE. */ + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n"); + goto utf16le; + } } return (char*)s; } @@ -7852,3 +7992,89 @@ utf16rev_textfilter(pTHX_ int idx, SV *sv, int maxlen) } #endif +/* +Returns a pointer to the next character after the parsed +vstring, as well as updating the passed in sv. + +Function must be called like + + sv = NEWSV(92,5); + s = scan_vstring(s,sv); + +The sv should already be large enough to store the vstring +passed in, for performance reasons. + +*/ + +char * +Perl_scan_vstring(pTHX_ char *s, SV *sv) +{ + char *pos = s; + char *start = s; + if (*pos == 'v') pos++; /* get past 'v' */ + while (pos < PL_bufend && (isDIGIT(*pos) || *pos == '_')) + pos++; + if ( *pos != '.') { + /* this may not be a v-string if followed by => */ + char *next = pos; + while (next < PL_bufend && isSPACE(*next)) + ++next; + if ((PL_bufend - next) >= 2 && *next == '=' && next[1] == '>' ) { + /* return string not v-string */ + sv_setpvn(sv,(char *)s,pos-s); + return pos; + } + } + + if (!isALPHA(*pos)) { + UV rev; + U8 tmpbuf[UTF8_MAXLEN+1]; + U8 *tmpend; + + if (*s == 'v') s++; /* get past 'v' */ + + sv_setpvn(sv, "", 0); + + for (;;) { + rev = 0; + { + /* this is atoi() that tolerates underscores */ + char *end = pos; + UV mult = 1; + while (--end >= s) { + UV orev; + if (*end == '_') + continue; + orev = rev; + rev += (*end - '0') * mult; + mult *= 10; + if (orev > rev && ckWARN_d(WARN_OVERFLOW)) + Perl_warner(aTHX_ packWARN(WARN_OVERFLOW), + "Integer overflow in decimal number"); + } + } +#ifdef EBCDIC + if (rev > 0x7FFFFFFF) + Perl_croak(aTHX_ "In EBCDIC the v-string components cannot exceed 2147483647"); +#endif + /* Append native character for the rev point */ + tmpend = uvchr_to_utf8(tmpbuf, rev); + sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev))) + SvUTF8_on(sv); + if (pos + 1 < PL_bufend && *pos == '.' && isDIGIT(pos[1])) + s = ++pos; + else { + s = pos; + break; + } + while (pos < PL_bufend && (isDIGIT(*pos) || *pos == '_')) + pos++; + } + SvPOK_on(sv); + sv_magic(sv,NULL,PERL_MAGIC_vstring,(const char*)start, pos-start); + SvRMAGICAL_on(sv); + } + return s; +} +