# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
#endif
+/* The maximum number of characters preceding the unrecognized one to display */
+#define UNRECOGNIZED_PRECEDE_COUNT 10
+
/* In variables named $^X, these are the legal values for X.
* 1999-02-27 mjd-perl-patch@plover.com */
#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
-/* On MacOS, respect nonbreaking spaces */
-#ifdef MACOS_TRADITIONAL
-#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\312'||(c)=='\t')
-#else
#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
-#endif
/* LEX_* are values for PL_lex_state, the state of the lexer.
* They are arranged oddly so that the guard on the switch statement
if (nl)
*nl = '\0';
}
- else if (
-#ifdef EBCDIC
- iscntrl(PL_multi_close)
-#else
- PL_multi_close < 32 || PL_multi_close == 127
-#endif
- ) {
+ else if (isCNTRL(PL_multi_close)) {
*tmpbuf = '^';
tmpbuf[1] = (char)toCTRL(PL_multi_close);
tmpbuf[2] = '\0';
n = s;
while (isDIGIT(*s))
s++;
+ if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
+ return;
while (SPACE_OR_TAB(*s))
s++;
if (*s == '"' && (t = strchr(s+1, '"'))) {
handle \cV (control characters)
handle printf-style backslashes (\f, \r, \n, etc)
} (end switch)
+ continue
} (end if backslash)
+ handle regular character
} (end while character to read)
*/
{
dVAR;
register char *send = PL_bufend; /* end of the constant */
- SV *sv = newSV(send - start); /* sv for the constant */
+ SV *sv = newSV(send - start); /* sv for the constant. See
+ note below on sizing. */
register char *s = start; /* start of the constant */
register char *d = SvPVX(sv); /* destination for copies */
bool dorange = FALSE; /* are we in a translit range? */
bool didrange = FALSE; /* did we just finish a range? */
I32 has_utf8 = FALSE; /* Output constant is UTF8 */
- I32 this_utf8 = UTF; /* The source string is assumed to be UTF8 */
+ I32 this_utf8 = UTF; /* Is the source string assumed
+ to be UTF8? But, this can
+ show as true when the source
+ isn't utf8, as for example
+ when it is entirely composed
+ of hex constants */
+
+ /* Note on sizing: The scanned constant is placed into sv, which is
+ * initialized by newSV() assuming one byte of output for every byte of
+ * input. This routine expects newSV() to allocate an extra byte for a
+ * trailing NUL, which this routine will append if it gets to the end of
+ * the input. There may be more bytes of input than output (eg., \N{LATIN
+ * CAPITAL LETTER A}), or more output than input if the constant ends up
+ * recoded to utf8, but each time a construct is found that might increase
+ * the needed size, SvGROW() is called. Its size parameter each time is
+ * based on the best guess estimate at the time, namely the length used so
+ * far, plus the length the current construct will occupy, plus room for
+ * the trailing NUL, plus one byte for every input byte still unscanned */
+
UV uv;
#ifdef EBCDIC
UV literal_endpoint = 0;
goto default_action;
}
- /* \132 indicates an octal constant */
+ /* eg. \132 indicates the octal constant 0x132 */
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
{
I32 flags = 0;
STRLEN len = 3;
- uv = grok_oct(s, &len, &flags, NULL);
+ uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
s += len;
}
goto NUM_ESCAPE_INSERT;
- /* \x24 indicates a hex constant */
+ /* eg. \x24 indicates the hex constant 0x24 */
case 'x':
++s;
if (*s == '{') {
continue;
}
len = e - s;
- uv = grok_hex(s, &len, &flags, NULL);
+ uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
s = e + 1;
}
else {
{
STRLEN len = 2;
I32 flags = PERL_SCAN_DISALLOW_PREFIX;
- uv = grok_hex(s, &len, &flags, NULL);
+ uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
s += len;
}
}
NUM_ESCAPE_INSERT:
- /* Insert oct or hex escaped character.
- * There will always enough room in sv since such
- * escapes will be longer than any UTF-8 sequence
- * they can end up as. */
+ /* Insert oct, hex, or \N{U+...} escaped character. There will
+ * always be enough room in sv since such escapes will be
+ * longer than any UTF-8 sequence they can end up as, except if
+ * they force us to recode the rest of the string into utf8 */
- /* We need to map to chars to ASCII before doing the tests
- to cover EBCDIC
- */
- if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) {
+ /* Here uv is the ordinal of the next character being added in
+ * unicode (converted from native). (It has to be done before
+ * here because \N is interpreted as unicode, and oct and hex
+ * as native.) */
+ if (!UNI_IS_INVARIANT(uv)) {
if (!has_utf8 && uv > 255) {
- /* Might need to recode whatever we have
- * accumulated so far if it contains any
- * hibit chars.
- *
- * (Can't we keep track of that and avoid
- * this rescan? --jhi)
- */
- int hicount = 0;
- U8 *c;
- for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) {
- if (!NATIVE_IS_INVARIANT(*c)) {
- hicount++;
- }
- }
- if (hicount) {
- const STRLEN offset = d - SvPVX_const(sv);
- U8 *src, *dst;
- d = SvGROW(sv, SvLEN(sv) + hicount + 1) + offset;
- src = (U8 *)d - 1;
- dst = src+hicount;
- d += hicount;
- while (src >= (const U8 *)SvPVX_const(sv)) {
- if (!NATIVE_IS_INVARIANT(*src)) {
- const U8 ch = NATIVE_TO_ASCII(*src);
- *dst-- = (U8)UTF8_EIGHT_BIT_LO(ch);
- *dst-- = (U8)UTF8_EIGHT_BIT_HI(ch);
- }
- else {
- *dst-- = *src;
- }
- src--;
- }
- }
+ /* Might need to recode whatever we have accumulated so
+ * far if it contains any chars variant in utf8 or
+ * utf-ebcdic. */
+
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+ /* See Note on sizing above. */
+ sv_utf8_upgrade_flags_grow(sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ UNISKIP(uv) + (STRLEN)(send - s) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ has_utf8 = TRUE;
}
- if (has_utf8 || uv > 255) {
- d = (char*)uvchr_to_utf8((U8*)d, uv);
- has_utf8 = TRUE;
+ if (has_utf8) {
+ d = (char*)uvuni_to_utf8((U8*)d, uv);
if (PL_lex_inwhat == OP_TRANS &&
PL_sublex_info.sub_op) {
PL_sublex_info.sub_op->op_private |=
}
continue;
- /* \N{LATIN SMALL LETTER A} is a named character */
+ /* \N{LATIN SMALL LETTER A} is a named character, and so is
+ * \N{U+0041} */
case 'N':
++s;
if (*s == '{') {
goto cont_scan;
}
if (e > s + 2 && s[1] == 'U' && s[2] == '+') {
- /* \N{U+...} */
+ /* \N{U+...} The ... is a unicode value even on EBCDIC
+ * machines */
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
PERL_SCAN_DISALLOW_PREFIX;
s += 3;
}
}
#endif
+ /* If destination is not in utf8 but this new character is,
+ * recode the dest to utf8 */
if (!has_utf8 && SvUTF8(res)) {
- const char * const ostart = SvPVX_const(sv);
- SvCUR_set(sv, d - ostart);
+ SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
*d = '\0';
- sv_utf8_upgrade(sv);
- /* this just broke our allocation above... */
- SvGROW(sv, (STRLEN)(send - start));
+ /* See Note on sizing above. */
+ sv_utf8_upgrade_flags_grow(sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ len + (STRLEN)(send - s) + 1);
d = SvPVX(sv) + SvCUR(sv);
has_utf8 = TRUE;
- }
- if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
- const char * const odest = SvPVX_const(sv);
+ } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
- SvGROW(sv, (SvLEN(sv) + len - (e - s + 4)));
- d = SvPVX(sv) + (d - odest);
+ /* See Note on sizing above. (NOTE: SvCUR() is not set
+ * correctly here). */
+ const STRLEN off = d - SvPVX_const(sv);
+ d = SvGROW(sv, off + len + (STRLEN)(send - s) + 1) + off;
}
#ifdef EBCDIC
if (!dorange)
#endif
default_action:
- /* If we started with encoded form, or already know we want it
- and then encode the next character */
- if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) {
+ /* If we started with encoded form, or already know we want it,
+ then encode the next character */
+ if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
STRLEN len = 1;
+
+
+ /* One might think that it is wasted effort in the case of the
+ * source being utf8 (this_utf8 == TRUE) to take the next character
+ * in the source, convert it to an unsigned value, and then convert
+ * it back again. But the source has not been validated here. The
+ * routine that does the conversion checks for errors like
+ * malformed utf8 */
+
const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
- s += len;
- if (need > len) {
- /* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */
+ if (!has_utf8) {
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+ /* See Note on sizing above. */
+ sv_utf8_upgrade_flags_grow(sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ need + (STRLEN)(send - s) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ has_utf8 = TRUE;
+ } else if (need > len) {
+ /* encoded value larger than old, may need extra space (NOTE:
+ * SvCUR() is not set correctly here). See Note on sizing
+ * above. */
const STRLEN off = d - SvPVX_const(sv);
- d = SvGROW(sv, SvLEN(sv) + (need-len)) + off;
+ d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
}
+ s += len;
+
d = (char*)uvchr_to_utf8((U8*)d, nextuv);
- has_utf8 = TRUE;
#ifdef EBCDIC
if (uv > 255 && !dorange)
native_range = FALSE;
default:
if (isIDFIRST_lazy_if(s,UTF))
goto keylookup;
- len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart);
- Perl_croak(aTHX_ "Unrecognized character \\x%02X in column %d", *s & 255, (int) len + 1);
+ {
+ unsigned char c = *s;
+ len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart);
+ if (len > UNRECOGNIZED_PRECEDE_COUNT) {
+ d = UTF ? (char *) Perl_utf8_hop(aTHX_ (U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT) : s - UNRECOGNIZED_PRECEDE_COUNT;
+ } else {
+ d = PL_linestart;
+ }
+ *s = '\0';
+ Perl_croak(aTHX_ "Unrecognized character \\x%02X; marked by <-- HERE after %s<-- HERE near column %d", c, d, (int) len + 1);
+ }
case 4:
case 26:
goto fake_eof; /* emulate EOF on ^D or ^Z */
*s = '#'; /* Don't try to parse shebang line */
}
#endif /* ALTERNATE_SHEBANG */
-#ifndef MACOS_TRADITIONAL
if (!d &&
*s == '#' &&
ipathend > ipath &&
while (s < PL_bufend && isSPACE(*s))
s++;
if (s < PL_bufend) {
- Newxz(newargv,PL_origargc+3,char*);
+ Newx(newargv,PL_origargc+3,char*);
newargv[1] = s;
while (s < PL_bufend && !isSPACE(*s))
s++;
PERL_FPU_POST_EXEC
Perl_croak(aTHX_ "Can't exec %s", ipath);
}
-#endif
if (d) {
while (*d && !isSPACE(*d))
d++;
const char *d1 = d;
do {
- if (*d1 == 'M' || *d1 == 'm' || *d1 == 'C') {
+ bool baduni = FALSE;
+ if (*d1 == 'C') {
+ const char *d2 = d1 + 1;
+ if (parse_unicode_opts((const char **)&d2)
+ != PL_unicode)
+ baduni = TRUE;
+ }
+ if (baduni || *d1 == 'M' || *d1 == 'm') {
const char * const m = d1;
while (*d1 && !isSPACE(*d1))
d1++;
"\t(Maybe you didn't strip carriage returns after a network transfer?)\n");
#endif
case ' ': case '\t': case '\f': case 013:
-#ifdef MACOS_TRADITIONAL
- case '\312':
-#endif
#ifdef PERL_MAD
PL_realtokenstart = -1;
if (!PL_thiswhite)
BOop(OP_BIT_XOR);
case '[':
PL_lex_brackets++;
- /* FALL THROUGH */
+ {
+ const char tmp = *s++;
+ OPERATOR(tmp);
+ }
case '~':
if (s[1] == '~'
&& (PL_expect == XOPERATOR || PL_expect == XTERMORDORDOR))
case KEY_or:
case KEY_and:
case KEY_for:
+ case KEY_foreach:
case KEY_unless:
case KEY_if:
case KEY_while:
if (len == 6 && strnEQ(SvPVX(sv), "unique", len)) {
sv_free(sv);
if (PL_in_my == KEY_our) {
-#ifdef USE_ITHREADS
- GvUNIQUE_on(cGVOPx_gv(pl_yylval.opval));
-#else
- /* skip to avoid loading attributes.pm */
-#endif
deprecate(":unique");
}
else
}
else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "locked", len)) {
sv_free(sv);
- CvLOCKED_on(PL_compcv);
+ deprecate(":locked");
}
else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "method", len)) {
sv_free(sv);
pl_yylval.ival = 0;
OPERATOR(ASSIGNOP);
case '!':
- if (PL_expect == XSTATE && s[1] == '!' && s[2] == '!') {
- s += 3;
- LOP(OP_DIE,XTERM);
- }
s++;
{
const char tmp = *s++;
AOPERATOR(DORDOR);
}
case '?': /* may either be conditional or pattern */
- if (PL_expect == XSTATE && s[1] == '?' && s[2] == '?') {
- s += 3;
- LOP(OP_WARN,XTERM);
- }
if (PL_expect == XOPERATOR) {
char tmp = *s++;
if(tmp == '?') {
/* Is this a label? */
if (!tmp && PL_expect == XSTATE
&& d < PL_bufend && *d == ':' && *(d + 1) != ':') {
+ tmp = keyword(PL_tokenbuf, len, 0);
+ if (tmp)
+ Perl_croak(aTHX_ "Can't use keyword '%s' as a label", PL_tokenbuf);
s = d + 1;
pl_yylval.pval = CopLABEL_alloc(PL_tokenbuf);
CLINE;
TOKEN(LABEL);
}
-
- /* Check for keywords */
- tmp = keyword(PL_tokenbuf, len, 0);
+ else
+ /* Check for keywords */
+ tmp = keyword(PL_tokenbuf, len, 0);
/* Is this a word before a => operator? */
if (*d == '=' && d[1] == '>') {
/* Call it a bare word */
- bareword:
if (PL_hints & HINT_STRICT_SUBS)
pl_yylval.opval->op_private |= OPpCONST_STRICT;
else {
+ bareword:
+ /* after "print" and similar functions (corresponding to
+ * "F? L" in opcode.pl), whatever wasn't already parsed as
+ * a filehandle should be subject to "strict subs".
+ * Likewise for the optional indirect-object argument to system
+ * or exec, which can't be a bareword */
+ if ((PL_last_lop_op == OP_PRINT
+ || PL_last_lop_op == OP_PRTF
+ || PL_last_lop_op == OP_SAY
+ || PL_last_lop_op == OP_SYSTEM
+ || PL_last_lop_op == OP_EXEC)
+ && (PL_hints & HINT_STRICT_SUBS))
+ pl_yylval.opval->op_private |= OPpCONST_STRICT;
if (lastchar != '-') {
if (ckWARN(WARN_RESERVED)) {
d = PL_tokenbuf;
#ifdef __SC__
#pragma segment Perl_yylex
#endif
-int
-Perl_yywarn(pTHX_ const char *const s)
+static int
+S_yywarn(pTHX_ const char *const s)
{
dVAR;