#define FAIL(msg) \
STMT_START { \
char *ellipses = ""; \
- unsigned len = strlen(RExC_precomp); \
+ IV len = RExC_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
#define FAIL2(pat,msg) \
STMT_START { \
char *ellipses = ""; \
- unsigned len = strlen(RExC_precomp); \
+ IV len = RExC_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
*/
#define Simple_vFAIL(m) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL2(m,a1) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL3(m, a1, a2) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL4(m, a1, a2, a3) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL5(m, a1, a2, a3, a4) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN(loc,m) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END \
#define vWARNdep(loc,m) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
+ IV offset = loc - RExC_precomp; \
int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
#define vWARN2(loc, m, a1) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
a1, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
#define vWARN3(loc, m, a1, a2) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
a1, a2, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
#define vWARN4(loc, m, a1, a2, a3) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
a1, a2, a3, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
/* used for the parse_flags section for (?c) -- japhy */
#define vWARN5(loc, m, a1, a2, a3, a4) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
+ IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
a1, a2, a3, a4, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
regnode *oscan = scan;
struct regnode_charclass_class this_class;
struct regnode_charclass_class *oclass = NULL;
+ I32 next_is_eval = 0;
switch (PL_regkind[(U8)OP(scan)]) {
case WHILEM: /* End of (?:...)* . */
scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
}
scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
+ next_is_eval = (OP(scan) == EVAL);
do_curly:
if (flags & SCF_DO_SUBSTR) {
if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
if (ckWARN(WARN_REGEXP)
+ /* ? quantifier ok, except for (?{ ... }) */
+ && (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
/* a lovely hack--pretend we saw [\pX] instead */
RExC_end = strchr(RExC_parse, '}');
if (!RExC_end) {
+ U8 c = (U8)*RExC_parse;
RExC_parse += 2;
RExC_end = oldregxend;
- vFAIL("Missing right brace on \\p{}");
+ vFAIL2("Missing right brace on \\%c{}", c);
}
RExC_end++;
}
register char *p;
char *oldp, *s;
STRLEN numlen;
+ STRLEN ulen;
+ U8 tmpbuf[UTF8_MAXLEN*2+1];
parse_start = RExC_parse - 1;
vFAIL("Missing right brace on \\x{}");
}
else {
- numlen = 1; /* allow underscores */
- ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ numlen = e - p - 1;
+ ender = grok_hex(p + 1, &numlen, &flags, NULL);
if (ender > 0xff)
RExC_utf8 = 1;
/* numlen is generous */
}
}
else {
- numlen = 0; /* disallow underscores */
- ender = (UV)scan_hex(p, 2, &numlen);
+ I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+ numlen = 2;
+ ender = grok_hex(p, &numlen, &flags, NULL);
p += numlen;
}
break;
case '5': case '6': case '7': case '8':case '9':
if (*p == '0' ||
(isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
- numlen = 0; /* disallow underscores */
- ender = (UV)scan_oct(p, 3, &numlen);
+ I32 flags = 0;
+ numlen = 3;
+ ender = grok_oct(p, &numlen, &flags, NULL);
p += numlen;
}
else {
/* FALL THROUGH */
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
- vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
+ vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
goto normal_default;
}
break;
if (RExC_flags16 & PMf_EXTENDED)
p = regwhite(p, RExC_end);
if (UTF && FOLD) {
- if (LOC)
- ender = toLOWER_LC_uvchr(ender);
- else
- ender = toLOWER_uni(ender);
+ toLOWER_uni(ender, tmpbuf, &ulen);
+ ender = utf8_to_uvchr(tmpbuf, 0);
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
break;
}
+ if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) {
+ STRLEN oldlen = STR_LEN(ret);
+ SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
+ char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+ STRLEN newlen = SvCUR(sv);
+ if (!SIZE_ONLY) {
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+ oldlen, STRING(ret), newlen, s));
+ Copy(s, STRING(ret), newlen, char);
+ STR_LEN(ret) += newlen - oldlen;
+ RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+ } else
+ RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+ }
+
return(ret);
}
Character classes ([:foo:]) can also be negated ([:^foo:]).
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
- but trigger warnings because they are currently unimplemented. */
+ but trigger failures because they are currently unimplemented. */
+
+#define POSIXCC_DONE(c) ((c) == ':')
+#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
+#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
+
STATIC I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
{
if (value == '[' && RExC_parse + 1 < RExC_end &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
- (*RExC_parse == ':' ||
- *RExC_parse == '=' ||
- *RExC_parse == '.')) {
- char c = *RExC_parse;
+ POSIXCC(UCHARAT(RExC_parse))) {
+ char c = UCHARAT(RExC_parse);
char* s = RExC_parse++;
- while (RExC_parse < RExC_end && *RExC_parse != c)
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
RExC_parse++;
if (RExC_parse == RExC_end)
/* Grandfather lone [:, [=, [. */
else {
char* t = RExC_parse++; /* skip over the c */
- if (*RExC_parse == ']') {
+ if (UCHARAT(RExC_parse) == ']') {
RExC_parse++; /* skip over the ending ] */
posixcc = s + 1;
if (*s == ':') {
/* adjust RExC_parse so the warning shows after
the class closes */
- while (*RExC_parse && *RExC_parse != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
RExC_parse++;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
- (*RExC_parse == ':' ||
- *RExC_parse == '=' ||
- *RExC_parse == '.')) {
+ POSIXCC(UCHARAT(RExC_parse))) {
char *s = RExC_parse;
char c = *s++;
vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
- if (c == '=' || c == '.')
- {
+ if (POSIXCC_NOTYET(c)) {
/* adjust RExC_parse so the error shows after
the class closes */
- while (*RExC_parse && *RExC_parse++ != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
register UV value;
+ register UV nextvalue;
register IV prevvalue = OOB_UNICODE;
register IV range = 0;
register regnode *ret;
if (!SIZE_ONLY)
ANYOF_FLAGS(ret) = 0;
- if (*RExC_parse == '^') { /* Complement of range. */
+ if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
RExC_parse++;
if (!SIZE_ONLY)
listsv = newSVpvn("# comment\n", 10);
}
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+
+ if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
checkposixcc(pRExC_state);
- if (*RExC_parse == ']' || *RExC_parse == '-')
+ if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
goto charclassloop; /* allow 1st char to be ] or - */
- while (RExC_parse < RExC_end && *RExC_parse != ']') {
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
charclassloop:
}
else
value = UCHARAT(RExC_parse++);
- if (value == '[')
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+ if (value == '[' && POSIXCC(nextvalue))
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
if (UTF) {
case 'p':
case 'P':
if (*RExC_parse == '{') {
+ U8 c = (U8)value;
e = strchr(RExC_parse++, '}');
if (!e)
- vFAIL("Missing right brace on \\p{}");
+ vFAIL2("Missing right brace on \\%c{}", c);
+ while (isSPACE(UCHARAT(RExC_parse)))
+ RExC_parse++;
+ if (e == RExC_parse)
+ vFAIL2("Empty \\%c{}", c);
n = e - RExC_parse;
+ while (isSPACE(UCHARAT(RExC_parse + n - 1)))
+ n--;
}
else {
e = RExC_parse;
n = 1;
}
if (!SIZE_ONLY) {
+ if (UCHARAT(RExC_parse) == '^') {
+ RExC_parse++;
+ n--;
+ value = value == 'p' ? 'P' : 'p'; /* toggle */
+ while (isSPACE(UCHARAT(RExC_parse))) {
+ RExC_parse++;
+ n--;
+ }
+ }
if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv,
- "+utf8::%.*s\n", (int)n, RExC_parse);
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", (int)n, RExC_parse);
else
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", (int)n, RExC_parse);
+ Perl_sv_catpvf(aTHX_ listsv,
+ "!utf8::%.*s\n", (int)n, RExC_parse);
}
RExC_parse = e + 1;
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
case 'a': value = ASCII_TO_NATIVE('\007');break;
case 'x':
if (*RExC_parse == '{') {
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
e = strchr(RExC_parse++, '}');
if (!e)
vFAIL("Missing right brace on \\x{}");
- numlen = 1; /* allow underscores */
- value = (UV)scan_hex(RExC_parse,
- e - RExC_parse,
- &numlen);
+
+ numlen = e - RExC_parse;
+ value = grok_hex(RExC_parse, &numlen, &flags, NULL);
RExC_parse = e + 1;
}
else {
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
+ I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+ numlen = 2;
+ value = grok_hex(RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
}
break;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_oct(--RExC_parse, 3, &numlen);
+ {
+ I32 flags = 0;
+ numlen = 3;
+ value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
break;
+ }
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
vWARN2(RExC_parse,
k = PL_regkind[(U8)OP(o)];
- if (k == EXACT)
- Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
- STR_LEN(o), STRING(o), PL_colors[1]);
+ if (k == EXACT) {
+ SV *dsv = sv_2mortal(newSVpvn("", 0));
+ bool do_utf8 = DO_UTF8(sv);
+ char *s = do_utf8 ?
+ pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
+ STRING(o);
+ int len = do_utf8 ?
+ strlen(s) :
+ STR_LEN(o);
+ Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
+ PL_colors[0],
+ len, s,
+ PL_colors[1]);
+ }
else if (k == CURLY) {
if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
if (flags & ANYOF_UNICODE)
sv_catpv(sv, "{unicode}");
else if (flags & ANYOF_UNICODE_ALL)
- sv_catpv(sv, "{all-unicode}");
+ sv_catpv(sv, "{unicode_all}");
{
SV *lv;
SAVEVPTR(PL_reg_re); /* from regexec.c */
SAVEPPTR(PL_reg_ganch); /* from regexec.c */
SAVESPTR(PL_reg_sv); /* from regexec.c */
- SAVEI8(PL_reg_sv_utf8); /* from regexec.c */
+ SAVEI8(PL_reg_match_utf8); /* from regexec.c */
SAVEVPTR(PL_reg_magic); /* from regexec.c */
SAVEI32(PL_reg_oldpos); /* from regexec.c */
SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */