data->whilem_c = data_fake.whilem_c;
}
if (f & SCF_DO_STCLASS_AND) {
- const int was = (data->start_class->flags & ANYOF_EOS);
-
- cl_and(data->start_class, &intrnl);
- if (was)
- data->start_class->flags |= ANYOF_EOS;
+ if (flags & SCF_DO_STCLASS_OR) {
+ /* OR before, AND after: ideally we would recurse with
+ * data_fake to get the AND applied by study of the
+ * remainder of the pattern, and then derecurse;
+ * *** HACK *** for now just treat as "no information".
+ * See [perl #56690].
+ */
+ cl_init(pRExC_state, data->start_class);
+ } else {
+ /* AND before and after: combine and continue */
+ const int was = (data->start_class->flags & ANYOF_EOS);
+
+ cl_and(data->start_class, &intrnl);
+ if (was)
+ data->start_class->flags |= ANYOF_EOS;
+ }
}
}
#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
/* Pick up the branches, linking them together. */
parse_start = RExC_parse; /* MJD */
br = regbranch(pRExC_state, &flags, 1,depth+1);
+
+ if (freeze_paren) {
+ if (RExC_npar > after_freeze)
+ after_freeze = RExC_npar;
+ RExC_npar = freeze_paren;
+ }
+
/* branch_len = (paren != 0); */
if (br == NULL)
/* reg_namedseq(pRExC_state,UVp)
This is expected to be called by a parser routine that has
- recognized'\N' and needs to handle the rest. RExC_parse is
+ recognized '\N' and needs to handle the rest. RExC_parse is
expected to point at the first char following the N at the time
of the call.
be returned to indicate failure. (This will NOT be a valid pointer
to a regnode.)
- If value is null then it is assumed that we are parsing normal text
+ If valuep is null then it is assumed that we are parsing normal text
and inserts a new EXACT node into the program containing the resolved
string and returns a pointer to the new node. If the string is
zerolength a NOTHING node is emitted.
-
+
On success RExC_parse is set to the char following the endbrace.
Parsing failures will generate a fatal errorvia vFAIL(...)
*/
STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
{
char * name; /* start of the content of the name */
char * endbrace; /* endbrace following the name */
PERL_ARGS_ASSERT_REG_NAMEDSEQ;
- if (*RExC_parse != '{') {
- vFAIL("Missing braces on \\N{}");
+ if (*RExC_parse != '{' ||
+ (*RExC_parse == '{' && RExC_parse[1]
+ && strchr("0123456789", RExC_parse[1])))
+ {
+ GET_RE_DEBUG_FLAGS_DECL;
+ if (valuep)
+ /* no bare \N in a charclass */
+ vFAIL("Missing braces on \\N{}");
+ GET_RE_DEBUG_FLAGS;
+ nextchar(pRExC_state);
+ ret = reg_node(pRExC_state, REG_ANY);
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
+ RExC_parse--;
+ Set_Node_Length(ret, 1); /* MJD */
+ return ret;
}
name = RExC_parse+1;
endbrace = strchr(RExC_parse, '}');
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
UV cp;
- char string;
len = (STRLEN)(endbrace - name - 2);
cp = grok_hex(name + 2, &len, &fl, NULL);
if ( len != (STRLEN)(endbrace - name - 2) ) {
cp = 0xFFFD;
}
- if (cp > 0xff)
- RExC_utf8 = 1;
if ( valuep ) {
+ if (cp > 0xff) RExC_utf8 = 1;
*valuep = cp;
return NULL;
}
- string = (char)cp;
- sv_str= newSVpvn(&string, 1);
+
+ /* Need to convert to utf8 if either: won't fit into a byte, or the re
+ * is going to be in utf8 and the representation changes under utf8. */
+ if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
+ U8 string[UTF8_MAXBYTES+1];
+ U8 *tmps;
+ RExC_utf8 = 1;
+ tmps = uvuni_to_utf8(string, cp);
+ sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
+ } else { /* Otherwise, no need for utf8, can skip that step */
+ char string;
+ string = (char)cp;
+ sv_str= newSVpvn(&string, 1);
+ }
} else {
/* fetch the charnames handler for this scope */
HV * const table = GvHV(PL_hintgv);
Set_Node_Cur_Length(ret); /* MJD */
RExC_parse--;
nextchar(pRExC_state);
- } else {
+ } else { /* zero length */
ret = reg_node(pRExC_state,NOTHING);
}
if (!cached) {
}
break;
case 'N':
- /* Handle \N{NAME} here and not below because it can be
+ /* Handle \N and \N{NAME} here and not below because it can be
multicharacter. join_exact() will join them up later on.
Also this makes sure that things like /\N{BLAH}+/ and
\N{BLAH} being multi char Just Happen. dmq*/
++RExC_parse;
- ret= reg_namedseq(pRExC_state, NULL);
+ ret= reg_namedseq(pRExC_state, NULL, flagp);
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
I32 flags = 0;
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
+
+ /* An octal above 0xff is interpreted differently
+ * depending on if the re is in utf8 or not. If it
+ * is in utf8, the value will be itself, otherwise
+ * it is interpreted as modulo 0x100. It has been
+ * decided to discourage the use of octal above the
+ * single-byte range. For now, warn only when
+ * it ends up modulo */
+ if (SIZE_ONLY && ender >= 0x100
+ && ! UTF && ! PL_encoding
+ && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) {
+ vWARNdep(p, "Use of octal value above 377 is deprecated");
+ }
p += numlen;
}
else {
what = WORD; \
break
+/*
+ We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+ so that it is possible to override the option here without having to
+ rebuild the entire core. as we are required to do if we change regcomp.h
+ which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or if the pattern matches a single char only and
from earlier versions, OTOH that behaviour was broken
as well. */
UV v; /* value is register so we cant & it /grrr */
- if (reg_namedseq(pRExC_state, &v)) {
+ if (reg_namedseq(pRExC_state, &v, NULL)) {
goto parseit;
}
value= v;
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
+
+ case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+ case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+ case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+ case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+ case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+ case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+ case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+ case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+ case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+ case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
case _C_C_T_(ALNUM, isALNUM(value), "Word");
- case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
- case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
- case _C_C_T_(BLANK, isBLANK(value), "Blank");
- case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
- case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
- case _C_C_T_(LOWER, isLOWER(value), "Lower");
- case _C_C_T_(PRINT, isPRINT(value), "Print");
- case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
- case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
- case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+ case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+ case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif
case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
ANYOF_BITMAP_SET(ret, value);
}
yesno = '+';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_NDIGIT:
if (LOC)
ANYOF_BITMAP_SET(ret, value);
}
yesno = '!';
- what = "Digit";
+ what = POSIX_CC_UNI_NAME("Digit");
break;
case ANYOF_MAX:
/* this is to handle \p and \P */
/*
- regprop - printable representation of opcode
*/
+#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
+STMT_START { \
+ if (do_sep) { \
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
+ if (flags & ANYOF_INVERT) \
+ /*make sure the invert info is in each */ \
+ sv_catpvs(sv, "^"); \
+ do_sep = 0; \
+ } \
+} STMT_END
+
void
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
{
else if (k == ANYOF) {
int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
+ int do_sep = 0;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
static const char * const anyofs[] = {
"[:^alpha:]",
"[:ascii:]",
"[:^ascii:]",
- "[:ctrl:]",
- "[:^ctrl:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
"[:graph:]",
"[:^graph:]",
"[:lower:]",
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
+
+ /* output what the standard cp 0-255 bitmap matches */
for (i = 0; i <= 256; i++) {
if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
if (rangestart == -1)
sv_catpvs(sv, "-");
put_byte(sv, i - 1);
}
+ do_sep = 1;
rangestart = -1;
}
}
-
+
+ EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+ /* output any special charclass tests (used mostly under use locale) */
if (o->flags & ANYOF_CLASS)
for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
- if (ANYOF_CLASS_TEST(o,i))
+ if (ANYOF_CLASS_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
-
+ do_sep = 1;
+ }
+
+ EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+
+ /* output information about the unicode matching */
if (flags & ANYOF_UNICODE)
sv_catpvs(sv, "{unicode}");
else if (flags & ANYOF_UNICODE_ALL)
if (lv) {
if (sw) {
U8 s[UTF8_MAXBYTES_CASE+1];
-
+
for (i = 0; i <= 256; i++) { /* just the first 256 */
uvchr_to_utf8(s, i);
if (r->saved_copy)
SvREFCNT_dec(r->saved_copy);
#endif
- Safefree(r->swap);
Safefree(r->offs);
}
a case of zero-ing that, rather than copying the current length. */
SvPV_set(ret_x, RX_WRAPPED(rx));
SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
- StructCopy(&(r->xpv_cur), &(ret->xpv_cur), struct regexp_allocated);
+ memcpy(&(ret->xpv_cur), &(r->xpv_cur),
+ sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
SvLEN_set(ret_x, 0);
Newx(ret->offs, npar, regexp_paren_pair);
Copy(r->offs, ret->offs, npar, regexp_paren_pair);
ret->saved_copy = NULL;
#endif
ret->mother_re = rx;
- ret->swap = NULL;
return ret_x;
}