/* *These* symbols are masked to allow static link. */
# define Perl_regnext my_regnext
# define Perl_save_re_context my_save_re_context
-# define Perl_reginitcolors my_reginitcolors
+# define Perl_reginitcolors my_reginitcolors
# define PERL_NO_GET_CONTEXT
-#endif
+#endif
/*SUPPRESS 112*/
/*
*
**** Alterations to Henry's code are...
****
- **** Copyright (c) 1991-2000, Larry Wall
+ **** Copyright (c) 1991-2001, Larry Wall
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
I32 whilem_seen; /* number of WHILEM in this expr */
- regnode *emit; /* Code-emit pointer; ®dummy = don't */
+ regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
I32 extralen;
I32 seen_zerolen;
I32 seen_evals;
+ I32 utf8;
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_extralen (pRExC_state->extralen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
+#define RExC_utf8 (pRExC_state->utf8)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
((*s) == '{' && regcurly(s)))
-#ifdef atarist
-#define PERL_META "^$.[()|?+*\\"
-#else
-#define META "^$.[()|?+*\\"
-#endif
#ifdef SPSTART
#undef SPSTART /* dratted cpp namespace... */
* Forward declarations for pregcomp()'s friends.
*/
-static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0};
#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
#define SCF_WHILEM_VISITED_POS 0x2000
-#define RF_utf8 8
-#define UTF (PL_reg_flags & RF_utf8)
+#define UTF RExC_utf8
#define LOC (RExC_flags16 & PMf_LOCALE)
#define FOLD (RExC_flags16 & PMf_FOLD)
-#define OOB_CHAR8 1234
-#define OOB_UTF8 123456
+#define OOB_UNICODE 12345678
#define OOB_NAMEDCLASS -1
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
*/
#define MARKER1 "HERE" /* marker as it appears in the description */
#define MARKER2 " << HERE " /* marker as it appears within the regex */
-
+
#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
/*
STATIC void
S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
{
- dTHR;
STRLEN l = CHR_SVLEN(data->last_found);
STRLEN old_l = CHR_SVLEN(*data->longest);
-
+
if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
sv_setsv(*data->longest, data->last_found);
if (*data->longest == data->longest_fixed) {
data->offset_fixed = l ? data->last_start_min : data->pos_min;
if (data->flags & SF_BEFORE_EOL)
- data->flags
+ data->flags
|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
else
data->flags &= ~SF_FIX_BEFORE_EOL;
}
else {
data->offset_float_min = l ? data->last_start_min : data->pos_min;
- data->offset_float_max = (l
- ? data->last_start_max
+ data->offset_float_max = (l
+ ? data->last_start_max
: data->pos_min + data->pos_delta);
if (data->flags & SF_BEFORE_EOL)
- data->flags
+ data->flags
|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
else
data->flags &= ~SF_FL_BEFORE_EOL;
ANYOF_CLASS_ZERO(cl);
for (value = 0; value < 256; ++value)
ANYOF_BITMAP_SET(cl, value);
- cl->flags = ANYOF_EOS;
+ cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
}
for (value = 0; value <= ANYOF_MAX; value += 2)
if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
return 1;
+ if (!(cl->flags & ANYOF_UNICODE_ALL))
+ return 0;
for (value = 0; value < 256; ++value)
if (!ANYOF_BITMAP_TEST(cl, value))
return 0;
} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
if (!(and_with->flags & ANYOF_EOS))
cl->flags &= ~ANYOF_EOS;
+
+ if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ cl->flags |= ANYOF_UNICODE;
+ ARG_SET(cl, ARG(and_with));
+ }
+ if (!(and_with->flags & ANYOF_UNICODE_ALL))
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
+ cl->flags &= ~ANYOF_UNICODE;
}
/* 'OR' a given class with another one. Can create false positives */
} else {
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && (!(or_with->flags & ANYOF_FOLD)
+ && (!(or_with->flags & ANYOF_FOLD)
|| (cl->flags & ANYOF_FOLD)) ) {
int i;
}
if (or_with->flags & ANYOF_EOS)
cl->flags |= ANYOF_EOS;
+
+ if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
+ ARG(cl) != ARG(or_with)) {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ cl->flags &= ~ANYOF_UNICODE;
+ }
+ if (or_with->flags & ANYOF_UNICODE_ALL) {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ cl->flags &= ~ANYOF_UNICODE;
+ }
}
/* REx optimizer. Converts nodes into quickier variants "in place".
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
{
- dTHR;
I32 min = 0, pars = 0, code;
regnode *scan = *scanp, *next;
I32 delta = 0;
I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
scan_data_t data_fake;
struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
-
+
while (scan && OP(scan) != END && scan < last) {
/* Peephole optimizer: */
U32 stringok = 1;
#ifdef DEBUGGING
regnode *stop = scan;
-#endif
+#endif
next = scan + NODE_SZ_STR(scan);
/* Skip NOTHING, merge EXACT*. */
while (n &&
- ( PL_regkind[(U8)OP(n)] == NOTHING ||
+ ( PL_regkind[(U8)OP(n)] == NOTHING ||
(stringok && (OP(n) == OP(scan))))
&& NEXT_OFF(n)
&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
#ifdef DEBUGGING
if (stringok)
stop = n;
-#endif
+#endif
n = regnext(n);
}
- else {
+ else if (stringok) {
int oldl = STR_LEN(scan);
regnode *nnext = regnext(n);
-
- if (oldl + STR_LEN(n) > U8_MAX)
+
+ if (oldl + STR_LEN(n) > U8_MAX)
break;
NEXT_OFF(scan) += NEXT_OFF(n);
STR_LEN(scan) += STR_LEN(n);
next = n + NODE_SZ_STR(n);
/* Now we can overwrite *n : */
- Move(STRING(n), STRING(scan) + oldl,
- STR_LEN(n), char);
+ Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
#ifdef DEBUGGING
- if (stringok)
- stop = next - 1;
-#endif
+ stop = next - 1;
+#endif
n = nnext;
}
}
int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
int noff;
regnode *n = scan;
-
+
/* Skip NOTHING and LONGJMP. */
while ((n = regnext(n))
&& ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
off += noff;
if (reg_off_by_arg[OP(scan)])
ARG(scan) = off;
- else
+ else
NEXT_OFF(scan) = off;
}
/* The principal pseudo-switch. Cannot be a switch, since we
look into several different things. */
- if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
+ if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
|| OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
next = regnext(scan);
code = OP(scan);
-
- if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
+
+ if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
I32 max1 = 0, min1 = I32_MAX, num = 0;
struct regnode_charclass_class accum;
num++;
data_fake.flags = 0;
- if (data) {
+ if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
}
cl_init(pRExC_state, &this_class);
data_fake.start_class = &this_class;
f = SCF_DO_STCLASS_AND;
- }
+ }
if (flags & SCF_WHILEM_VISITED_POS)
f |= SCF_WHILEM_VISITED_POS;
/* we suppose the run is continuous, last=next...*/
minnext = study_chunk(pRExC_state, &scan, &deltanext,
next, &data_fake, f);
- if (min1 > minnext)
+ if (min1 > minnext)
min1 = minnext;
if (max1 < minnext + deltanext)
max1 = minnext + deltanext;
data->whilem_c = data_fake.whilem_c;
if (flags & SCF_DO_STCLASS)
cl_or(pRExC_state, &accum, &this_class);
- if (code == SUSPEND)
+ if (code == SUSPEND)
break;
}
if (code == IFTHEN && num < 2) /* Empty ELSE branch */
flags &= ~SCF_DO_STCLASS;
}
else {
- /* Switch to OR mode: cache the old value of
+ /* Switch to OR mode: cache the old value of
* data->start_class */
StructCopy(data->start_class, &and_with,
struct regnode_charclass_class);
}
else if (OP(scan) == EXACT) {
I32 l = STR_LEN(scan);
+ UV uc = *((U8*)STRING(scan));
if (UTF) {
- unsigned char *s = (unsigned char *)STRING(scan);
- unsigned char *e = s + l;
- I32 newl = 0;
- while (s < e) {
- newl++;
- s += UTF8SKIP(s);
- }
- l = newl;
+ U8 *s = (U8*)STRING(scan);
+ l = utf8_length(s, s + l);
+ uc = utf8_to_uvchr(s, NULL);
}
min += l;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
if (data->last_end == -1) { /* Update the start info. */
data->last_start_min = data->pos_min;
data->last_start_max = is_inf
- ? I32_MAX : data->pos_min + data->pos_delta;
+ ? I32_MAX : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
data->last_end = data->pos_min + l;
/* Check whether it is compatible with what we know already! */
int compat = 1;
- if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
+ if (uc >= 0x100 ||
+ !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
&& (!(data->start_class->flags & ANYOF_FOLD)
- || !ANYOF_BITMAP_TEST(data->start_class,
- PL_fold[*(U8*)STRING(scan)])))
+ || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat)
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
+ if (uc < 0x100)
+ data->start_class->flags &= ~ANYOF_UNICODE_ALL;
}
else if (flags & SCF_DO_STCLASS_OR) {
/* false positive possible if the class is case-folded */
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ if (uc < 0x100)
+ ANYOF_BITMAP_SET(data->start_class, uc);
+ else
+ data->start_class->flags |= ANYOF_UNICODE_ALL;
data->start_class->flags &= ~ANYOF_EOS;
cl_and(data->start_class, &and_with);
}
}
else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
I32 l = STR_LEN(scan);
+ UV uc = *((U8*)STRING(scan));
/* Search for fixed substrings supports EXACT only. */
- if (flags & SCF_DO_SUBSTR)
+ if (flags & SCF_DO_SUBSTR)
scan_commit(pRExC_state, data);
if (UTF) {
- unsigned char *s = (unsigned char *)STRING(scan);
- unsigned char *e = s + l;
- I32 newl = 0;
- while (s < e) {
- newl++;
- s += UTF8SKIP(s);
- }
- l = newl;
+ U8 *s = (U8 *)STRING(scan);
+ l = utf8_length(s, s + l);
+ uc = utf8_to_uvchr(s, NULL);
}
min += l;
if (data && (flags & SCF_DO_SUBSTR))
/* Check whether it is compatible with what we know already! */
int compat = 1;
- if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
- && !ANYOF_BITMAP_TEST(data->start_class,
- PL_fold[*(U8*)STRING(scan)]))
+ if (uc >= 0x100 ||
+ !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
+ && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat) {
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
data->start_class->flags |= ANYOF_FOLD;
if (OP(scan) == EXACTFL)
if (data->start_class->flags & ANYOF_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ if (uc < 0x100)
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
}
cl_and(data->start_class, &and_with);
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
- mincount = 1;
- maxcount = REG_INFTY;
+ mincount = 1;
+ maxcount = REG_INFTY;
next = regnext(scan);
scan = NEXTOPER(scan);
goto do_curly;
case STAR:
if (flags & SCF_DO_STCLASS) {
mincount = 0;
- maxcount = REG_INFTY;
+ maxcount = REG_INFTY;
next = regnext(scan);
scan = NEXTOPER(scan);
goto do_curly;
}
- is_inf = is_inf_internal = 1;
+ is_inf = is_inf_internal = 1;
scan = regnext(scan);
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
}
goto optimize_curly_tail;
case CURLY:
- mincount = ARG1(scan);
+ mincount = ARG1(scan);
maxcount = ARG2(scan);
next = regnext(scan);
if (OP(scan) == CURLYX) {
/* These are the cases when once a subexpression
fails at a particular position, it cannot succeed
even after backtracking at the enclosing scope.
-
+
XXXX what if minimal match and we are at the
initial run of {n,m}? */
if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
- minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
- mincount == 0
+ minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
+ mincount == 0
? (f & ~SCF_DO_SUBSTR) : f);
if (flags & SCF_DO_STCLASS)
cl_or(pRExC_state, data->start_class, &this_class);
}
else if (flags & SCF_DO_STCLASS_AND) {
- /* Switch to OR mode: cache the old value of
+ /* Switch to OR mode: cache the old value of
* data->start_class */
StructCopy(data->start_class, &and_with,
struct regnode_charclass_class);
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
+ if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
&& !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
{
}
min += minnext * mincount;
- is_inf_internal |= ((maxcount == REG_INFTY
+ is_inf_internal |= ((maxcount == REG_INFTY
&& (minnext + deltanext) > 0)
|| deltanext == I32_MAX);
is_inf |= is_inf_internal;
delta += (minnext + deltanext) * maxcount - minnext * mincount;
/* Try powerful optimization CURLYX => CURLYN. */
- if ( OP(oscan) == CURLYX && data
+ if ( OP(oscan) == CURLYX && data
&& data->flags & SF_IN_PAR
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext && minnext == 1 ) {
nxt = regnext(nxt);
if (!strchr((char*)PL_simple,OP(nxt))
&& !(PL_regkind[(U8)OP(nxt)] == EXACT
- && STR_LEN(nxt) == 1))
+ && STR_LEN(nxt) == 1))
goto nogo;
nxt2 = nxt;
nxt = regnext(nxt);
- if (OP(nxt) != CLOSE)
+ if (OP(nxt) != CLOSE)
goto nogo;
/* Now we know that nxt2 is the only contents: */
oscan->flags = ARG(nxt);
OP(nxt) = OPTIMIZED; /* was CLOSE. */
OP(nxt + 1) = OPTIMIZED; /* was count. */
NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
-#endif
+#endif
}
nogo:
/* Try optimization CURLYX => CURLYM. */
- if ( OP(oscan) == CURLYX && data
+ if ( OP(oscan) == CURLYX && data
&& !(data->flags & SF_HAS_PAR)
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext ) {
OP(oscan) = CURLYM;
while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
- && (OP(nxt2) != WHILEM))
+ && (OP(nxt2) != WHILEM))
nxt = nxt2;
OP(nxt2) = SUCCEED; /* Whas WHILEM */
/* Need to optimize away parenths. */
/* Set the parenth number. */
regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
- if (OP(nxt) != CLOSE)
+ if (OP(nxt) != CLOSE)
FAIL("Panic opt close");
oscan->flags = ARG(nxt);
OP(nxt1) = OPTIMIZED; /* was OPEN. */
OP(nxt + 1) = OPTIMIZED; /* was count. */
NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
-#endif
+#endif
#if 0
while ( nxt1 && (OP(nxt1) != WHILEM)) {
regnode *nnxt = regnext(nxt1);
-
+
if (nnxt == nxt) {
if (reg_off_by_arg[OP(nxt1)])
ARG_SET(nxt1, nxt2 - nxt1);
}
#endif
/* Optimize again: */
- study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
+ study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
NULL, 0);
}
else
PREVOPER(nxt)->flags = data->whilem_c
| (RExC_whilem_seen << 4); /* On WHILEM */
}
- if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
+ if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
pars++;
if (flags & SCF_DO_SUBSTR) {
SV *last_str = Nullsv;
int counted = mincount != 0;
if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
- I32 b = pos_before >= data->last_start_min
+ I32 b = pos_before >= data->last_start_min
? pos_before : data->last_start_min;
STRLEN l;
char *s = SvPV(data->last_found, l);
/* What was added is a constant string */
if (mincount > 1) {
SvGROW(last_str, (mincount * l) + 1);
- repeatcpy(SvPVX(last_str) + l,
+ repeatcpy(SvPVX(last_str) + l,
SvPVX(last_str), l, mincount - 1);
SvCUR(last_str) *= mincount;
/* Add additional parts. */
- SvCUR_set(data->last_found,
+ SvCUR_set(data->last_found,
SvCUR(data->last_found) - l);
sv_catsv(data->last_found, last_str);
data->last_end += l * (mincount - 1);
if (mincount && last_str) {
sv_setsv(data->last_found, last_str);
data->last_end = data->pos_min;
- data->last_start_min =
+ data->last_start_min =
data->pos_min - CHR_SVLEN(last_str);
- data->last_start_max = is_inf
- ? I32_MAX
+ data->last_start_max = is_inf
+ ? I32_MAX
: data->pos_min + data->pos_delta
- CHR_SVLEN(last_str);
}
break;
}
}
- else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) {
+ else if (strchr((char*)PL_simple,OP(scan))) {
int value;
if (flags & SCF_DO_SUBSTR) {
/* Some of the logic below assumes that switching
locale on will only add false positives. */
switch (PL_regkind[(U8)OP(scan)]) {
- case ANYUTF8:
case SANY:
- case SANYUTF8:
- case ALNUMUTF8:
- case ANYOFUTF8:
- case ALNUMLUTF8:
- case NALNUMUTF8:
- case NALNUMLUTF8:
- case SPACEUTF8:
- case NSPACEUTF8:
- case SPACELUTF8:
- case NSPACELUTF8:
- case DIGITUTF8:
- case NDIGITUTF8:
default:
do_default:
/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
else {
for (value = 0; value < 256; value++)
if (isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
else {
for (value = 0; value < 256; value++)
if (!isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
else {
for (value = 0; value < 256; value++)
if (isSPACE(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
else {
for (value = 0; value < 256; value++)
if (!isSPACE(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
else {
for (value = 0; value < 256; value++)
if (isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
else {
for (value = 0; value < 256; value++)
if (!isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
int f = 0;
data_fake.flags = 0;
- if (data) {
+ if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
}
finish:
*scanp = scan;
*deltap = is_inf_internal ? I32_MAX : delta;
- if (flags & SCF_DO_SUBSTR && is_inf)
+ if (flags & SCF_DO_SUBSTR && is_inf)
data->pos_delta = I32_MAX - data->pos_min;
if (is_par > U8_MAX)
is_par = 0;
STATIC I32
S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
{
- dTHR;
if (RExC_rx->data) {
- Renewc(RExC_rx->data,
- sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
+ Renewc(RExC_rx->data,
+ sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
char, struct reg_data);
Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
RExC_rx->data->count += n;
void
Perl_reginitcolors(pTHX)
{
- dTHR;
int i = 0;
char *s = PerlEnv_getenv("PERL_RE_COLORS");
-
+
if (s) {
PL_colors[0] = s = savepv(s);
while (++i < 6) {
PL_colors[i] = s = "";
}
} else {
- while (i < 6)
+ while (i < 6)
PL_colors[i++] = "";
}
PL_colorset = 1;
regexp *
Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
{
- dTHR;
register regexp *r;
regnode *scan;
regnode *first;
FAIL("NULL regexp argument");
/* XXXX This looks very suspicious... */
- if (pm->op_pmdynflags & PMdf_UTF8) {
- PL_reg_flags |= RF_utf8;
- }
+ if (pm->op_pmdynflags & PMdf_CMP_UTF8)
+ RExC_utf8 = 1;
else
- PL_reg_flags = 0;
+ RExC_utf8 = 0;
RExC_precomp = savepvn(exp, xend - exp);
DEBUG_r(if (!PL_colorset) reginitcolors());
/* Starting-point info. */
again:
if (PL_regkind[(U8)OP(first)] == EXACT) {
- if (OP(first) == EXACT); /* Empty, get anchored substr later. */
- else if ((OP(first) == EXACTF || OP(first) == EXACTFL)
- && !UTF)
+ if (OP(first) == EXACT)
+ ; /* Empty, get anchored substr later. */
+ else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
r->regstclass = first;
}
else if (strchr((char*)PL_simple,OP(first)))
/* turn .* into ^.* with an implied $*=1 */
int type = OP(NEXTOPER(first));
- if (type == REG_ANY || type == ANYUTF8)
+ if (type == REG_ANY)
type = ROPT_ANCH_MBOL;
else
type = ROPT_ANCH_SBOL;
first = NEXTOPER(first);
goto again;
}
- if (sawplus && (!sawopen || !RExC_sawback)
+ if (sawplus && (!sawopen || !RExC_sawback)
&& !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
r->reganch |= ROPT_SKIP;
/* Scan is after the zeroth branch, first is atomic matcher. */
- DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
(IV)(first - scan + 1)));
/*
* If there's something expensive in the r.e., find the
minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
&data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
- && data.last_start_min == 0 && data.last_end > 0
+ && data.last_start_min == 0 && data.last_end > 0
&& !RExC_seen_zerolen
&& (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
r->reganch |= ROPT_CHECK_ALL;
SvREFCNT_dec(data.longest_fixed);
longest_fixed_length = 0;
}
- if (r->regstclass
- && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8
- || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY))
+ if (r->regstclass
+ && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
r->regstclass = NULL;
if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
SV *sv;
I32 n = add_data(pRExC_state, 1, "f");
- New(1006, RExC_rx->data->data[n], 1,
+ New(1006, RExC_rx->data->data[n], 1,
struct regnode_charclass_class);
StructCopy(data.start_class,
(struct regnode_charclass_class*)RExC_rx->data->data[n],
struct regnode_charclass_class);
r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
+ PL_regdata = r->data; /* for regprop() */
DEBUG_r((sv = sv_newmortal(),
regprop(sv, (regnode*)data.start_class),
PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
SV *sv;
I32 n = add_data(pRExC_state, 1, "f");
- New(1006, RExC_rx->data->data[n], 1,
+ New(1006, RExC_rx->data->data[n], 1,
struct regnode_charclass_class);
StructCopy(data.start_class,
(struct regnode_charclass_class*)RExC_rx->data->data[n],
}
r->minlen = minlen;
- if (RExC_seen & REG_SEEN_GPOS)
+ if (RExC_seen & REG_SEEN_GPOS)
r->reganch |= ROPT_GPOS_SEEN;
if (RExC_seen & REG_SEEN_LOOKBEHIND)
r->reganch |= ROPT_LOOKBEHIND_SEEN;
if (RExC_seen & REG_SEEN_EVAL)
r->reganch |= ROPT_EVAL_SEEN;
+ if (RExC_seen & REG_SEEN_SANY)
+ r->reganch |= ROPT_SANY_SEEN;
Newz(1002, r->startp, RExC_npar, I32);
Newz(1002, r->endp, RExC_npar, I32);
+ PL_regdata = r->data; /* for regprop() */
DEBUG_r(regdump(r));
return(r);
}
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
{
- dTHR;
register regnode *ret; /* Will be the head of the group. */
register regnode *br;
register regnode *lastbr;
switch (paren) {
case '<':
RExC_seen |= REG_SEEN_LOOKBEHIND;
- if (*RExC_parse == '!')
+ if (*RExC_parse == '!')
paren = ',';
- if (*RExC_parse != '=' && *RExC_parse != '!')
+ if (*RExC_parse != '=' && *RExC_parse != '!')
goto unknown;
RExC_parse++;
case '=':
/* FALL THROUGH */
case '{':
{
- dTHR;
I32 count = 1, n = 0;
char c;
char *s = RExC_parse;
while (count && (c = *RExC_parse)) {
if (c == '\\' && RExC_parse[1])
RExC_parse++;
- else if (c == '{')
+ else if (c == '{')
count++;
- else if (c == '}')
+ else if (c == '}')
count--;
RExC_parse++;
}
if (*RExC_parse != ')')
{
- RExC_parse = s;
+ RExC_parse = s;
vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
}
if (!SIZE_ONLY) {
AV *av;
-
- if (RExC_parse - 1 - s)
+
+ if (RExC_parse - 1 - s)
sv = newSVpvn(s, RExC_parse - 1 - s);
else
sv = newSVpvn("", 0);
case '(':
{
if (RExC_parse[0] == '?') {
- if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
- || RExC_parse[1] == '<'
+ if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
+ || RExC_parse[1] == '<'
|| RExC_parse[1] == '{') { /* Lookahead or eval. */
I32 flag;
ret->flags = 1;
regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
goto insert_if;
- }
+ }
}
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
parno = atoi(RExC_parse++);
STATIC regnode *
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
{
- dTHR;
register regnode *ret;
register regnode *chain = NULL;
register regnode *latest;
I32 flags = 0, c = 0;
- if (first)
+ if (first)
ret = NULL;
else {
- if (!SIZE_ONLY && RExC_extralen)
+ if (!SIZE_ONLY && RExC_extralen)
ret = reganode(pRExC_state, BRANCHJ,0);
else
ret = reg_node(pRExC_state, BRANCH);
}
- if (!first && SIZE_ONLY)
+ if (!first && SIZE_ONLY)
RExC_extralen += 1; /* BRANCHJ */
-
+
*flagp = WORST; /* Tentatively. */
RExC_parse--;
STATIC regnode *
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
{
- dTHR;
register regnode *ret;
register char op;
register char *next;
if (!(flags&HASWIDTH) && op != '?')
vFAIL("Regexp *+ operand could be empty");
-#endif
+#endif
nextchar(pRExC_state);
STATIC regnode *
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
{
- dTHR;
register regnode *ret = 0;
I32 flags;
break;
case '$':
nextchar(pRExC_state);
- if (*RExC_parse)
+ if (*RExC_parse)
RExC_seen_zerolen++;
if (RExC_flags16 & PMf_MULTILINE)
ret = reg_node(pRExC_state, MEOL);
break;
case '.':
nextchar(pRExC_state);
- if (UTF) {
- if (RExC_flags16 & PMf_SINGLELINE)
- ret = reg_node(pRExC_state, SANYUTF8);
- else
- ret = reg_node(pRExC_state, ANYUTF8);
- *flagp |= HASWIDTH;
- }
- else {
- if (RExC_flags16 & PMf_SINGLELINE)
- ret = reg_node(pRExC_state, SANY);
- else
- ret = reg_node(pRExC_state, REG_ANY);
- *flagp |= HASWIDTH|SIMPLE;
- }
+ if (RExC_flags16 & PMf_SINGLELINE)
+ ret = reg_node(pRExC_state, SANY);
+ else
+ ret = reg_node(pRExC_state, REG_ANY);
+ *flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
break;
case '[':
{
char *oregcomp_parse = ++RExC_parse;
- ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state));
+ ret = regclass(pRExC_state);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
break;
case 'C':
ret = reg_node(pRExC_state, SANY);
+ RExC_seen |= REG_SEEN_SANY;
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
break;
ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_mark)
- is_utf8_mark((U8*)"~"); /* preload table */
break;
case 'w':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
- : (LOC ? ALNUML : ALNUM));
+ ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'W':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
- : (LOC ? NALNUML : NALNUM));
+ ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
- : (LOC ? BOUNDL : BOUND));
+ ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
- : (LOC ? NBOUNDL : NBOUND));
+ ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 's':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? SPACELUTF8 : SPACEUTF8)
- : (LOC ? SPACEL : SPACE));
+ ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_space)
- is_utf8_space((U8*)" "); /* preload table */
break;
case 'S':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
- : (LOC ? NSPACEL : NSPACE));
+ ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_space)
- is_utf8_space((U8*)" "); /* preload table */
break;
case 'd':
- ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT);
+ ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_digit)
- is_utf8_digit((U8*)"1"); /* preload table */
break;
case 'D':
- ret = reg_node(pRExC_state, UTF ? NDIGITUTF8 : NDIGIT);
+ ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_digit)
- is_utf8_digit((U8*)"1"); /* preload table */
break;
case 'p':
case 'P':
RExC_end = RExC_parse + 2;
RExC_parse--;
- ret = regclassutf8(pRExC_state);
+ ret = regclass(pRExC_state);
RExC_end = oldregxend;
RExC_parse--;
case 'x':
if (*++p == '{') {
char* e = strchr(p, '}');
-
+
if (!e) {
RExC_parse = p + 1;
vFAIL("Missing right brace on \\x{}");
}
- else if (UTF) {
+ else {
numlen = 1; /* allow underscores */
ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
+ if (ender > 0xff)
+ RExC_utf8 = 1;
/* numlen is generous */
if (numlen + len >= 127) {
p--;
}
p = e + 1;
}
- else
- {
- RExC_parse = e + 1;
- vFAIL("Can't use \\x{} without 'use utf8' declaration");
- }
-
}
else {
numlen = 0; /* disallow underscores */
break;
default:
normal_default:
- if ((*p & 0xc0) == 0xc0 && UTF) {
- ender = utf8_to_uv((U8*)p, RExC_end - p,
+ if (UTF8_IS_START(*p) && UTF) {
+ ender = utf8n_to_uvuni((U8*)p, RExC_end - p,
&numlen, 0);
p += numlen;
}
p = regwhite(p, RExC_end);
if (UTF && FOLD) {
if (LOC)
- ender = toLOWER_LC_uni(ender);
+ ender = toLOWER_LC_uvchr(UNI_TO_NATIVE(ender));
else
ender = toLOWER_uni(ender);
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
+ /* ender is a Unicode value so it can be > 0xff --
+ * in other words, do not use UTF8_IS_CONTINUED(). */
else if (ender >= 0x80 && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
}
break;
}
+ /* ender is a Unicode value so it can be > 0xff --
+ * in other words, do not use UTF8_IS_CONTINUED(). */
if (ender >= 0x80 && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
STATIC I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
{
- dTHR;
char *posixcc = 0;
I32 namedclass = OOB_NAMEDCLASS;
*RExC_parse == '.')) {
char c = *RExC_parse;
char* s = RExC_parse++;
-
+
while (RExC_parse < RExC_end && *RExC_parse != c)
RExC_parse++;
if (RExC_parse == RExC_end)
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
- dTHR;
- register U32 value;
- register I32 lastvalue = OOB_CHAR8;
- register I32 range = 0;
+ register UV value;
+ register IV lastvalue = OOB_UNICODE;
+ register IV range = 0;
register regnode *ret;
STRLEN numlen;
- I32 namedclass;
+ IV namedclass;
char *rangebegin;
bool need_class = 0;
+ SV *listsv;
+ register char *e;
+ UV n;
+ bool dont_optimize_invert = FALSE;
+
+ ret = reganode(pRExC_state, ANYOF, 0);
+
+ if (!SIZE_ONLY)
+ ANYOF_FLAGS(ret) = 0;
+
+ if (*RExC_parse == '^') { /* Complement of range. */
+ RExC_naughty++;
+ RExC_parse++;
+ if (!SIZE_ONLY)
+ ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ }
- ret = reg_node(pRExC_state, ANYOF);
if (SIZE_ONLY)
RExC_size += ANYOF_SKIP;
else {
- ret->flags = 0;
- ANYOF_BITMAP_ZERO(ret);
RExC_emit += ANYOF_SKIP;
if (FOLD)
ANYOF_FLAGS(ret) |= ANYOF_FOLD;
if (LOC)
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
- }
- if (*RExC_parse == '^') { /* Complement of range. */
- RExC_naughty++;
- RExC_parse++;
- if (!SIZE_ONLY)
- ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ ANYOF_BITMAP_ZERO(ret);
+ listsv = newSVpvn("# comment\n", 10);
}
if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
checkposixcc(pRExC_state);
if (*RExC_parse == ']' || *RExC_parse == '-')
- goto skipcond; /* allow 1st char to be ] or - */
+ goto charclassloop; /* allow 1st char to be ] or - */
+
while (RExC_parse < RExC_end && *RExC_parse != ']') {
- skipcond:
- namedclass = OOB_NAMEDCLASS;
+
+ charclassloop:
+
+ namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
+
if (!range)
rangebegin = RExC_parse;
- value = UCHARAT(RExC_parse++);
+ if (UTF) {
+ value = utf8n_to_uvuni((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
+ RExC_parse += numlen;
+ }
+ else
+ value = UCHARAT(RExC_parse++);
if (value == '[')
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
- value = UCHARAT(RExC_parse++);
+ if (UTF) {
+ value = utf8n_to_uvuni((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
+ RExC_parse += numlen;
+ }
+ else
+ value = UCHARAT(RExC_parse++);
/* Some compilers cannot handle switching on 64-bit integer
- * values, therefore the 'value' cannot be an UV. --jhi */
- switch (value) {
+ * values, therefore value cannot be an UV. Yes, this will
+ * be a problem later if we want switch on Unicode.
+ * A similar issue a little bit later when switching on
+ * namedclass. --jhi */
+ switch ((I32)value) {
case 'w': namedclass = ANYOF_ALNUM; break;
case 'W': namedclass = ANYOF_NALNUM; break;
case 's': namedclass = ANYOF_SPACE; break;
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
case 'D': namedclass = ANYOF_NDIGIT; break;
+ case 'p':
+ case 'P':
+ if (*RExC_parse == '{') {
+ e = strchr(RExC_parse++, '}');
+ if (!e)
+ vFAIL("Missing right brace on \\p{}");
+ n = e - RExC_parse;
+ }
+ else {
+ e = RExC_parse;
+ n = 1;
+ }
+ if (!SIZE_ONLY) {
+ if (value == 'p')
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", (int)n, RExC_parse);
+ else
+ Perl_sv_catpvf(aTHX_ listsv,
+ "!utf8::%.*s\n", (int)n, RExC_parse);
+ }
+ RExC_parse = e + 1;
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ continue;
case 'n': value = '\n'; break;
case 'r': value = '\r'; break;
case 't': value = '\t'; break;
case 'a': value = '\057'; break;
#endif
case 'x':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
- RExC_parse += numlen;
+ if (*RExC_parse == '{') {
+ e = strchr(RExC_parse++, '}');
+ if (!e)
+ vFAIL("Missing right brace on \\x{}");
+ numlen = 1; /* allow underscores */
+ value = (UV)scan_hex(RExC_parse,
+ e - RExC_parse,
+ &numlen);
+ RExC_parse = e + 1;
+ }
+ else {
+ numlen = 0; /* disallow underscores */
+ value = (UV)scan_hex(RExC_parse, 2, &numlen);
+ RExC_parse += numlen;
+ }
break;
case 'c':
value = UCHARAT(RExC_parse++);
break;
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
-
- vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value);
+ vWARN2(RExC_parse,
+ "Unrecognized escape \\%c in character class passed through",
+ (int)value);
break;
}
- }
- if (namedclass > OOB_NAMEDCLASS) {
- if (!need_class && !SIZE_ONLY)
+ } /* end of \blah */
+
+ if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
+ if (!SIZE_ONLY && !need_class)
ANYOF_CLASS_ZERO(ret);
+
need_class = 1;
- if (range) { /* a-\d, a-[:digit:] */
+
+ /* a bad range like a-\d, a-[:digit:] ? */
+ if (range) {
if (!SIZE_ONLY) {
if (ckWARN(WARN_REGEXP))
vWARN4(RExC_parse,
RExC_parse - rangebegin,
RExC_parse - rangebegin,
rangebegin);
- ANYOF_BITMAP_SET(ret, lastvalue);
- ANYOF_BITMAP_SET(ret, '-');
+ if (lastvalue < 256) {
+ ANYOF_BITMAP_SET(ret, lastvalue);
+ ANYOF_BITMAP_SET(ret, '-');
+ }
+ else {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ Perl_sv_catpvf(aTHX_ listsv,
+ /* 0x002D is Unicode for '-' */
+ "%04"UVxf"\n002D\n", (UV)lastvalue);
+ }
}
- range = 0; /* this is not a true range */
+
+ range = 0; /* this was not a true range */
}
+
if (!SIZE_ONLY) {
- switch (namedclass) {
+ /* Possible truncation here but in some 64-bit environments
+ * the compiler gets heartburn about switch on 64-bit values.
+ * A similar issue a little earlier when switching on value.
+ * --jhi */
+ switch ((I32)namedclass) {
case ANYOF_ALNUM:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
if (isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
break;
case ANYOF_NALNUM:
if (LOC)
if (!isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
break;
- case ANYOF_SPACE:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_SPACE);
- else {
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_BITMAP_SET(ret, value);
- }
- break;
- case ANYOF_NSPACE:
+ case ANYOF_ALNUMC:
if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+ ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
else {
for (value = 0; value < 256; value++)
- if (!isSPACE(value))
+ if (isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- break;
- case ANYOF_DIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
- else {
- for (value = '0'; value <= '9'; value++)
- ANYOF_BITMAP_SET(ret, value);
- }
- break;
- case ANYOF_NDIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
- else {
- for (value = 0; value < '0'; value++)
- ANYOF_BITMAP_SET(ret, value);
- for (value = '9' + 1; value < 256; value++)
- ANYOF_BITMAP_SET(ret, value);
- }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
break;
case ANYOF_NALNUMC:
if (LOC)
if (!isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- break;
- case ANYOF_ALNUMC:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
- else {
- for (value = 0; value < 256; value++)
- if (isALNUMC(value))
- ANYOF_BITMAP_SET(ret, value);
- }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
break;
case ANYOF_ALPHA:
if (LOC)
if (isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
break;
case ANYOF_NALPHA:
if (LOC)
if (!isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
break;
case ANYOF_ASCII:
if (LOC)
ANYOF_BITMAP_SET(ret, value);
#endif /* EBCDIC */
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
break;
case ANYOF_NASCII:
if (LOC)
ANYOF_BITMAP_SET(ret, value);
#endif /* EBCDIC */
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
break;
case ANYOF_BLANK:
if (LOC)
if (isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
break;
case ANYOF_NBLANK:
if (LOC)
if (!isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
break;
case ANYOF_CNTRL:
if (LOC)
if (isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
- lastvalue = OOB_CHAR8;
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
break;
case ANYOF_NCNTRL:
if (LOC)
if (!isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
+ break;
+ case ANYOF_DIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+ else {
+ /* consecutive digits assumed */
+ for (value = '0'; value <= '9'; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
+ break;
+ case ANYOF_NDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+ else {
+ /* consecutive digits assumed */
+ for (value = 0; value < '0'; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ for (value = '9' + 1; value < 256; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
break;
case ANYOF_GRAPH:
if (LOC)
if (isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
break;
case ANYOF_NGRAPH:
if (LOC)
if (!isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
break;
case ANYOF_LOWER:
if (LOC)
if (isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
break;
case ANYOF_NLOWER:
if (LOC)
if (!isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
break;
case ANYOF_PRINT:
if (LOC)
if (isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
break;
case ANYOF_NPRINT:
if (LOC)
if (!isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
break;
case ANYOF_PSXSPC:
if (LOC)
if (isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
break;
case ANYOF_NPSXSPC:
if (LOC)
if (!isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
break;
case ANYOF_PUNCT:
if (LOC)
if (isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
break;
case ANYOF_NPUNCT:
if (LOC)
if (!isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
+ break;
+ case ANYOF_SPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_SPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isSPACE(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
+ break;
+ case ANYOF_NSPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isSPACE(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
break;
case ANYOF_UPPER:
if (LOC)
if (isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
break;
case ANYOF_NUPPER:
if (LOC)
if (!isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
break;
case ANYOF_XDIGIT:
if (LOC)
if (isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
break;
case ANYOF_NXDIGIT:
if (LOC)
if (!isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
break;
default:
vFAIL("Invalid [::] class");
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
continue;
}
- }
+ } /* end of namedclass \blah */
+
if (range) {
if (lastvalue > value) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
RExC_parse - rangebegin,
rangebegin);
}
- range = 0;
+ range = 0; /* not a true range */
}
else {
- lastvalue = value;
+ lastvalue = value; /* save the beginning of the range */
if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
RExC_parse[1] != ']') {
RExC_parse++;
- if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
+
+ /* a bad range like \w-, [:word:]- ? */
+ if (namedclass > OOB_NAMEDCLASS) {
if (ckWARN(WARN_REGEXP))
vWARN4(RExC_parse,
"False [] range \"%*.*s\"",
if (!SIZE_ONLY)
ANYOF_BITMAP_SET(ret, '-');
} else
- range = 1;
- continue; /* do it next time */
+ range = 1; /* yeah, it's a range! */
+ continue; /* but do it the next time */
}
}
+
/* now is the next time */
if (!SIZE_ONLY) {
+ if (lastvalue < 256 && value < 256) {
#ifndef ASCIIish /* EBCDIC, for example. */
- if ((isLOWER(lastvalue) && isLOWER(value)) ||
- (isUPPER(lastvalue) && isUPPER(value)))
- {
- I32 i;
- if (isLOWER(lastvalue)) {
- for (i = lastvalue; i <= value; i++)
- if (isLOWER(i))
- ANYOF_BITMAP_SET(ret, i);
- } else {
- for (i = lastvalue; i <= value; i++)
- if (isUPPER(i))
- ANYOF_BITMAP_SET(ret, i);
+ if ((isLOWER(lastvalue) && isLOWER(value)) ||
+ (isUPPER(lastvalue) && isUPPER(value)))
+ {
+ IV i;
+ if (isLOWER(lastvalue)) {
+ for (i = lastvalue; i <= value; i++)
+ if (isLOWER(i))
+ ANYOF_BITMAP_SET(ret, i);
+ } else {
+ for (i = lastvalue; i <= value; i++)
+ if (isUPPER(i))
+ ANYOF_BITMAP_SET(ret, i);
+ }
}
- }
- else
+ else
#endif
- for ( ; lastvalue <= value; lastvalue++)
- ANYOF_BITMAP_SET(ret, lastvalue);
+ for ( ; lastvalue <= value; lastvalue++)
+ ANYOF_BITMAP_SET(ret, lastvalue);
+ } else {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ if (lastvalue < value)
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+ (UV)lastvalue, (UV)value);
+ else
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)value);
+ }
}
- range = 0;
+
+ range = 0; /* this range (if it was one) is done now */
}
+
if (need_class) {
if (SIZE_ONLY)
RExC_size += ANYOF_CLASS_ADD_SKIP;
else
RExC_emit += ANYOF_CLASS_ADD_SKIP;
}
+
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
if (!SIZE_ONLY &&
- (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
+ (ANYOF_FLAGS(ret) &
+ /* If the only flag is folding (plus possibly inversion). */
+ (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
for (value = 0; value < 256; ++value) {
if (ANYOF_BITMAP_TEST(ret, value)) {
- I32 cf = PL_fold[value];
- ANYOF_BITMAP_SET(ret, cf);
+ IV fold = PL_fold[value];
+
+ if (fold != value)
+ ANYOF_BITMAP_SET(ret, fold);
}
}
ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
}
+
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+ if (!SIZE_ONLY && !dont_optimize_invert &&
+ /* If the only flag is inversion. */
+ (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
- ANYOF_FLAGS(ret) = 0;
+ ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
}
- return ret;
-}
-STATIC regnode *
-S_regclassutf8(pTHX_ RExC_state_t *pRExC_state)
-{
- dTHR;
- register char *e;
- register U32 value;
- register U32 lastvalue = OOB_UTF8;
- register I32 range = 0;
- register regnode *ret;
- STRLEN numlen;
- I32 n;
- SV *listsv;
- U8 flags = 0;
- I32 namedclass;
- char *rangebegin;
-
- if (*RExC_parse == '^') { /* Complement of range. */
- RExC_naughty++;
- RExC_parse++;
- if (!SIZE_ONLY)
- flags |= ANYOF_INVERT;
- }
if (!SIZE_ONLY) {
- if (FOLD)
- flags |= ANYOF_FOLD;
- if (LOC)
- flags |= ANYOF_LOCALE;
- listsv = newSVpvn("# comment\n",10);
- }
+ AV *av = newAV();
+ SV *rv;
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
- checkposixcc(pRExC_state);
-
- if (*RExC_parse == ']' || *RExC_parse == '-')
- goto skipcond; /* allow 1st char to be ] or - */
-
- while (RExC_parse < RExC_end && *RExC_parse != ']') {
- skipcond:
- namedclass = OOB_NAMEDCLASS;
- if (!range)
- rangebegin = RExC_parse;
- value = utf8_to_uv((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, 0);
- RExC_parse += numlen;
- if (value == '[')
- namedclass = regpposixcc(pRExC_state, value);
- else if (value == '\\') {
- value = (U32)utf8_to_uv((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, 0);
- RExC_parse += numlen;
- /* Some compilers cannot handle switching on 64-bit integer
- * values, therefore value cannot be an UV. Yes, this will
- * be a problem later if we want switch on Unicode. --jhi */
- switch (value) {
- case 'w': namedclass = ANYOF_ALNUM; break;
- case 'W': namedclass = ANYOF_NALNUM; break;
- case 's': namedclass = ANYOF_SPACE; break;
- case 'S': namedclass = ANYOF_NSPACE; break;
- case 'd': namedclass = ANYOF_DIGIT; break;
- case 'D': namedclass = ANYOF_NDIGIT; break;
- case 'p':
- case 'P':
- if (*RExC_parse == '{') {
- e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\p{}");
- n = e - RExC_parse;
- }
- else {
- e = RExC_parse;
- n = 1;
- }
- if (!SIZE_ONLY) {
- if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv,
- "+utf8::%.*s\n", (int)n, RExC_parse);
- else
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", (int)n, RExC_parse);
- }
- RExC_parse = e + 1;
- lastvalue = OOB_UTF8;
- continue;
- case 'n': value = '\n'; break;
- case 'r': value = '\r'; break;
- case 't': value = '\t'; break;
- case 'f': value = '\f'; break;
- case 'b': value = '\b'; break;
-#ifdef ASCIIish
- case 'e': value = '\033'; break;
- case 'a': value = '\007'; break;
-#else
- case 'e': value = '\047'; break;
- case 'a': value = '\057'; break;
-#endif
- case 'x':
- if (*RExC_parse == '{') {
- e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\x{}");
- numlen = 1; /* allow underscores */
- value = (UV)scan_hex(RExC_parse,
- e - RExC_parse,
- &numlen);
- RExC_parse = e + 1;
- }
- else {
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
- RExC_parse += numlen;
- }
- break;
- case 'c':
- value = UCHARAT(RExC_parse++);
- value = toCTRL(value);
- break;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_oct(--RExC_parse, 3, &numlen);
- RExC_parse += numlen;
- break;
- default:
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
- vWARN2(RExC_parse,
- "Unrecognized escape \\%c in character class passed through",
- (int)value);
- break;
- }
- }
- if (namedclass > OOB_NAMEDCLASS) {
- if (range) { /* a-\d, a-[:digit:] */
- if (!SIZE_ONLY) {
- if (ckWARN(WARN_REGEXP))
- vWARN4(RExC_parse,
- "False [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "%04"UVxf"\n002D\n", (UV)lastvalue);
- }
- range = 0;
- }
- if (!SIZE_ONLY) {
- switch (namedclass) {
- case ANYOF_ALNUM:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break;
- case ANYOF_NALNUM:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break;
- case ANYOF_ALNUMC:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break;
- case ANYOF_NALNUMC:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break;
- case ANYOF_ALPHA:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break;
- case ANYOF_NALPHA:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break;
- case ANYOF_ASCII:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break;
- case ANYOF_NASCII:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break;
- case ANYOF_CNTRL:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break;
- case ANYOF_NCNTRL:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break;
- case ANYOF_GRAPH:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break;
- case ANYOF_NGRAPH:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break;
- case ANYOF_DIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break;
- case ANYOF_NDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break;
- case ANYOF_LOWER:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break;
- case ANYOF_NLOWER:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break;
- case ANYOF_PRINT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break;
- case ANYOF_NPRINT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break;
- case ANYOF_PUNCT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break;
- case ANYOF_NPUNCT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break;
- case ANYOF_SPACE:
- case ANYOF_PSXSPC:
- case ANYOF_BLANK:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break;
- case ANYOF_NSPACE:
- case ANYOF_NPSXSPC:
- case ANYOF_NBLANK:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break;
- case ANYOF_UPPER:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break;
- case ANYOF_NUPPER:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break;
- case ANYOF_XDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break;
- case ANYOF_NXDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break;
- }
- continue;
- }
- }
- if (range) {
- if (lastvalue > value) { /* b-a */
- Simple_vFAIL4("invalid [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- }
- range = 0;
- }
- else {
- lastvalue = value;
- if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
- RExC_parse[1] != ']') {
- RExC_parse++;
- if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
- if (ckWARN(WARN_REGEXP))
- vWARN4(RExC_parse,
- "False [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "002D\n");
- } else
- range = 1;
- continue; /* do it next time */
- }
- }
- /* now is the next time */
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)lastvalue, (UV)value);
- range = 0;
- }
-
- ret = reganode(pRExC_state, ANYOFUTF8, 0);
-
- if (!SIZE_ONLY) {
- SV *rv = swash_init("utf8", "", listsv, 1, 0);
- SvREFCNT_dec(listsv);
- n = add_data(pRExC_state, 1,"s");
+ av_store(av, 0, listsv);
+ av_store(av, 1, NULL);
+ rv = newRV_noinc((SV*)av);
+ n = add_data(pRExC_state, 1, "s");
RExC_rx->data->data[n] = (void*)rv;
- ARG1_SET(ret, flags);
- ARG2_SET(ret, n);
+ ARG_SET(ret, n);
}
return ret;
STATIC char*
S_nextchar(pTHX_ RExC_state_t *pRExC_state)
{
- dTHR;
char* retval = RExC_parse++;
for (;;) {
STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
- dTHR;
register regnode *ret;
register regnode *ptr;
STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
- dTHR;
register regnode *ret;
register regnode *ptr;
STATIC void
S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
{
- dTHR;
- if (SIZE_ONLY) {
- U8 tmpbuf[UTF8_MAXLEN];
- *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf;
- }
- else
- *lenp = uv_to_utf8((U8*)s, uv) - (U8*)s;
-
+ *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvuni_to_utf8((U8*)s, uv) - (U8*)s);
}
/*
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
{
- dTHR;
register regnode *src;
register regnode *dst;
register regnode *place;
register int offset = regarglen[(U8)op];
-
+
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
if (SIZE_ONLY) {
STATIC void
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
{
- dTHR;
register regnode *scan;
register regnode *temp;
STATIC void
S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
{
- dTHR;
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
if (p == NULL || SIZE_ONLY)
return;
(int)(2*l + 1), "", SvPVX(sv));
if (next == NULL) /* Next ptr. */
PerlIO_printf(Perl_debug_log, "(0)");
- else
+ else
PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
(void)PerlIO_putc(Perl_debug_log, '\n');
after_print:
if (PL_regkind[(U8)op] == BRANCHJ) {
- register regnode *nnode = (OP(next) == LONGJMP
- ? regnext(next)
+ register regnode *nnode = (OP(next) == LONGJMP
+ ? regnext(next)
: next);
if (last && nnode > last)
nnode = last;
Perl_regdump(pTHX_ regexp *r)
{
#ifdef DEBUGGING
- dTHR;
SV *sv = sv_newmortal();
(void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
/* Header fields of interest. */
if (r->anchored_substr)
PerlIO_printf(Perl_debug_log,
- "anchored `%s%.*s%s'%s at %"IVdf" ",
+ "anchored `%s%.*s%s'%s at %"IVdf" ",
PL_colors[0],
(int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
- SvPVX(r->anchored_substr),
+ SvPVX(r->anchored_substr),
PL_colors[1],
SvTAIL(r->anchored_substr) ? "$" : "",
(IV)r->anchored_offset);
if (r->float_substr)
PerlIO_printf(Perl_debug_log,
- "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+ "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
PL_colors[0],
- (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
+ (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
SvPVX(r->float_substr),
PL_colors[1],
SvTAIL(r->float_substr) ? "$" : "",
(IV)r->float_min_offset, (UV)r->float_max_offset);
if (r->check_substr)
- PerlIO_printf(Perl_debug_log,
- r->check_substr == r->float_substr
+ PerlIO_printf(Perl_debug_log,
+ r->check_substr == r->float_substr
? "(checking floating" : "(checking anchored");
if (r->reganch & ROPT_NOSCAN)
PerlIO_printf(Perl_debug_log, " noscan");
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
- if (c <= ' ' || c == 127 || c == 255)
+ if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
else if (c == '-' || c == ']' || c == '\\' || c == '^')
Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
Perl_regprop(pTHX_ SV *sv, regnode *o)
{
#ifdef DEBUGGING
- dTHR;
register int k;
sv_setpvn(sv, "", 0);
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
int i, rangestart = -1;
- const char * const out[] = { /* Should be syncronized with
- ANYOF_ #xdefines in regcomp.h */
+ U8 flags = ANYOF_FLAGS(o);
+ const char * const anyofs[] = { /* Should be syncronized with
+ * ANYOF_ #xdefines in regcomp.h */
"\\w",
"\\W",
"\\s",
"[:^blank:]"
};
- if (o->flags & ANYOF_LOCALE)
+ if (flags & ANYOF_LOCALE)
sv_catpv(sv, "{loc}");
- if (o->flags & ANYOF_FOLD)
+ if (flags & ANYOF_FOLD)
sv_catpv(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (o->flags & ANYOF_INVERT)
+ if (flags & ANYOF_INVERT)
sv_catpv(sv, "^");
for (i = 0; i <= 256; i++) {
if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
rangestart = -1;
}
}
+
if (o->flags & ANYOF_CLASS)
- for (i = 0; i < sizeof(out)/sizeof(char*); i++)
+ for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
if (ANYOF_CLASS_TEST(o,i))
- sv_catpv(sv, out[i]);
+ sv_catpv(sv, anyofs[i]);
+
+ if (flags & ANYOF_UNICODE)
+ sv_catpv(sv, "{unicode}");
+ else if (flags & ANYOF_UNICODE_ALL)
+ sv_catpv(sv, "{all-unicode}");
+
+ {
+ SV *lv;
+ SV *sw = regclass_swash(o, FALSE, &lv);
+
+ if (lv) {
+ if (sw) {
+ UV i;
+ U8 s[UTF8_MAXLEN+1];
+
+ for (i = 0; i <= 256; i++) { /* just the first 256 */
+ U8 *e = uvuni_to_utf8(s, i);
+
+ if (i < 256 && swash_fetch(sw, s)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ U8 *p;
+
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++) {
+ for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+ put_byte(sv, *p);
+ }
+ else {
+ for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+ put_byte(sv, *p);
+ sv_catpv(sv, "-");
+ for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++)
+ put_byte(sv, *p);
+ }
+ rangestart = -1;
+ }
+ }
+
+ sv_catpv(sv, "..."); /* et cetera */
+ }
+
+ {
+ char *s = savepv(SvPVX(lv));
+ char *origs = s;
+
+ while(*s && *s != '\n') s++;
+
+ if (*s == '\n') {
+ char *t = ++s;
+
+ while (*s) {
+ if (*s == '\n')
+ *s = ' ';
+ s++;
+ }
+ if (s[-1] == ' ')
+ s[-1] = 0;
+
+ sv_catpv(sv, t);
+ }
+
+ Safefree(origs);
+ }
+ }
+ }
+
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
void
Perl_pregfree(pTHX_ struct regexp *r)
{
- dTHR;
DEBUG_r(if (!PL_colorset) reginitcolors());
if (!r || (--r->refcnt > 0))
regnode *
Perl_regnext(pTHX_ register regnode *p)
{
- dTHR;
register I32 offset;
if (p == &PL_regdummy)
void
Perl_save_re_context(pTHX)
-{
- dTHR;
-
+{
#if 0
SAVEPPTR(RExC_precomp); /* uncompiled string. */
SAVEI32(RExC_npar); /* () count. */
SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
SAVEPPTR(PL_regtill); /* How far we are required to go. */
SAVEI8(PL_regprev); /* char before regbol, \n if none */
- SAVEVPTR(PL_reg_start_tmp); /* from regexec.c */
+ SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
PL_reg_start_tmp = 0;
- SAVEFREEPV(PL_reg_start_tmp);
SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
PL_reg_start_tmpl = 0;
SAVEVPTR(PL_regdata);
SAVEI32(PL_reg_oldpos); /* from regexec.c */
SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
SAVEVPTR(PL_reg_curpm); /* from regexec.c */
+ SAVEI32(PL_regnpar); /* () count. */
#ifdef DEBUGGING
- SAVEPPTR(PL_reg_starttry); /* from regexec.c */
+ SAVEPPTR(PL_reg_starttry); /* from regexec.c */
#endif
}
{
ReREFCNT_dec((regexp *)r);
}
-