*
**** Alterations to Henry's code are...
****
- **** Copyright (c) 1991-2001, Larry Wall
+ **** Copyright (c) 1991-2002, Larry Wall
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
#define PERL_IN_REGCOMP_C
#include "perl.h"
-#ifdef PERL_IN_XSUB_RE
-# if defined(PERL_CAPI) || defined(PERL_OBJECT)
-# include "XSUB.h"
-# endif
-#else
+#ifndef PERL_IN_XSUB_RE
# include "INTERN.h"
#endif
#endif
typedef struct RExC_state_t {
- U16 flags16; /* are we folding, multilining? */
+ U32 flags; /* are we folding, multilining? */
char *precomp; /* uncompiled string. */
regexp *rx;
char *start; /* Start of input for compile */
#endif
} RExC_state_t;
-#define RExC_flags16 (pRExC_state->flags16)
+#define RExC_flags (pRExC_state->flags)
#define RExC_precomp (pRExC_state->precomp)
#define RExC_rx (pRExC_state->rx)
#define RExC_start (pRExC_state->start)
#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
#define SCF_WHILEM_VISITED_POS 0x2000
-#define UTF RExC_utf8
-#define LOC (RExC_flags16 & PMf_LOCALE)
-#define FOLD (RExC_flags16 & PMf_FOLD)
+#define UTF (RExC_utf8 != 0)
+#define LOC ((RExC_flags & PMf_LOCALE) != 0)
+#define FOLD ((RExC_flags & PMf_FOLD) != 0)
#define OOB_UNICODE 12345678
#define OOB_NAMEDCLASS -1
#define FAIL(msg) \
STMT_START { \
char *ellipses = ""; \
- unsigned len = strlen(RExC_precomp); \
+ IV len = RExC_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
#define FAIL2(pat,msg) \
STMT_START { \
char *ellipses = ""; \
- unsigned len = strlen(RExC_precomp); \
+ IV len = RExC_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
*/
#define Simple_vFAIL(m) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL2(m,a1) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL3(m, a1, a2) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL4(m, a1, a2, a3) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
(int)offset, RExC_precomp, RExC_precomp + offset); \
*/
#define Simple_vFAIL5(m, a1, a2, a3, a4) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN(loc,m) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END \
#define vWARNdep(loc,m) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
- int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
- Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END \
#define vWARN2(loc, m, a1) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
a1, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN3(loc, m, a1, a2) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN4(loc, m, a1, a2, a3) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
a1, a2, a3, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
/* used for the parse_flags section for (?c) -- japhy */
#define vWARN5(loc, m, a1, a2, a3, a4) \
STMT_START { \
- unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, a3, a4, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
-static void clear_re(pTHXo_ void *r);
+static void clear_re(pTHX_ void *r);
/* Mark that we cannot extend a found fixed substring at this point.
Updata the longest found anchored substring and the longest found
data->offset_float_max = (l
? data->last_start_max
: data->pos_min + data->pos_delta);
+ if ((U32)data->offset_float_max > (U32)I32_MAX)
+ data->offset_float_max = I32_MAX;
if (data->flags & SF_BEFORE_EOL)
data->flags
|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
STATIC void
S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
- int value;
-
ANYOF_CLASS_ZERO(cl);
- for (value = 0; value < 256; ++value)
- ANYOF_BITMAP_SET(cl, value);
+ ANYOF_BITMAP_SETALL(cl);
cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
return 1;
if (!(cl->flags & ANYOF_UNICODE_ALL))
return 0;
- for (value = 0; value < 256; ++value)
- if (!ANYOF_BITMAP_TEST(cl, value))
- return 0;
+ if (!ANYOF_BITMAP_TESTALLSET(cl))
+ return 0;
return 1;
}
}
}
+/*
+ * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
+ * These need to be revisited when a newer toolchain becomes available.
+ */
+#if defined(__sparc64__) && defined(__GNUC__)
+# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
+# undef SPARC64_GCC_WORKAROUND
+# define SPARC64_GCC_WORKAROUND 1
+# endif
+#endif
+
/* REx optimizer. Converts nodes into quickier variants "in place".
Finds fixed substrings. */
n = nnext;
}
}
+
+ if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
+/*
+ Two problematic code points in Unicode casefolding of EXACT nodes:
+
+ U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+
+ which casefold to
+
+ Unicode UTF-8
+
+ U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+
+ This means that in case-insensitive matching (or "loose matching",
+ as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
+ length of the above casefolded versions) can match a target string
+ of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
+ This would rather mess up the minimum length computation.
+
+ What we'll do is to look for the tail four bytes, and then peek
+ at the preceding two bytes to see whether we need to decrease
+ the minimum length by four (six minus two).
+
+ Thanks to the design of UTF-8, there cannot be false matches:
+ A sequence of valid UTF-8 bytes cannot be a subsequence of
+ another valid sequence of UTF-8 bytes.
+
+*/
+ char *s0 = STRING(scan), *s, *t;
+ char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
+ char *t0 = "\xcc\x88\xcc\x81";
+ char *t1 = t0 + 3;
+
+ for (s = s0 + 2;
+ s < s2 && (t = ninstr(s, s1, t0, t1));
+ s = t + 4) {
+ if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
+ ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+ min -= 4;
+ }
+ }
+
#ifdef DEBUGGING
/* Allow dumping */
n = scan + NODE_SZ_STR(scan);
? I32_MAX : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+ if (UTF)
+ SvUTF8_on(data->last_found);
data->last_end = data->pos_min + l;
data->pos_min += l; /* As in the first entry. */
data->flags &= ~SF_BEFORE_EOL;
regnode *oscan = scan;
struct regnode_charclass_class this_class;
struct regnode_charclass_class *oclass = NULL;
+ I32 next_is_eval = 0;
switch (PL_regkind[(U8)OP(scan)]) {
case WHILEM: /* End of (?:...)* . */
scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
}
scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
+ next_is_eval = (OP(scan) == EVAL);
do_curly:
if (flags & SCF_DO_SUBSTR) {
if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
if (ckWARN(WARN_REGEXP)
+ /* ? quantifier ok, except for (?{ ... }) */
+ && (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
if (OP(nxt) != CLOSE)
goto nogo;
/* Now we know that nxt2 is the only contents: */
- oscan->flags = ARG(nxt);
+ oscan->flags = (U8)ARG(nxt);
OP(oscan) = CURLYN;
OP(nxt1) = NOTHING; /* was OPEN. */
#ifdef DEBUGGING
if (OP(nxt) != CLOSE)
FAIL("Panic opt close");
- oscan->flags = ARG(nxt);
+ oscan->flags = (U8)ARG(nxt);
OP(nxt1) = OPTIMIZED; /* was OPEN. */
OP(nxt) = OPTIMIZED; /* was CLOSE. */
#ifdef DEBUGGING
if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
nxt += ARG(nxt);
- PREVOPER(nxt)->flags = data->whilem_c
- | (RExC_whilem_seen << 4); /* On WHILEM */
+ PREVOPER(nxt)->flags = (U8)(data->whilem_c
+ | (RExC_whilem_seen << 4)); /* On WHILEM */
}
if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
pars++;
int counted = mincount != 0;
if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
+#if defined(SPARC64_GCC_WORKAROUND)
+ I32 b = 0;
+ STRLEN l = 0;
+ char *s = NULL;
+ I32 old = 0;
+
+ if (pos_before >= data->last_start_min)
+ b = pos_before;
+ else
+ b = data->last_start_min;
+
+ l = 0;
+ s = SvPV(data->last_found, l);
+ old = b - data->last_start_min;
+
+#else
I32 b = pos_before >= data->last_start_min
? pos_before : data->last_start_min;
STRLEN l;
char *s = SvPV(data->last_found, l);
I32 old = b - data->last_start_min;
+#endif
if (UTF)
old = utf8_hop((U8*)s, old) - (U8*)s;
else if (minnext > U8_MAX) {
vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
}
- scan->flags = minnext;
+ scan->flags = (U8)minnext;
}
if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
pars++;
pars++;
}
else if (OP(scan) == CLOSE) {
- if (ARG(scan) == is_par) {
+ if ((I32)ARG(scan) == is_par) {
next = regnext(scan);
if ( next && (OP(next) != WHILEM) && next < last)
if (exp == NULL)
FAIL("NULL regexp argument");
- /* XXXX This looks very suspicious... */
- if (pm->op_pmdynflags & PMdf_CMP_UTF8)
- RExC_utf8 = 1;
- else
- RExC_utf8 = 0;
+ RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
RExC_precomp = exp;
- DEBUG_r(if (!PL_colorset) reginitcolors());
- DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
- PL_colors[4],PL_colors[5],PL_colors[0],
- (int)(xend - exp), RExC_precomp, PL_colors[1]));
- RExC_flags16 = pm->op_pmflags;
+ DEBUG_r({
+ if (!PL_colorset) reginitcolors();
+ PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+ PL_colors[4],PL_colors[5],PL_colors[0],
+ (int)(xend - exp), RExC_precomp, PL_colors[1]);
+ });
+ RExC_flags = pm->op_pmflags;
RExC_sawback = 0;
RExC_seen = 0;
RExC_rx = r;
/* Second pass: emit code. */
- RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
+ RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
RExC_parse = exp;
RExC_end = xend;
RExC_naughty = 0;
RExC_emit_start = r->program;
RExC_emit = r->program;
/* Store the count of eval-groups for security checks: */
- RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
+ RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
r->data = 0;
if (reg(pRExC_state, 0, &flags) == NULL)
/* Dig out information for optimizations. */
r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
- pm->op_pmflags = RExC_flags16;
+ pm->op_pmflags = RExC_flags;
if (UTF)
- r->reganch |= ROPT_UTF8;
+ r->reganch |= ROPT_UTF8; /* Unicode in it? */
r->regstclass = NULL;
if (RExC_naughty >= 10) /* Probably an expensive pattern. */
r->reganch |= ROPT_NAUGHTY;
first = NEXTOPER(first);
goto again;
}
- else if ((OP(first) == STAR &&
+ else if (!sawopen && (OP(first) == STAR &&
PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
!(r->reganch & ROPT_ANCH) )
{
if (longest_float_length
|| (data.flags & SF_FL_BEFORE_EOL
&& (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags16 & PMf_MULTILINE)))) {
+ || (RExC_flags & PMf_MULTILINE)))) {
int t;
if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
goto remove_float; /* As in (a)+. */
- r->float_substr = data.longest_float;
+ if (SvUTF8(data.longest_float)) {
+ r->float_utf8 = data.longest_float;
+ r->float_substr = Nullsv;
+ } else {
+ r->float_substr = data.longest_float;
+ r->float_utf8 = Nullsv;
+ }
r->float_min_offset = data.offset_float_min;
r->float_max_offset = data.offset_float_max;
t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
&& (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags16 & PMf_MULTILINE)));
- fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
+ || (RExC_flags & PMf_MULTILINE)));
+ fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
}
else {
remove_float:
- r->float_substr = Nullsv;
+ r->float_substr = r->float_utf8 = Nullsv;
SvREFCNT_dec(data.longest_float);
longest_float_length = 0;
}
if (longest_fixed_length
|| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
&& (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags16 & PMf_MULTILINE)))) {
+ || (RExC_flags & PMf_MULTILINE)))) {
int t;
- r->anchored_substr = data.longest_fixed;
+ if (SvUTF8(data.longest_fixed)) {
+ r->anchored_utf8 = data.longest_fixed;
+ r->anchored_substr = Nullsv;
+ } else {
+ r->anchored_substr = data.longest_fixed;
+ r->anchored_utf8 = Nullsv;
+ }
r->anchored_offset = data.offset_fixed;
t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
&& (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags16 & PMf_MULTILINE)));
- fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
+ || (RExC_flags & PMf_MULTILINE)));
+ fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
}
else {
- r->anchored_substr = Nullsv;
+ r->anchored_substr = r->anchored_utf8 = Nullsv;
SvREFCNT_dec(data.longest_fixed);
longest_fixed_length = 0;
}
if (r->regstclass
&& (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
r->regstclass = NULL;
- if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
+ if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
+ && stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
- && !cl_is_anything(data.start_class)) {
+ && !cl_is_anything(data.start_class))
+ {
I32 n = add_data(pRExC_state, 1, "f");
New(1006, RExC_rx->data->data[n], 1,
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
if (longest_fixed_length > longest_float_length) {
r->check_substr = r->anchored_substr;
+ r->check_utf8 = r->anchored_utf8;
r->check_offset_min = r->check_offset_max = r->anchored_offset;
if (r->reganch & ROPT_ANCH_SINGLE)
r->reganch |= ROPT_NOSCAN;
}
else {
r->check_substr = r->float_substr;
+ r->check_utf8 = r->float_utf8;
r->check_offset_min = data.offset_float_min;
r->check_offset_max = data.offset_float_max;
}
/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
This should be changed ASAP! */
- if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
+ if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
r->reganch |= RE_USE_INTUIT;
- if (SvTAIL(r->check_substr))
+ if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->reganch |= RE_INTUIT_TAIL;
}
}
data.start_class = &ch_class;
data.last_closep = &last_close;
minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
- r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
+ r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
+ = r->float_substr = r->float_utf8 = Nullsv;
if (!(data.start_class->flags & ANYOF_EOS)
- && !cl_is_anything(data.start_class)) {
+ && !cl_is_anything(data.start_class))
+ {
I32 n = add_data(pRExC_state, 1, "f");
New(1006, RExC_rx->data->data[n], 1,
register regnode *lastbr;
register regnode *ender = 0;
register I32 parno = 0;
- I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
+ I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
/* for (?g), (?gc), and (?o) warnings; warning
about (?c) will warn about (?g) -- japhy */
/* Make an OPEN node, if parenthesized. */
if (paren) {
if (*RExC_parse == '?') { /* (?...) */
- U16 posflags = 0, negflags = 0;
- U16 *flagsp = &posflags;
+ U32 posflags = 0, negflags = 0;
+ U32 *flagsp = &posflags;
int logical = 0;
char *seqstart = RExC_parse;
/* FALL THROUGH*/
case '?': /* (??...) */
logical = 1;
+ if (*RExC_parse != '{')
+ goto unknown;
paren = *RExC_parse++;
/* FALL THROUGH */
case '{': /* (?{...}) */
++RExC_parse;
goto parse_flags;
}
- RExC_flags16 |= posflags;
- RExC_flags16 &= ~negflags;
+ RExC_flags |= posflags;
+ RExC_flags &= ~negflags;
if (*RExC_parse == ':') {
RExC_parse++;
paren = ':';
}
else if (paren != '?') /* Not Conditional */
ret = br;
- if (flags&HASWIDTH)
- *flagp |= HASWIDTH;
- *flagp |= flags&SPSTART;
+ *flagp |= flags & (SPSTART | HASWIDTH);
lastbr = br;
while (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
static char parens[] = "=!<,>";
if (paren && (p = strchr(parens, paren))) {
- int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
+ U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
int flag = (p - parens) > 1;
if (paren == '>')
/* Check for proper termination. */
if (paren) {
- RExC_flags16 = oregflags;
+ RExC_flags = oregflags;
if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
if (max && max < min)
vFAIL("Can't do {n,m} with n > m");
if (!SIZE_ONLY) {
- ARG1_SET(ret, min);
- ARG2_SET(ret, max);
+ ARG1_SET(ret, (U16)min);
+ ARG2_SET(ret, (U16)max);
}
goto nest_check;
case '^':
RExC_seen_zerolen++;
nextchar(pRExC_state);
- if (RExC_flags16 & PMf_MULTILINE)
+ if (RExC_flags & PMf_MULTILINE)
ret = reg_node(pRExC_state, MBOL);
- else if (RExC_flags16 & PMf_SINGLELINE)
+ else if (RExC_flags & PMf_SINGLELINE)
ret = reg_node(pRExC_state, SBOL);
else
ret = reg_node(pRExC_state, BOL);
nextchar(pRExC_state);
if (*RExC_parse)
RExC_seen_zerolen++;
- if (RExC_flags16 & PMf_MULTILINE)
+ if (RExC_flags & PMf_MULTILINE)
ret = reg_node(pRExC_state, MEOL);
- else if (RExC_flags16 & PMf_SINGLELINE)
+ else if (RExC_flags & PMf_SINGLELINE)
ret = reg_node(pRExC_state, SEOL);
else
ret = reg_node(pRExC_state, EOL);
break;
case '.':
nextchar(pRExC_state);
- if (RExC_flags16 & PMf_SINGLELINE)
+ if (RExC_flags & PMf_SINGLELINE)
ret = reg_node(pRExC_state, SANY);
else
ret = reg_node(pRExC_state, REG_ANY);
case 'Z':
ret = reg_node(pRExC_state, SEOL);
*flagp |= SIMPLE;
+ RExC_seen_zerolen++; /* Do not optimize RE away */
nextchar(pRExC_state);
break;
case 'z':
Set_Node_Length(ret, 2); /* MJD */
break;
case 'w':
- ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
+ ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
break;
case 'W':
- ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
+ ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
+ ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
*flagp |= SIMPLE;
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
+ ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
*flagp |= SIMPLE;
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
break;
case 's':
- ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
+ ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
break;
case 'S':
- ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
+ ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
/* a lovely hack--pretend we saw [\pX] instead */
RExC_end = strchr(RExC_parse, '}');
if (!RExC_end) {
+ U8 c = (U8)*RExC_parse;
RExC_parse += 2;
RExC_end = oldregxend;
- vFAIL("Missing right brace on \\p{}");
+ vFAIL2("Missing right brace on \\%c{}", c);
}
RExC_end++;
}
- else
+ else {
RExC_end = RExC_parse + 2;
+ if (RExC_end > oldregxend)
+ RExC_end = oldregxend;
+ }
RExC_parse--;
ret = regclass(pRExC_state);
while (isDIGIT(*RExC_parse))
RExC_parse++;
- if (!SIZE_ONLY && num > RExC_rx->nparens)
+ if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
vFAIL("Reference to nonexistent group");
RExC_sawback = 1;
- ret = reganode(pRExC_state, FOLD
- ? (LOC ? REFFL : REFF)
- : REF, num);
+ ret = reganode(pRExC_state,
+ (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
+ num);
*flagp |= HASWIDTH;
/* override incorrect value set in reganode MJD */
break;
case '#':
- if (RExC_flags16 & PMf_EXTENDED) {
+ if (RExC_flags & PMf_EXTENDED) {
while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
if (RExC_parse < RExC_end)
goto tryagain;
register char *p;
char *oldp, *s;
STRLEN numlen;
+ STRLEN foldlen;
+ U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
parse_start = RExC_parse - 1;
RExC_parse++;
defchar:
- ret = reg_node(pRExC_state, FOLD
- ? (LOC ? EXACTFL : EXACTF)
- : EXACT);
+ ender = 0;
+ ret = reg_node(pRExC_state,
+ (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
s = STRING(ret);
for (len = 0, p = RExC_parse - 1;
len < 127 && p < RExC_end;
{
oldp = p;
- if (RExC_flags16 & PMf_EXTENDED)
+ if (RExC_flags & PMf_EXTENDED)
p = regwhite(p, RExC_end);
switch (*p) {
case '^':
case '\\':
switch (*++p) {
case 'A':
+ case 'C':
+ case 'X':
case 'G':
case 'Z':
case 'z':
vFAIL("Missing right brace on \\x{}");
}
else {
- numlen = 1; /* allow underscores */
- ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ numlen = e - p - 1;
+ ender = grok_hex(p + 1, &numlen, &flags, NULL);
if (ender > 0xff)
RExC_utf8 = 1;
/* numlen is generous */
}
}
else {
- numlen = 0; /* disallow underscores */
- ender = (UV)scan_hex(p, 2, &numlen);
+ I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+ numlen = 2;
+ ender = grok_hex(p, &numlen, &flags, NULL);
p += numlen;
}
break;
case '5': case '6': case '7': case '8':case '9':
if (*p == '0' ||
(isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
- numlen = 0; /* disallow underscores */
- ender = (UV)scan_oct(p, 3, &numlen);
+ I32 flags = 0;
+ numlen = 3;
+ ender = grok_oct(p, &numlen, &flags, NULL);
p += numlen;
}
else {
/* FALL THROUGH */
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
- vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
+ vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
goto normal_default;
}
break;
ender = *p++;
break;
}
- if (RExC_flags16 & PMf_EXTENDED)
+ if (RExC_flags & PMf_EXTENDED)
p = regwhite(p, RExC_end);
if (UTF && FOLD) {
- if (LOC)
- ender = toLOWER_LC_uvchr(ender);
- else
- ender = toLOWER_uni(ender);
+ /* Prime the casefolded buffer. */
+ ender = toFOLD_uni(ender, tmpbuf, &foldlen);
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
- reguni(pRExC_state, ender, s, &numlen);
- s += numlen;
- len += numlen;
+ else if (UTF) {
+ STRLEN unilen;
+
+ if (FOLD) {
+ /* Emit all the Unicode characters. */
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen) {
+ ender = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ reguni(pRExC_state, ender, s, &unilen);
+ s += unilen;
+ len += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break; /* "Can't happen." */
+ }
+ }
+ else {
+ reguni(pRExC_state, ender, s, &unilen);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ }
}
else {
len++;
- REGC(ender, s++);
+ REGC((char)ender, s++);
}
break;
}
- if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
- reguni(pRExC_state, ender, s, &numlen);
- s += numlen;
- len += numlen - 1;
+ if (UTF) {
+ STRLEN unilen;
+
+ if (FOLD) {
+ /* Emit all the Unicode characters. */
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen) {
+ ender = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ reguni(pRExC_state, ender, s, &unilen);
+ len += unilen;
+ s += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break;
+ }
+ }
+ else {
+ reguni(pRExC_state, ender, s, &unilen);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ }
+ len--;
}
else
- REGC(ender, s++);
+ REGC((char)ender, s++);
}
loopdone:
RExC_parse = p - 1;
break;
}
+ /* If the encoding pragma is in effect recode the text of
+ * any EXACT-kind nodes. */
+ if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
+ STRLEN oldlen = STR_LEN(ret);
+ SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
+
+ if (RExC_utf8)
+ SvUTF8_on(sv);
+ if (sv_utf8_downgrade(sv, TRUE)) {
+ char *s = sv_recode_to_utf8(sv, PL_encoding);
+ STRLEN newlen = SvCUR(sv);
+
+ if (!SIZE_ONLY) {
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+ (int)oldlen, STRING(ret),
+ (int)newlen, s));
+ Copy(s, STRING(ret), newlen, char);
+ STR_LEN(ret) += newlen - oldlen;
+ RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+ } else
+ RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+ }
+ }
+
return(ret);
}
Character classes ([:foo:]) can also be negated ([:^foo:]).
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
- but trigger warnings because they are currently unimplemented. */
+ but trigger failures because they are currently unimplemented. */
+
+#define POSIXCC_DONE(c) ((c) == ':')
+#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
+#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
+
STATIC I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
{
if (value == '[' && RExC_parse + 1 < RExC_end &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
- (*RExC_parse == ':' ||
- *RExC_parse == '=' ||
- *RExC_parse == '.')) {
- char c = *RExC_parse;
+ POSIXCC(UCHARAT(RExC_parse))) {
+ char c = UCHARAT(RExC_parse);
char* s = RExC_parse++;
- while (RExC_parse < RExC_end && *RExC_parse != c)
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
RExC_parse++;
if (RExC_parse == RExC_end)
/* Grandfather lone [:, [=, [. */
else {
char* t = RExC_parse++; /* skip over the c */
- if (*RExC_parse == ']') {
+ if (UCHARAT(RExC_parse) == ']') {
RExC_parse++; /* skip over the ending ] */
posixcc = s + 1;
if (*s == ':') {
/* adjust RExC_parse so the warning shows after
the class closes */
- while (*RExC_parse && *RExC_parse != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
RExC_parse++;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
STATIC void
S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
- (*RExC_parse == ':' ||
- *RExC_parse == '=' ||
- *RExC_parse == '.')) {
+ if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
char *s = RExC_parse;
char c = *s++;
while(*s && isALNUM(*s))
s++;
if (*s && c == *s && s[1] == ']') {
- vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
+ if (ckWARN(WARN_REGEXP))
+ vWARN3(s+2,
+ "POSIX syntax [%c %c] belongs inside character classes",
+ c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
- if (c == '=' || c == '.')
- {
+ if (POSIXCC_NOTYET(c)) {
/* adjust RExC_parse so the error shows after
the class closes */
- while (*RExC_parse && *RExC_parse++ != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
register UV value;
+ register UV nextvalue;
register IV prevvalue = OOB_UNICODE;
register IV range = 0;
register regnode *ret;
SV *listsv = Nullsv;
register char *e;
UV n;
- bool optimize_invert = TRUE;
+ bool optimize_invert = TRUE;
+ AV* unicode_alternate = 0;
ret = reganode(pRExC_state, ANYOF, 0);
if (!SIZE_ONLY)
ANYOF_FLAGS(ret) = 0;
- if (*RExC_parse == '^') { /* Complement of range. */
+ if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
RExC_parse++;
if (!SIZE_ONLY)
listsv = newSVpvn("# comment\n", 10);
}
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+
+ if (!SIZE_ONLY && POSIXCC(nextvalue))
checkposixcc(pRExC_state);
- if (*RExC_parse == ']' || *RExC_parse == '-')
- goto charclassloop; /* allow 1st char to be ] or - */
+ /* allow 1st char to be ] (allowing it to be - is dealt with later) */
+ if (UCHARAT(RExC_parse) == ']')
+ goto charclassloop;
- while (RExC_parse < RExC_end && *RExC_parse != ']') {
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
charclassloop:
}
else
value = UCHARAT(RExC_parse++);
- if (value == '[')
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+ if (value == '[' && POSIXCC(nextvalue))
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
if (UTF) {
case 'D': namedclass = ANYOF_NDIGIT; break;
case 'p':
case 'P':
+ if (RExC_parse >= RExC_end)
+ vFAIL2("Empty \\%c{}", (U8)value);
if (*RExC_parse == '{') {
+ U8 c = (U8)value;
e = strchr(RExC_parse++, '}');
if (!e)
- vFAIL("Missing right brace on \\p{}");
+ vFAIL2("Missing right brace on \\%c{}", c);
+ while (isSPACE(UCHARAT(RExC_parse)))
+ RExC_parse++;
+ if (e == RExC_parse)
+ vFAIL2("Empty \\%c{}", c);
n = e - RExC_parse;
+ while (isSPACE(UCHARAT(RExC_parse + n - 1)))
+ n--;
}
else {
e = RExC_parse;
n = 1;
}
if (!SIZE_ONLY) {
+ if (UCHARAT(RExC_parse) == '^') {
+ RExC_parse++;
+ n--;
+ value = value == 'p' ? 'P' : 'p'; /* toggle */
+ while (isSPACE(UCHARAT(RExC_parse))) {
+ RExC_parse++;
+ n--;
+ }
+ }
if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv,
- "+utf8::%.*s\n", (int)n, RExC_parse);
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", (int)n, RExC_parse);
else
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", (int)n, RExC_parse);
+ Perl_sv_catpvf(aTHX_ listsv,
+ "!utf8::%.*s\n", (int)n, RExC_parse);
}
RExC_parse = e + 1;
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
case 'a': value = ASCII_TO_NATIVE('\007');break;
case 'x':
if (*RExC_parse == '{') {
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
e = strchr(RExC_parse++, '}');
if (!e)
vFAIL("Missing right brace on \\x{}");
- numlen = 1; /* allow underscores */
- value = (UV)scan_hex(RExC_parse,
- e - RExC_parse,
- &numlen);
+
+ numlen = e - RExC_parse;
+ value = grok_hex(RExC_parse, &numlen, &flags, NULL);
RExC_parse = e + 1;
}
else {
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
+ I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+ numlen = 2;
+ value = grok_hex(RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
}
break;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_oct(--RExC_parse, 3, &numlen);
+ {
+ I32 flags = 0;
+ numlen = 3;
+ value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
break;
+ }
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
vWARN2(RExC_parse,
} /* end of namedclass \blah */
if (range) {
- if (prevvalue > value) /* b-a */ {
+ if (prevvalue > (IV)value) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
RExC_parse - rangebegin,
RExC_parse - rangebegin,
}
else
#endif
- for (i = prevvalue; i <= ceilvalue; i++)
- ANYOF_BITMAP_SET(ret, i);
+ for (i = prevvalue; i <= ceilvalue; i++)
+ ANYOF_BITMAP_SET(ret, i);
}
- if (value > 255) {
+ if (value > 255 || UTF) {
+ UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
+ UV natvalue = NATIVE_TO_UNI(value);
+
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
- if (prevvalue < value)
+ if (prevnatvalue < natvalue) { /* what about > ? */
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)prevvalue, (UV)value);
- else if (prevvalue == value)
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
- (UV)value);
+ prevnatvalue, natvalue);
+ }
+ else if (prevnatvalue == natvalue) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
+ if (FOLD) {
+ U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+ STRLEN foldlen;
+ UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
+
+ /* If folding and foldable and a single
+ * character, insert also the folded version
+ * to the charclass. */
+ if (f != value) {
+ if (foldlen == (STRLEN)UNISKIP(f))
+ Perl_sv_catpvf(aTHX_ listsv,
+ "%04"UVxf"\n", f);
+ else {
+ /* Any multicharacter foldings
+ * require the following transform:
+ * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
+ * where E folds into "pq" and F folds
+ * into "rst", all other characters
+ * fold to single characters. We save
+ * away these multicharacter foldings,
+ * to be later saved as part of the
+ * additional "s" data. */
+ SV *sv;
+
+ if (!unicode_alternate)
+ unicode_alternate = newAV();
+ sv = newSVpvn((char*)foldbuf, foldlen);
+ SvUTF8_on(sv);
+ av_push(unicode_alternate, sv);
+ }
+ }
+
+ /* If folding and the value is one of the Greek
+ * sigmas insert a few more sigmas to make the
+ * folding rules of the sigmas to work right.
+ * Note that not all the possible combinations
+ * are handled here: some of them are handled
+ * by the standard folding rules, and some of
+ * them (literal or EXACTF cases) are handled
+ * during runtime in regexec.c:S_find_byclass(). */
+ if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+ }
+ else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+ }
+ }
}
}
) {
for (value = 0; value < 256; ++value) {
if (ANYOF_BITMAP_TEST(ret, value)) {
- IV fold = PL_fold[value];
+ UV fold = PL_fold[value];
if (fold != value)
ANYOF_BITMAP_SET(ret, fold);
AV *av = newAV();
SV *rv;
+ /* The 0th element stores the character class description
+ * in its textual form: used later (regexec.c:Perl_regclass_swatch())
+ * to initialize the appropriate swash (which gets stored in
+ * the 1st element), and also useful for dumping the regnode.
+ * The 2nd element stores the multicharacter foldings,
+ * used later (regexec.c:s_reginclasslen()). */
av_store(av, 0, listsv);
av_store(av, 1, NULL);
+ av_store(av, 2, (SV*)unicode_alternate);
rv = newRV_noinc((SV*)av);
n = add_data(pRExC_state, 1, "s");
RExC_rx->data->data[n] = (void*)rv;
RExC_parse++;
continue;
}
- if (RExC_flags16 & PMf_EXTENDED) {
+ if (RExC_flags & PMf_EXTENDED) {
if (isSPACE(*RExC_parse)) {
RExC_parse++;
continue;
PL_colors[1],
SvTAIL(r->anchored_substr) ? "$" : "",
(IV)r->anchored_offset);
+ else if (r->anchored_utf8)
+ PerlIO_printf(Perl_debug_log,
+ "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
+ PL_colors[0],
+ (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
+ SvPVX(r->anchored_utf8),
+ PL_colors[1],
+ SvTAIL(r->anchored_utf8) ? "$" : "",
+ (IV)r->anchored_offset);
if (r->float_substr)
PerlIO_printf(Perl_debug_log,
"floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
PL_colors[1],
SvTAIL(r->float_substr) ? "$" : "",
(IV)r->float_min_offset, (UV)r->float_max_offset);
- if (r->check_substr)
+ else if (r->float_utf8)
+ PerlIO_printf(Perl_debug_log,
+ "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+ PL_colors[0],
+ (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
+ SvPVX(r->float_utf8),
+ PL_colors[1],
+ SvTAIL(r->float_utf8) ? "$" : "",
+ (IV)r->float_min_offset, (UV)r->float_max_offset);
+ if (r->check_substr || r->check_utf8)
PerlIO_printf(Perl_debug_log,
r->check_substr == r->float_substr
+ && r->check_utf8 == r->float_utf8
? "(checking floating" : "(checking anchored");
if (r->reganch & ROPT_NOSCAN)
PerlIO_printf(Perl_debug_log, " noscan");
if (r->reganch & ROPT_CHECK_ALL)
PerlIO_printf(Perl_debug_log, " isall");
- if (r->check_substr)
+ if (r->check_substr || r->check_utf8)
PerlIO_printf(Perl_debug_log, ") ");
if (r->regstclass) {
k = PL_regkind[(U8)OP(o)];
- if (k == EXACT)
- Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
- STR_LEN(o), STRING(o), PL_colors[1]);
+ if (k == EXACT) {
+ SV *dsv = sv_2mortal(newSVpvn("", 0));
+ /* Using is_utf8_string() is a crude hack but it may
+ * be the best for now since we have no flag "this EXACTish
+ * node was UTF-8" --jhi */
+ bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
+ char *s = do_utf8 ?
+ pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
+ UNI_DISPLAY_REGEX) :
+ STRING(o);
+ int len = do_utf8 ?
+ strlen(s) :
+ STR_LEN(o);
+ Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
+ PL_colors[0],
+ len, s,
+ PL_colors[1]);
+ }
else if (k == CURLY) {
if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
if (flags & ANYOF_UNICODE)
sv_catpv(sv, "{unicode}");
else if (flags & ANYOF_UNICODE_ALL)
- sv_catpv(sv, "{all-unicode}");
+ sv_catpv(sv, "{unicode_all}");
{
SV *lv;
- SV *sw = regclass_swash(o, FALSE, &lv);
+ SV *sw = regclass_swash(o, FALSE, &lv, 0);
if (lv) {
if (sw) {
- UV i;
U8 s[UTF8_MAXLEN+1];
for (i = 0; i <= 256; i++) { /* just the first 256 */
{ /* Assume that RE_INTUIT is set */
DEBUG_r(
{ STRLEN n_a;
- char *s = SvPV(prog->check_substr,n_a);
+ char *s = SvPV(prog->check_substr
+ ? prog->check_substr : prog->check_utf8, n_a);
if (!PL_colorset) reginitcolors();
PerlIO_printf(Perl_debug_log,
- "%sUsing REx substr:%s `%s%.60s%s%s'\n",
- PL_colors[4],PL_colors[5],PL_colors[0],
+ "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
+ PL_colors[4],
+ prog->check_substr ? "" : "utf8 ",
+ PL_colors[5],PL_colors[0],
s,
PL_colors[1],
(strlen(s) > 60 ? "..." : ""));
} );
- return prog->check_substr;
+ return prog->check_substr ? prog->check_substr : prog->check_utf8;
}
void
Perl_pregfree(pTHX_ struct regexp *r)
{
- DEBUG_r(if (!PL_colorset) reginitcolors());
+#ifdef DEBUGGING
+ SV *dsv = PERL_DEBUG_PAD_ZERO(0);
+#endif
if (!r || (--r->refcnt > 0))
return;
- DEBUG_r(PerlIO_printf(Perl_debug_log,
- "%sFreeing REx:%s `%s%.60s%s%s'\n",
- PL_colors[4],PL_colors[5],PL_colors[0],
- r->precomp,
- PL_colors[1],
- (strlen(r->precomp) > 60 ? "..." : "")));
+ DEBUG_r({
+ char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
+ UNI_DISPLAY_REGEX);
+ int len = SvCUR(dsv);
+ if (!PL_colorset)
+ reginitcolors();
+ PerlIO_printf(Perl_debug_log,
+ "%sFreeing REx:%s `%s%*.*s%s%s'\n",
+ PL_colors[4],PL_colors[5],PL_colors[0],
+ len, len, s,
+ PL_colors[1],
+ len > 60 ? "..." : "");
+ });
if (r->precomp)
Safefree(r->precomp);
if (r->substrs) {
if (r->anchored_substr)
SvREFCNT_dec(r->anchored_substr);
+ if (r->anchored_utf8)
+ SvREFCNT_dec(r->anchored_utf8);
if (r->float_substr)
SvREFCNT_dec(r->float_substr);
+ if (r->float_utf8)
+ SvREFCNT_dec(r->float_utf8);
Safefree(r->substrs);
}
if (r->data) {
new_comppad = NULL;
break;
case 'n':
- break;
+ break;
default:
Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
}
SAVEPPTR(RExC_precomp); /* uncompiled string. */
SAVEI32(RExC_npar); /* () count. */
SAVEI32(RExC_size); /* Code size. */
- SAVEI16(RExC_flags16); /* are we folding, multilining? */
+ SAVEI32(RExC_flags); /* are we folding, multilining? */
SAVEVPTR(RExC_rx); /* from regcomp.c */
SAVEI32(RExC_seen); /* from regcomp.c */
SAVEI32(RExC_sawback); /* Did we see \1, ...? */
SAVEVPTR(PL_reg_re); /* from regexec.c */
SAVEPPTR(PL_reg_ganch); /* from regexec.c */
SAVESPTR(PL_reg_sv); /* from regexec.c */
- SAVEI8(PL_reg_sv_utf8); /* from regexec.c */
+ SAVEI8(PL_reg_match_utf8); /* from regexec.c */
SAVEVPTR(PL_reg_magic); /* from regexec.c */
SAVEI32(PL_reg_oldpos); /* from regexec.c */
SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
#endif
}
-#ifdef PERL_OBJECT
-#include "XSUB.h"
-#undef this
-#define this pPerl
-#endif
-
static void
-clear_re(pTHXo_ void *r)
+clear_re(pTHX_ void *r)
{
ReREFCNT_dec((regexp *)r);
}