#define vWARN(loc,m) \
STMT_START { \
IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END \
#define vWARNdep(loc,m) \
STMT_START { \
IV offset = loc - RExC_precomp; \
- int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
- Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
+ Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END \
#define vWARN2(loc, m, a1) \
STMT_START { \
IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
a1, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN3(loc, m, a1, a2) \
STMT_START { \
IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN4(loc, m, a1, a2, a3) \
STMT_START { \
IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
a1, a2, a3, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN5(loc, m, a1, a2, a3, a4) \
STMT_START { \
IV offset = loc - RExC_precomp; \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, a3, a4, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
? I32_MAX : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+ if (UTF)
+ SvUTF8_on(data->last_found);
data->last_end = data->pos_min + l;
data->pos_min += l; /* As in the first entry. */
data->flags &= ~SF_BEFORE_EOL;
&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
goto remove_float; /* As in (a)+. */
- r->float_substr = data.longest_float;
+ if (SvUTF8(data.longest_float)) {
+ r->float_utf8 = data.longest_float;
+ r->float_substr = Nullsv;
+ } else {
+ r->float_substr = data.longest_float;
+ r->float_utf8 = Nullsv;
+ }
r->float_min_offset = data.offset_float_min;
r->float_max_offset = data.offset_float_max;
t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
&& (!(data.flags & SF_FL_BEFORE_MEOL)
|| (RExC_flags16 & PMf_MULTILINE)));
- fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
+ fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
}
else {
remove_float:
- r->float_substr = Nullsv;
+ r->float_substr = r->float_utf8 = Nullsv;
SvREFCNT_dec(data.longest_float);
longest_float_length = 0;
}
|| (RExC_flags16 & PMf_MULTILINE)))) {
int t;
- r->anchored_substr = data.longest_fixed;
+ if (SvUTF8(data.longest_fixed)) {
+ r->anchored_utf8 = data.longest_fixed;
+ r->anchored_substr = Nullsv;
+ } else {
+ r->anchored_substr = data.longest_fixed;
+ r->anchored_utf8 = Nullsv;
+ }
r->anchored_offset = data.offset_fixed;
t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
&& (!(data.flags & SF_FIX_BEFORE_MEOL)
|| (RExC_flags16 & PMf_MULTILINE)));
- fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
+ fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
}
else {
- r->anchored_substr = Nullsv;
+ r->anchored_substr = r->anchored_utf8 = Nullsv;
SvREFCNT_dec(data.longest_fixed);
longest_fixed_length = 0;
}
if (r->regstclass
&& (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
r->regstclass = NULL;
- if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
+ if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
+ && stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class)) {
I32 n = add_data(pRExC_state, 1, "f");
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
if (longest_fixed_length > longest_float_length) {
r->check_substr = r->anchored_substr;
+ r->check_utf8 = r->anchored_utf8;
r->check_offset_min = r->check_offset_max = r->anchored_offset;
if (r->reganch & ROPT_ANCH_SINGLE)
r->reganch |= ROPT_NOSCAN;
}
else {
r->check_substr = r->float_substr;
+ r->check_utf8 = r->float_utf8;
r->check_offset_min = data.offset_float_min;
r->check_offset_max = data.offset_float_max;
}
/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
This should be changed ASAP! */
- if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
+ if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
r->reganch |= RE_USE_INTUIT;
- if (SvTAIL(r->check_substr))
+ if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->reganch |= RE_INTUIT_TAIL;
}
}
data.start_class = &ch_class;
data.last_closep = &last_close;
minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
- r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
+ r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
+ = r->float_substr = r->float_utf8 = Nullsv;
if (!(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class)) {
I32 n = add_data(pRExC_state, 1, "f");
/* FALL THROUGH*/
case '?': /* (??...) */
logical = 1;
+ if (*RExC_parse != '{')
+ goto unknown;
paren = *RExC_parse++;
/* FALL THROUGH */
case '{': /* (?{...}) */
}
RExC_end++;
}
- else
+ else {
RExC_end = RExC_parse + 2;
+ if (RExC_end > oldregxend)
+ RExC_end = oldregxend;
+ }
RExC_parse--;
ret = regclass(pRExC_state);
RExC_parse++;
defchar:
+ ender = 0;
ret = reg_node(pRExC_state, FOLD
? (LOC ? EXACTFL : EXACTF)
: EXACT);
if (len)
p = oldp;
else if (UTF) {
+ STRLEN unilen;
+
if (FOLD) {
/* Emit all the Unicode characters. */
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
ender = utf8_to_uvchr(foldbuf, &numlen);
- reguni(pRExC_state, ender, s, &numlen);
- s += numlen;
- len += numlen;
- foldbuf += numlen;
+ if (numlen > 0) {
+ reguni(pRExC_state, ender, s, &unilen);
+ s += unilen;
+ len += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break; /* "Can't happen." */
}
}
else {
- reguni(pRExC_state, ender, s, &numlen);
- s += numlen;
- len += numlen;
+ reguni(pRExC_state, ender, s, &unilen);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
}
}
else {
break;
}
if (UTF) {
+ STRLEN unilen;
+
if (FOLD) {
/* Emit all the Unicode characters. */
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
ender = utf8_to_uvchr(foldbuf, &numlen);
- reguni(pRExC_state, ender, s, &numlen);
- s += numlen;
- len += numlen;
- foldbuf += numlen;
+ if (numlen > 0) {
+ reguni(pRExC_state, ender, s, &unilen);
+ len += unilen;
+ s += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break;
}
}
else {
- reguni(pRExC_state, ender, s, &numlen);
- s += numlen;
- len += numlen;
+ reguni(pRExC_state, ender, s, &unilen);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
}
len--;
}
if (RExC_utf8)
SvUTF8_on(sv);
if (sv_utf8_downgrade(sv, TRUE)) {
- char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+ char *s = sv_recode_to_utf8(sv, PL_encoding);
STRLEN newlen = SvCUR(sv);
if (!SIZE_ONLY) {
STATIC void
S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
- POSIXCC(UCHARAT(RExC_parse))) {
+ if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
char *s = RExC_parse;
char c = *s++;
while(*s && isALNUM(*s))
s++;
if (*s && c == *s && s[1] == ']') {
- vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
+ if (ckWARN(WARN_REGEXP))
+ vWARN3(s+2,
+ "POSIX syntax [%c %c] belongs inside character classes",
+ c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
if (POSIXCC_NOTYET(c)) {
nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
+ if (!SIZE_ONLY && POSIXCC(nextvalue))
checkposixcc(pRExC_state);
/* allow 1st char to be ] (allowing it to be - is dealt with later) */
case 'D': namedclass = ANYOF_NDIGIT; break;
case 'p':
case 'P':
+ if (RExC_parse >= RExC_end)
+ vFAIL2("Empty \\%c{}", (U8)value);
if (*RExC_parse == '{') {
U8 c = (U8)value;
e = strchr(RExC_parse++, '}');
ANYOF_BITMAP_SET(ret, i);
}
if (value > 255 || UTF) {
+ UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
+ UV natvalue = NATIVE_TO_UNI(value);
+
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
- if (prevvalue < value)
+ if (prevnatvalue < natvalue) { /* what about > ? */
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)prevvalue, (UV)value);
- else if (prevvalue == value) {
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
- (UV)value);
+ prevnatvalue, natvalue);
+ }
+ else if (prevnatvalue == natvalue) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
if (FOLD) {
- U8 tmpbuf [UTF8_MAXLEN+1];
U8 foldbuf[UTF8_MAXLEN_FOLD+1];
STRLEN foldlen;
- UV f;
-
- uvchr_to_utf8(tmpbuf, value);
- to_utf8_fold(tmpbuf, foldbuf, &foldlen);
- f = utf8_to_uvchr(foldbuf, 0);
+ UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
/* If folding and foldable and a single
* character, insert also the folded version
PL_colors[1],
SvTAIL(r->anchored_substr) ? "$" : "",
(IV)r->anchored_offset);
+ else if (r->anchored_utf8)
+ PerlIO_printf(Perl_debug_log,
+ "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
+ PL_colors[0],
+ (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
+ SvPVX(r->anchored_utf8),
+ PL_colors[1],
+ SvTAIL(r->anchored_utf8) ? "$" : "",
+ (IV)r->anchored_offset);
if (r->float_substr)
PerlIO_printf(Perl_debug_log,
"floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
PL_colors[1],
SvTAIL(r->float_substr) ? "$" : "",
(IV)r->float_min_offset, (UV)r->float_max_offset);
- if (r->check_substr)
+ else if (r->float_utf8)
+ PerlIO_printf(Perl_debug_log,
+ "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+ PL_colors[0],
+ (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
+ SvPVX(r->float_utf8),
+ PL_colors[1],
+ SvTAIL(r->float_utf8) ? "$" : "",
+ (IV)r->float_min_offset, (UV)r->float_max_offset);
+ if (r->check_substr || r->check_utf8)
PerlIO_printf(Perl_debug_log,
r->check_substr == r->float_substr
+ && r->check_utf8 == r->float_utf8
? "(checking floating" : "(checking anchored");
if (r->reganch & ROPT_NOSCAN)
PerlIO_printf(Perl_debug_log, " noscan");
if (r->reganch & ROPT_CHECK_ALL)
PerlIO_printf(Perl_debug_log, " isall");
- if (r->check_substr)
+ if (r->check_substr || r->check_utf8)
PerlIO_printf(Perl_debug_log, ") ");
if (r->regstclass) {
{ /* Assume that RE_INTUIT is set */
DEBUG_r(
{ STRLEN n_a;
- char *s = SvPV(prog->check_substr,n_a);
+ char *s = SvPV(prog->check_substr
+ ? prog->check_substr : prog->check_utf8, n_a);
if (!PL_colorset) reginitcolors();
PerlIO_printf(Perl_debug_log,
- "%sUsing REx substr:%s `%s%.60s%s%s'\n",
- PL_colors[4],PL_colors[5],PL_colors[0],
+ "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
+ PL_colors[4],
+ prog->check_substr ? "" : "utf8 ",
+ PL_colors[5],PL_colors[0],
s,
PL_colors[1],
(strlen(s) > 60 ? "..." : ""));
} );
- return prog->check_substr;
+ return prog->check_substr ? prog->check_substr : prog->check_utf8;
}
void
if (r->substrs) {
if (r->anchored_substr)
SvREFCNT_dec(r->anchored_substr);
+ if (r->anchored_utf8)
+ SvREFCNT_dec(r->anchored_utf8);
if (r->float_substr)
SvREFCNT_dec(r->float_substr);
+ if (r->float_utf8)
+ SvREFCNT_dec(r->float_utf8);
Safefree(r->substrs);
}
if (r->data) {