STATIC void
S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
- int value;
-
ANYOF_CLASS_ZERO(cl);
- for (value = 0; value < 256; ++value)
- ANYOF_BITMAP_SET(cl, value);
+ ANYOF_BITMAP_SETALL(cl);
cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
return 1;
if (!(cl->flags & ANYOF_UNICODE_ALL))
return 0;
- for (value = 0; value < 256; ++value)
- if (!ANYOF_BITMAP_TEST(cl, value))
- return 0;
+ if (!ANYOF_BITMAP_TESTALLSET(cl))
+ return 0;
return 1;
}
r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
pm->op_pmflags = RExC_flags16;
if (UTF)
- r->reganch |= ROPT_UTF8;
+ r->reganch |= ROPT_UTF8; /* Unicode in it? */
r->regstclass = NULL;
if (RExC_naughty >= 10) /* Probably an expensive pattern. */
r->reganch |= ROPT_NAUGHTY;
case 'Z':
ret = reg_node(pRExC_state, SEOL);
*flagp |= SIMPLE;
+ RExC_seen_zerolen++; /* Do not optimize RE away */
nextchar(pRExC_state);
break;
case 'z':
break;
}
+ /* When PL_encoding is set and the node just built is an EXACT-kind node
+  * from a pattern not yet flagged as UTF-8, recode the node's string via
+  * sv_recode_to_utf8() and account for the change in string length:
+  * in the sizing pass only RExC_size is adjusted, otherwise the recoded
+  * bytes are copied into the node and RExC_emit is moved accordingly. */
+ if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) {
+ STRLEN oldlen = STR_LEN(ret);
+ SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
+ char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+ STRLEN newlen = SvCUR(sv);
+ if (!SIZE_ONLY) {
+ /* A '*' field width consumes an int from the varargs; STRLEN may
+  * be wider than int, so the lengths must be cast explicitly. */
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+ (int)oldlen, STRING(ret), (int)newlen, s));
+ Copy(s, STRING(ret), newlen, char);
+ STR_LEN(ret) += newlen - oldlen;
+ RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+ } else
+ RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+ /* From here on the pattern is treated as UTF-8. */
+ RExC_utf8 = 1;
+ }
+
return(ret);
}
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
but trigger failures because they are currently unimplemented. */
+
+#define POSIXCC_DONE(c) ((c) == ':') /* true for ':', the delimiter of [:class:] */
+#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') /* true for '=' or '.', the [=foo=] / [.foo.] forms that are parsed but unimplemented */
+#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) /* true for any of ':', '=', '.'; note that c is evaluated up to three times */
+
STATIC I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
{
if (value == '[' && RExC_parse + 1 < RExC_end &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
- (*RExC_parse == ':' ||
- *RExC_parse == '=' ||
- *RExC_parse == '.')) {
- char c = *RExC_parse;
+ POSIXCC(UCHARAT(RExC_parse))) {
+ char c = UCHARAT(RExC_parse);
char* s = RExC_parse++;
- while (RExC_parse < RExC_end && *RExC_parse != c)
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
RExC_parse++;
if (RExC_parse == RExC_end)
/* Grandfather lone [:, [=, [. */
else {
char* t = RExC_parse++; /* skip over the c */
- if (*RExC_parse == ']') {
+ if (UCHARAT(RExC_parse) == ']') {
RExC_parse++; /* skip over the ending ] */
posixcc = s + 1;
if (*s == ':') {
/* adjust RExC_parse so the warning shows after
the class closes */
- while (*RExC_parse && *RExC_parse != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
RExC_parse++;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
- (*RExC_parse == ':' ||
- *RExC_parse == '=' ||
- *RExC_parse == '.')) {
+ POSIXCC(UCHARAT(RExC_parse))) {
char *s = RExC_parse;
char c = *s++;
vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
- if (c == '=' || c == '.')
- {
+ if (POSIXCC_NOTYET(c)) {
/* adjust RExC_parse so the error shows after
the class closes */
- while (*RExC_parse && *RExC_parse++ != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
register UV value;
+ register UV nextvalue;
register IV prevvalue = OOB_UNICODE;
register IV range = 0;
register regnode *ret;
if (!SIZE_ONLY)
ANYOF_FLAGS(ret) = 0;
- if (*RExC_parse == '^') { /* Complement of range. */
+ if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
RExC_parse++;
if (!SIZE_ONLY)
listsv = newSVpvn("# comment\n", 10);
}
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+
+ if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
checkposixcc(pRExC_state);
- if (*RExC_parse == ']' || *RExC_parse == '-')
+ if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
goto charclassloop; /* allow 1st char to be ] or - */
- while (RExC_parse < RExC_end && *RExC_parse != ']') {
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
charclassloop:
}
else
value = UCHARAT(RExC_parse++);
- if (value == '[')
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+ if (value == '[' && POSIXCC(nextvalue))
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
if (UTF) {
if (k == EXACT) {
SV *dsv = sv_2mortal(newSVpvn("", 0));
- bool do_utf8 = PL_reg_match_utf8;
+ bool do_utf8 = DO_UTF8(sv);
char *s = do_utf8 ?
pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
STRING(o);