if (UTF) {
U8 *s = (U8*)STRING(scan);
l = utf8_length(s, s + l);
- uc = utf8_to_uv_simple(s, NULL);
+ uc = utf8_to_uvchr(s, NULL);
}
min += l;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
if (UTF) {
U8 *s = (U8 *)STRING(scan);
l = utf8_length(s, s + l);
- uc = utf8_to_uv_simple(s, NULL);
+ uc = utf8_to_uvchr(s, NULL);
}
min += l;
if (data && (flags & SCF_DO_SUBSTR))
else
RExC_utf8 = 0;
- RExC_precomp = savepvn(exp, xend - exp);
+ RExC_precomp = exp;
DEBUG_r(if (!PL_colorset) reginitcolors());
DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
PL_colors[4],PL_colors[5],PL_colors[0],
REGC((U8)REG_MAGIC, (char*)RExC_emit);
#endif
if (reg(pRExC_state, 0, &flags) == NULL) {
- Safefree(RExC_precomp);
RExC_precomp = Nullch;
return(NULL);
}
#endif
r->refcnt = 1;
r->prelen = xend - exp;
- r->precomp = RExC_precomp;
+ r->precomp = savepvn(RExC_precomp, r->prelen);
r->subbeg = NULL;
r->reganch = pm->op_pmflags & PMf_COMPILETIME;
r->nparens = RExC_npar - 1; /* set early to validate backrefs */
ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_mark)
- is_utf8_mark((U8*)"~"); /* preload table */
break;
case 'w':
ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'W':
ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'b':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'B':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 's':
ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_space)
- is_utf8_space((U8*)" "); /* preload table */
break;
case 'S':
ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_space)
- is_utf8_space((U8*)" "); /* preload table */
break;
case 'd':
ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_digit)
- is_utf8_digit((U8*)"1"); /* preload table */
break;
case 'D':
ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
- if (UTF && !PL_utf8_digit)
- is_utf8_digit((U8*)"1"); /* preload table */
break;
case 'p':
case 'P':
p++;
break;
case 'e':
-#ifdef ASCIIish
- ender = '\033';
-#else
- ender = '\047';
-#endif
+ ender = ASCII_TO_NATIVE('\033');
p++;
break;
case 'a':
-#ifdef ASCIIish
- ender = '\007';
-#else
- ender = '\057';
-#endif
+ ender = ASCII_TO_NATIVE('\007');
p++;
break;
case 'x':
else {
numlen = 1; /* allow underscores */
ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
+ if (ender > 0xff)
+ RExC_utf8 = 1;
/* numlen is generous */
if (numlen + len >= 127) {
p--;
default:
normal_default:
if (UTF8_IS_START(*p) && UTF) {
- ender = utf8_to_uv((U8*)p, RExC_end - p,
+ ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
&numlen, 0);
p += numlen;
}
p = regwhite(p, RExC_end);
if (UTF && FOLD) {
if (LOC)
- ender = toLOWER_LC_uni(ender);
+ ender = toLOWER_LC_uvchr(ender);
else
ender = toLOWER_uni(ender);
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- else if (ender >= 0x80 && UTF) {
+ else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
len += numlen;
}
break;
}
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- if (ender >= 0x80 && UTF) {
+ if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
len += numlen - 1;
register char *e;
UV n;
bool dont_optimize_invert = FALSE;
-#ifdef ALPHAS_HAVE_GAPS
- bool explicit_alpha = TRUE;
- bool explicit_alpha_prev = TRUE;
-#endif
ret = reganode(pRExC_state, ANYOF, 0);
if (!range)
rangebegin = RExC_parse;
if (UTF) {
- value = utf8_to_uv((U8*)RExC_parse,
+ value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, 0);
RExC_parse += numlen;
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
if (UTF) {
- value = utf8_to_uv((U8*)RExC_parse,
+ value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, 0);
RExC_parse += numlen;
case 't': value = '\t'; break;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
-#ifdef ASCIIish
- case 'e': value = '\033'; break;
- case 'a': value = '\007'; break;
-#else
- case 'e': value = '\047'; break;
- case 'a': value = '\057'; break;
-#endif
+ case 'e': value = ASCII_TO_NATIVE('\033');break;
+ case 'a': value = ASCII_TO_NATIVE('\007');break;
case 'x':
if (*RExC_parse == '{') {
e = strchr(RExC_parse++, '}');
else {
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "%04"UVxf"\n002D\n", (UV)lastvalue);
+ "%04"UVxf"\n%04"UVxf"\n", (UV)lastvalue, (UV) '-');
}
}
if (isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
break;
case ANYOF_NALNUM:
if (!isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
break;
case ANYOF_ALNUMC:
if (isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
break;
case ANYOF_NALNUMC:
if (!isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
break;
case ANYOF_ALPHA:
if (isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
break;
case ANYOF_NALPHA:
if (!isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
break;
case ANYOF_ASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ASCII);
else {
-#ifdef ALPHAS_HAVE_GAPS
+#ifndef EBCDIC
+ for (value = 0; value < 128; value++)
+ ANYOF_BITMAP_SET(ret, value);
+#else /* EBCDIC */
for (value = 0; value < 256; value++)
if (isASCII(value))
ANYOF_BITMAP_SET(ret, value);
-#else
- for (value = 0; value < 128; value++)
- ANYOF_BITMAP_SET(ret, value);
-#endif
+#endif /* EBCDIC */
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
break;
case ANYOF_NASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_NASCII);
else {
-#ifdef ALPHAS_HAVE_GAPS
+#ifndef EBCDIC
+ for (value = 128; value < 256; value++)
+ ANYOF_BITMAP_SET(ret, value);
+#else /* EBCDIC */
for (value = 0; value < 256; value++)
if (!isASCII(value))
ANYOF_BITMAP_SET(ret, value);
-#else
- for (value = 128; value < 256; value++)
- ANYOF_BITMAP_SET(ret, value);
-#endif
+#endif /* EBCDIC */
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
break;
case ANYOF_BLANK:
if (isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
break;
case ANYOF_NBLANK:
if (!isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
break;
case ANYOF_CNTRL:
if (isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
break;
case ANYOF_NCNTRL:
if (!isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
break;
case ANYOF_DIGIT:
for (value = '0'; value <= '9'; value++)
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
break;
case ANYOF_NDIGIT:
for (value = '9' + 1; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
break;
case ANYOF_GRAPH:
if (isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
break;
case ANYOF_NGRAPH:
if (!isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
break;
case ANYOF_LOWER:
if (isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
break;
case ANYOF_NLOWER:
if (!isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
break;
case ANYOF_PRINT:
if (isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
break;
case ANYOF_NPRINT:
if (!isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
break;
case ANYOF_PSXSPC:
if (isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
break;
case ANYOF_NPSXSPC:
if (!isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
break;
case ANYOF_PUNCT:
if (isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
break;
case ANYOF_NPUNCT:
if (!isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
break;
case ANYOF_SPACE:
if (isSPACE(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
break;
case ANYOF_NSPACE:
if (!isSPACE(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
break;
case ANYOF_UPPER:
if (isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
break;
case ANYOF_NUPPER:
if (!isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
break;
case ANYOF_XDIGIT:
if (isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
break;
case ANYOF_NXDIGIT:
if (!isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
break;
default:
}
if (LOC)
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
- dont_optimize_invert = TRUE;
continue;
}
} /* end of namedclass \blah */
}
else {
lastvalue = value; /* save the beginning of the range */
-#ifdef ALPHAS_HAVE_GAPS
- explicit_alpha_prev = explicit_alpha;
- explicit_alpha = isALPHA(value);
-#endif
if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
RExC_parse[1] != ']') {
RExC_parse++;
/* now is the next time */
if (!SIZE_ONLY) {
if (lastvalue < 256 && value < 256) {
-#ifdef ALPHAS_HAVE_GAPS
- /* In EBCDIC the letters are not an unbroken range
- * numerically, there's are gaps between i-j, r-s,
- * I-J, R-S. We DWIM that if the endpoints of the
- * range are specified as explicitly alphabetic,
- * an alphabetic range is requested, otherwise
- * (the else branch) (say, explicit numeric endpoints
- * like \xHH are used) we do a straightforward
- * numeric range. */
- if (explicit_alpha_prev && explicit_alpha &&
- ((isLOWER(lastvalue) && isLOWER(value)) ||
- ((isUPPER(lastvalue) && isUPPER(value)))))
+#ifdef EBCDIC /* EBCDIC, for example. */
+ if ((isLOWER(lastvalue) && isLOWER(value)) ||
+ (isUPPER(lastvalue) && isUPPER(value)))
{
IV i;
if (isLOWER(lastvalue)) {
STATIC void
S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
{
- *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s);
+ *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
}
/*
U8 s[UTF8_MAXLEN+1];
for (i = 0; i <= 256; i++) { /* just the first 256 */
- U8 *e = uv_to_utf8(s, i);
+ U8 *e = uvchr_to_utf8(s, i);
if (i < 256 && swash_fetch(sw, s)) {
if (rangestart == -1)
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
- for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+ for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
}
else {
- for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+ for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
sv_catpv(sv, "-");
- for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
+ for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
put_byte(sv, *p);
}
rangestart = -1;