else
RExC_utf8 = 0;
- RExC_precomp = savepvn(exp, xend - exp);
+ RExC_precomp = exp;
DEBUG_r(if (!PL_colorset) reginitcolors());
DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
PL_colors[4],PL_colors[5],PL_colors[0],
REGC((U8)REG_MAGIC, (char*)RExC_emit);
#endif
if (reg(pRExC_state, 0, &flags) == NULL) {
- Safefree(RExC_precomp);
RExC_precomp = Nullch;
return(NULL);
}
#endif
r->refcnt = 1;
r->prelen = xend - exp;
- r->precomp = RExC_precomp;
+ r->precomp = savepvn(RExC_precomp, r->prelen);
r->subbeg = NULL;
r->reganch = pm->op_pmflags & PMf_COMPILETIME;
r->nparens = RExC_npar - 1; /* set early to validate backrefs */
default:
normal_default:
if (UTF8_IS_START(*p) && UTF) {
- ender = utf8n_to_uvuni((U8*)p, RExC_end - p,
+ ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
&numlen, 0);
p += numlen;
}
p = regwhite(p, RExC_end);
if (UTF && FOLD) {
if (LOC)
- ender = toLOWER_LC_uvchr(UNI_TO_NATIVE(ender));
+ ender = toLOWER_LC_uvchr(ender);
else
ender = toLOWER_uni(ender);
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- else if (ender >= 0x80 && UTF) {
+ else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
len += numlen;
}
break;
}
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- if (ender >= 0x80 && UTF) {
+ if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
reguni(pRExC_state, ender, s, &numlen);
s += numlen;
len += numlen - 1;
if (!range)
rangebegin = RExC_parse;
if (UTF) {
- value = utf8n_to_uvuni((U8*)RExC_parse,
+ value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, 0);
RExC_parse += numlen;
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
if (UTF) {
- value = utf8n_to_uvuni((U8*)RExC_parse,
+ value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, 0);
RExC_parse += numlen;
else {
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "%04"UVxf"\n002D\n", (UV)lastvalue);
+ "%04"UVxf"\n%04"UVxf"\n", (UV)lastvalue, (UV) '-');
}
}
for (value = 0; value < 128; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
- if (isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ for (value = 0; value < 256; value++) {
+ if (PL_hints & HINT_RE_ASCIIR) {
+ if (NATIVE_TO_ASCII(value) < 128)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ else {
+ if (isASCII(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ }
#endif /* EBCDIC */
}
dont_optimize_invert = TRUE;
for (value = 128; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
- if (!isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ for (value = 0; value < 256; value++) {
+ if (PL_hints & HINT_RE_ASCIIR) {
+ if (NATIVE_TO_ASCII(value) >= 128)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ else {
+ if (!isASCII(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ }
#endif /* EBCDIC */
}
dont_optimize_invert = TRUE;
} /* end of namedclass \blah */
if (range) {
- if (lastvalue > value) /* b-a */ {
+ if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
+ ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
RExC_parse - rangebegin,
RExC_parse - rangebegin,
if (!SIZE_ONLY) {
if (lastvalue < 256 && value < 256) {
#ifdef EBCDIC /* EBCDIC, for example. */
- if ((isLOWER(lastvalue) && isLOWER(value)) ||
+ if (PL_hints & HINT_RE_ASCIIR) {
+ IV i;
+ /* New style scheme for ranges:
+ * after :
+ * use re 'asciir';
+ * do ranges in ASCII/Unicode space
+ */
+ for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++)
+ ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
+ }
+ else if ((isLOWER(lastvalue) && isLOWER(value)) ||
(isUPPER(lastvalue) && isUPPER(value)))
{
IV i;
STATIC void
S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
{
- *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvuni_to_utf8((U8*)s, uv) - (U8*)s);
+ *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
}
/*
U8 s[UTF8_MAXLEN+1];
for (i = 0; i <= 256; i++) { /* just the first 256 */
- U8 *e = uvuni_to_utf8(s, i);
+ U8 *e = uvchr_to_utf8(s, i);
if (i < 256 && swash_fetch(sw, s)) {
if (rangestart == -1)
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
- for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+ for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
}
else {
- for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+ for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
put_byte(sv, *p);
sv_catpv(sv, "-");
- for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++)
+ for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
put_byte(sv, *p);
}
rangestart = -1;
{
ReREFCNT_dec((regexp *)r);
}
+