Integrate change #9530 from maintperl into mainline.
[p5sagit/p5-mst-13.2.git] / regcomp.c
index 05a48d9..85f0e45 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1599,7 +1599,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     else
         RExC_utf8 = 0;
 
-    RExC_precomp = savepvn(exp, xend - exp);
+    RExC_precomp = exp;
     DEBUG_r(if (!PL_colorset) reginitcolors());
     DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
                      PL_colors[4],PL_colors[5],PL_colors[0],
@@ -1625,7 +1625,6 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     REGC((U8)REG_MAGIC, (char*)RExC_emit);
 #endif
     if (reg(pRExC_state, 0, &flags) == NULL) {
-       Safefree(RExC_precomp);
        RExC_precomp = Nullch;
        return(NULL);
     }
@@ -1652,7 +1651,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
 #endif
     r->refcnt = 1;
     r->prelen = xend - exp;
-    r->precomp = RExC_precomp;
+    r->precomp = savepvn(RExC_precomp, r->prelen);
     r->subbeg = NULL;
     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
     r->nparens = RExC_npar - 1;        /* set early to validate backrefs */
@@ -2902,7 +2901,7 @@ tryagain:
                default:
                  normal_default:
                    if (UTF8_IS_START(*p) && UTF) {
-                       ender = utf8n_to_uvuni((U8*)p, RExC_end - p,
+                       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
                                               &numlen, 0);
                        p += numlen;
                    }
@@ -2914,16 +2913,14 @@ tryagain:
                    p = regwhite(p, RExC_end);
                if (UTF && FOLD) {
                    if (LOC)
-                       ender = toLOWER_LC_uvchr(UNI_TO_NATIVE(ender));
+                       ender = toLOWER_LC_uvchr(ender);
                    else
                        ender = toLOWER_uni(ender);
                }
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
                        p = oldp;
-                   /* ender is a Unicode value so it can be > 0xff --
-                    * in other words, do not use UTF8_IS_CONTINUED(). */
-                   else if (ender >= 0x80 && UTF) {
+                   else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
                        reguni(pRExC_state, ender, s, &numlen);
                        s += numlen;
                        len += numlen;
@@ -2934,9 +2931,7 @@ tryagain:
                    }
                    break;
                }
-               /* ender is a Unicode value so it can be > 0xff --
-                * in other words, do not use UTF8_IS_CONTINUED(). */
-               if (ender >= 0x80 && UTF) {
+               if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
                    reguni(pRExC_state, ender, s, &numlen);
                    s += numlen;
                    len += numlen - 1;
@@ -3201,7 +3196,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
        if (!range)
            rangebegin = RExC_parse;
        if (UTF) {
-           value = utf8n_to_uvuni((U8*)RExC_parse,
+           value = utf8n_to_uvchr((U8*)RExC_parse,
                               RExC_end - RExC_parse,
                               &numlen, 0);
            RExC_parse += numlen;
@@ -3212,7 +3207,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
            namedclass = regpposixcc(pRExC_state, value);
        else if (value == '\\') {
            if (UTF) {
-               value = utf8n_to_uvuni((U8*)RExC_parse,
+               value = utf8n_to_uvchr((U8*)RExC_parse,
                                   RExC_end - RExC_parse,
                                   &numlen, 0);
                RExC_parse += numlen;
@@ -3320,8 +3315,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                    else {
                        ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
                        Perl_sv_catpvf(aTHX_ listsv,
-                                      /* 0x002D is Unicode for '-' */
-                                      "%04"UVxf"\n002D\n", (UV)lastvalue);
+                                      "%04"UVxf"\n%04"UVxf"\n", (UV)lastvalue, (UV) '-');
                    }
                }
 
@@ -3408,9 +3402,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                        for (value = 0; value < 128; value++)
                            ANYOF_BITMAP_SET(ret, value);
 #else  /* EBCDIC */
-                       for (value = 0; value < 256; value++)
-                           if (isASCII(value))
-                               ANYOF_BITMAP_SET(ret, value);
+                       for (value = 0; value < 256; value++) {
+                           if (PL_hints & HINT_RE_ASCIIR) {
+                               if (NATIVE_TO_ASCII(value) < 128)
+                                   ANYOF_BITMAP_SET(ret, value);
+                           }
+                           else {
+                               if (isASCII(value))
+                                   ANYOF_BITMAP_SET(ret, value);
+                           }
+                       }
 #endif /* EBCDIC */
                    }
                    dont_optimize_invert = TRUE;
@@ -3424,9 +3425,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                        for (value = 128; value < 256; value++)
                            ANYOF_BITMAP_SET(ret, value);
 #else  /* EBCDIC */
-                       for (value = 0; value < 256; value++)
-                           if (!isASCII(value))
-                               ANYOF_BITMAP_SET(ret, value);
+                       for (value = 0; value < 256; value++) {
+                           if (PL_hints & HINT_RE_ASCIIR) {
+                               if (NATIVE_TO_ASCII(value) >= 128)
+                                   ANYOF_BITMAP_SET(ret, value);
+                           }
+                           else {
+                               if (!isASCII(value))
+                                   ANYOF_BITMAP_SET(ret, value);
+                           }
+                       }
 #endif /* EBCDIC */
                    }
                    dont_optimize_invert = TRUE;
@@ -3687,7 +3695,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
        } /* end of namedclass \blah */
 
        if (range) {
-           if (lastvalue > value) /* b-a */ {
+           if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
+                ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
                Simple_vFAIL4("Invalid [] range \"%*.*s\"",
                              RExC_parse - rangebegin,
                              RExC_parse - rangebegin,
@@ -3721,7 +3730,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
        if (!SIZE_ONLY) {
            if (lastvalue < 256 && value < 256) {
 #ifdef EBCDIC /* EBCDIC, for example. */
-               if ((isLOWER(lastvalue) && isLOWER(value)) ||
+               if (PL_hints & HINT_RE_ASCIIR) {
+                   IV i;
+                   /* New style scheme for ranges:
+                    * after :
+                    * use re 'asciir';
+                    * do ranges in ASCII/Unicode space
+                    */
+                   for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++)
+                       ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
+               }
+               else if ((isLOWER(lastvalue) && isLOWER(value)) ||
                    (isUPPER(lastvalue) && isUPPER(value)))
                {
                    IV i;
@@ -3883,7 +3902,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
 STATIC void
 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
 {
-    *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvuni_to_utf8((U8*)s, uv) - (U8*)s);
+    *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
 }
 
 /*
@@ -4253,7 +4272,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
                    U8 s[UTF8_MAXLEN+1];
                
                    for (i = 0; i <= 256; i++) { /* just the first 256 */
-                       U8 *e = uvuni_to_utf8(s, i);
+                       U8 *e = uvchr_to_utf8(s, i);
                        
                        if (i < 256 && swash_fetch(sw, s)) {
                            if (rangestart == -1)
@@ -4263,14 +4282,14 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
                        
                            if (i <= rangestart + 3)
                                for (; rangestart < i; rangestart++) {
-                                   for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+                                   for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
                                        put_byte(sv, *p);
                                }
                            else {
-                               for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+                               for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
                                    put_byte(sv, *p);
                                sv_catpv(sv, "-");
-                                   for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++)
+                                   for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
                                        put_byte(sv, *p);
                                }
                                rangestart = -1;
@@ -4525,3 +4544,4 @@ clear_re(pTHXo_ void *r)
 {
     ReREFCNT_dec((regexp *)r);
 }
+