[ PATCH] Cygwin: one down one to go
[p5sagit/p5-mst-13.2.git] / regcomp.c
index b5f7164..cac14bf 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -522,11 +522,8 @@ S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
 STATIC void
 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 {
-    int value;
-
     ANYOF_CLASS_ZERO(cl);
-    for (value = 0; value < 256; ++value)
-       ANYOF_BITMAP_SET(cl, value);
+    ANYOF_BITMAP_SETALL(cl);
     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
     if (LOC)
        cl->flags |= ANYOF_LOCALE;
@@ -543,9 +540,8 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
            return 1;
     if (!(cl->flags & ANYOF_UNICODE_ALL))
        return 0;
-    for (value = 0; value < 256; ++value)
-       if (!ANYOF_BITMAP_TEST(cl, value))
-           return 0;
+    if (!ANYOF_BITMAP_TESTALLSET(cl))
+       return 0;
     return 1;
 }
 
@@ -662,6 +658,17 @@ S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, str
     }
 }
 
+/*
+ * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
+ * These need to be revisited when a newer toolchain becomes available.
+ */
+#if defined(__sparc64__) && defined(__GNUC__)
+#   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
+#       undef  SPARC64_GCC_WORKAROUND
+#       define SPARC64_GCC_WORKAROUND 1
+#   endif
+#endif
+
 /* REx optimizer.  Converts nodes into quickier variants "in place".
    Finds fixed substrings.  */
 
@@ -1207,11 +1214,28 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                    int counted = mincount != 0;
 
                    if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
+#if defined(SPARC64_GCC_WORKAROUND)
+                       I32 b = 0;
+                       STRLEN l = 0;
+                       char *s = NULL;
+                       I32 old = 0;
+
+                       if (pos_before >= data->last_start_min)
+                           b = pos_before;
+                       else
+                           b = data->last_start_min;
+
+                       l = 0;
+                       s = SvPV(data->last_found, l);
+                       old = b - data->last_start_min;
+
+#else
                        I32 b = pos_before >= data->last_start_min
                            ? pos_before : data->last_start_min;
                        STRLEN l;
                        char *s = SvPV(data->last_found, l);
                        I32 old = b - data->last_start_min;
+#endif
 
                        if (UTF)
                            old = utf8_hop((U8*)s, old) - (U8*)s;
@@ -1666,17 +1690,15 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     if (exp == NULL)
        FAIL("NULL regexp argument");
 
-    /* XXXX This looks very suspicious... */
-    if (pm->op_pmdynflags & PMdf_CMP_UTF8)
-        RExC_utf8 = 1;
-    else
-        RExC_utf8 = 0;
+    RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
 
     RExC_precomp = exp;
-    DEBUG_r(if (!PL_colorset) reginitcolors());
-    DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
-                     PL_colors[4],PL_colors[5],PL_colors[0],
-                     (int)(xend - exp), RExC_precomp, PL_colors[1]));
+    DEBUG_r({
+        if (!PL_colorset) reginitcolors();
+        PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+                      PL_colors[4],PL_colors[5],PL_colors[0],
+                      (int)(xend - exp), RExC_precomp, PL_colors[1]);
+    });
     RExC_flags16 = pm->op_pmflags;
     RExC_sawback = 0;
 
@@ -1764,7 +1786,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
     pm->op_pmflags = RExC_flags16;
     if (UTF)
-       r->reganch |= ROPT_UTF8;
+        r->reganch |= ROPT_UTF8;       /* Unicode in it? */
     r->regstclass = NULL;
     if (RExC_naughty >= 10)    /* Probably an expensive pattern. */
        r->reganch |= ROPT_NAUGHTY;
@@ -2795,6 +2817,7 @@ tryagain:
        case 'Z':
            ret = reg_node(pRExC_state, SEOL);
            *flagp |= SIMPLE;
+           RExC_seen_zerolen++;                /* Do not optimize RE away */
            nextchar(pRExC_state);
            break;
        case 'z':
@@ -2963,7 +2986,8 @@ tryagain:
            char *oldp, *s;
            STRLEN numlen;
            STRLEN ulen;
-           U8 tmpbuf[UTF8_MAXLEN*2+1];
+           STRLEN foldlen;
+           U8 tmpbuf[UTF8_MAXLEN_UCLC+1], *foldbuf;
 
             parse_start = RExC_parse - 1;
 
@@ -2994,6 +3018,8 @@ tryagain:
                case '\\':
                    switch (*++p) {
                    case 'A':
+                   case 'C':
+                   case 'X':
                    case 'G':
                    case 'Z':
                    case 'z':
@@ -3105,17 +3131,28 @@ tryagain:
                }
                if (RExC_flags16 & PMf_EXTENDED)
                    p = regwhite(p, RExC_end);
-               if (UTF && FOLD) {
-                   toLOWER_uni(ender, tmpbuf, &ulen);
-                   ender = utf8_to_uvchr(tmpbuf, 0);
-               }
+               if (UTF && FOLD)
+                   toFOLD_uni(ender, tmpbuf, &foldlen);
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
                        p = oldp;
                    else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                       reguni(pRExC_state, ender, s, &numlen);
-                       s += numlen;
-                       len += numlen;
+                        if (FOLD) {
+                             for (foldbuf = tmpbuf;
+                                  foldlen;
+                                  foldlen -= numlen) {
+                                  ender = utf8_to_uvchr(foldbuf, &numlen);
+                                  reguni(pRExC_state, ender, s, &numlen);
+                                  s       += numlen;
+                                  len     += numlen;
+                                  foldbuf += numlen;
+                             }
+                        }
+                        else {
+                             reguni(pRExC_state, ender, s, &numlen);
+                             s   += numlen;
+                             len += numlen;
+                        }
                    }
                    else {
                        len++;
@@ -3124,9 +3161,23 @@ tryagain:
                    break;
                }
                if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                   reguni(pRExC_state, ender, s, &numlen);
-                   s += numlen;
-                   len += numlen - 1;
+                    if (FOLD) {
+                         for (foldbuf = tmpbuf;
+                              foldlen;
+                              foldlen -= numlen) {
+                              ender = utf8_to_uvchr(foldbuf, &numlen);
+                              reguni(pRExC_state, ender, s, &numlen);
+                              s       += numlen;
+                              len     += numlen;
+                              foldbuf += numlen;
+                         }
+                    }
+                    else {
+                         reguni(pRExC_state, ender, s, &numlen);
+                         s   += numlen;
+                         len += numlen;
+                    }
+                    len--;
                }
                else
                    REGC(ender, s++);
@@ -3155,6 +3206,28 @@ tryagain:
        break;
     }
 
+    if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
+        STRLEN oldlen = STR_LEN(ret);
+        SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
+
+        if (RExC_utf8)
+             SvUTF8_on(sv);
+        if (sv_utf8_downgrade(sv, TRUE)) {
+             char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+             STRLEN newlen = SvCUR(sv);
+        
+             if (!SIZE_ONLY) {
+                  DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+                                        (int)oldlen, STRING(ret),
+                                        (int)newlen, s));
+                  Copy(s, STRING(ret), newlen, char);
+                  STR_LEN(ret) += newlen - oldlen;
+                  RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+             } else
+                  RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+        }
+    }
+
     return(ret);
 }
 
@@ -3179,7 +3252,12 @@ S_regwhite(pTHX_ char *p, char *e)
    Character classes ([:foo:]) can also be negated ([:^foo:]).
    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
-   but trigger warnings because they are currently unimplemented. */
+   but trigger failures because they are currently unimplemented. */
+
+#define POSIXCC_DONE(c)   ((c) == ':')
+#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
+#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
+
 STATIC I32
 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 {
@@ -3188,13 +3266,11 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 
     if (value == '[' && RExC_parse + 1 < RExC_end &&
        /* I smell either [: or [= or [. -- POSIX has been here, right? */
-       (*RExC_parse == ':' ||
-        *RExC_parse == '=' ||
-        *RExC_parse == '.')) {
-       char  c = *RExC_parse;
+       POSIXCC(UCHARAT(RExC_parse))) {
+       char  c = UCHARAT(RExC_parse);
        char* s = RExC_parse++;
        
-       while (RExC_parse < RExC_end && *RExC_parse != c)
+       while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
            RExC_parse++;
        if (RExC_parse == RExC_end)
            /* Grandfather lone [:, [=, [. */
@@ -3202,7 +3278,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
        else {
            char* t = RExC_parse++; /* skip over the c */
 
-           if (*RExC_parse == ']') {
+           if (UCHARAT(RExC_parse) == ']') {
                RExC_parse++; /* skip over the ending ] */
                posixcc = s + 1;
                if (*s == ':') {
@@ -3291,7 +3367,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 
                    /* adjust RExC_parse so the warning shows after
                       the class closes */
-                   while (*RExC_parse && *RExC_parse != ']')
+                   while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
                        RExC_parse++;
                    Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
                }
@@ -3310,9 +3386,7 @@ STATIC void
 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 {
     if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
-       (*RExC_parse == ':' ||
-        *RExC_parse == '=' ||
-        *RExC_parse == '.')) {
+       POSIXCC(UCHARAT(RExC_parse))) {
        char *s = RExC_parse;
        char  c = *s++;
 
@@ -3322,11 +3396,10 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
            vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
 
            /* [[=foo=]] and [[.foo.]] are still future. */
-           if (c == '=' || c == '.')
-           {
+           if (POSIXCC_NOTYET(c)) {
                /* adjust RExC_parse so the error shows after
                   the class closes */
-               while (*RExC_parse && *RExC_parse++ != ']')
+               while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
                    ;
                Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
            }
@@ -3338,6 +3411,7 @@ STATIC regnode *
 S_regclass(pTHX_ RExC_state_t *pRExC_state)
 {
     register UV value;
+    register UV nextvalue;
     register IV prevvalue = OOB_UNICODE;
     register IV range = 0;
     register regnode *ret;
@@ -3355,7 +3429,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
     if (!SIZE_ONLY)
        ANYOF_FLAGS(ret) = 0;
 
-    if (*RExC_parse == '^') {  /* Complement of range. */
+    if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
        RExC_naughty++;
        RExC_parse++;
        if (!SIZE_ONLY)
@@ -3374,13 +3448,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
        listsv = newSVpvn("# comment\n", 10);
     }
 
-    if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
+    nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+
+    if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
        checkposixcc(pRExC_state);
 
-    if (*RExC_parse == ']' || *RExC_parse == '-')
+    if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
        goto charclassloop;             /* allow 1st char to be ] or - */
 
-    while (RExC_parse < RExC_end && *RExC_parse != ']') {
+    while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
 
     charclassloop:
 
@@ -3396,7 +3472,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
        }
        else
            value = UCHARAT(RExC_parse++);
-       if (value == '[')
+       nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+       if (value == '[' && POSIXCC(nextvalue))
            namedclass = regpposixcc(pRExC_state, value);
        else if (value == '\\') {
            if (UTF) {
@@ -3922,17 +3999,41 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                }
                else
 #endif
-                   for (i = prevvalue; i <= ceilvalue; i++)
-                       ANYOF_BITMAP_SET(ret, i);
+                     for (i = prevvalue; i <= ceilvalue; i++)
+                         ANYOF_BITMAP_SET(ret, i);
          }
-         if (value > 255) {
+         if (value > 255 || UTF) {
                ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
                if (prevvalue < value)
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
                                   (UV)prevvalue, (UV)value);
-               else if (prevvalue == value)
+               else if (prevvalue == value) {
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
                                   (UV)value);
+                   if (FOLD) {
+                        U8 tmpbuf [UTF8_MAXLEN+1];
+                        U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+                        STRLEN foldlen;
+                        UV f;
+
+                        uvchr_to_utf8(tmpbuf, value);
+                        to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+                        f = utf8_to_uvchr(foldbuf, 0);
+
+                        if (f != value)
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
+
+                        if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+                        }
+                        else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+                   }
+               }
            }
         }
 
@@ -4411,7 +4512,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 
     if (k == EXACT) {
         SV *dsv = sv_2mortal(newSVpvn("", 0));
-       bool do_utf8 = PL_reg_match_utf8;
+       bool do_utf8 = DO_UTF8(sv);
        char *s    = do_utf8 ?
          pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
          STRING(o);