Add U+0085, U+2028, and U+2029 to \s under Unicode.
[p5sagit/p5-mst-13.2.git] / regcomp.c
index 463b778..aacae22 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2985,8 +2985,8 @@ tryagain:
            register char *p;
            char *oldp, *s;
            STRLEN numlen;
-           STRLEN ulen;
-           U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+           STRLEN foldlen;
+           U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
 
             parse_start = RExC_parse - 1;
 
@@ -3017,6 +3017,8 @@ tryagain:
                case '\\':
                    switch (*++p) {
                    case 'A':
+                   case 'C':
+                   case 'X':
                    case 'G':
                    case 'Z':
                    case 'z':
@@ -3129,16 +3131,30 @@ tryagain:
                if (RExC_flags16 & PMf_EXTENDED)
                    p = regwhite(p, RExC_end);
                if (UTF && FOLD) {
-                   toLOWER_uni(ender, tmpbuf, &ulen);
-                   ender = utf8_to_uvchr(tmpbuf, 0);
+                   /* Prime the casefolded buffer. */
+                   ender = toFOLD_uni(ender, tmpbuf, &foldlen);
                }
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
                        p = oldp;
-                   else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                       reguni(pRExC_state, ender, s, &numlen);
-                       s += numlen;
-                       len += numlen;
+                   else if (UTF) {
+                        if (FOLD) {
+                             /* Emit all the Unicode characters. */
+                             for (foldbuf = tmpbuf;
+                                  foldlen;
+                                  foldlen -= numlen) {
+                                  ender = utf8_to_uvchr(foldbuf, &numlen);
+                                  reguni(pRExC_state, ender, s, &numlen);
+                                  s       += numlen;
+                                  len     += numlen;
+                                  foldbuf += numlen;
+                             }
+                        }
+                        else {
+                             reguni(pRExC_state, ender, s, &numlen);
+                             s   += numlen;
+                             len += numlen;
+                        }
                    }
                    else {
                        len++;
@@ -3146,10 +3162,25 @@ tryagain:
                    }
                    break;
                }
-               if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                   reguni(pRExC_state, ender, s, &numlen);
-                   s += numlen;
-                   len += numlen - 1;
+               if (UTF) {
+                    if (FOLD) {
+                         /* Emit all the Unicode characters. */
+                         for (foldbuf = tmpbuf;
+                              foldlen;
+                              foldlen -= numlen) {
+                              ender = utf8_to_uvchr(foldbuf, &numlen);
+                              reguni(pRExC_state, ender, s, &numlen);
+                              s       += numlen;
+                              len     += numlen;
+                              foldbuf += numlen;
+                         }
+                    }
+                    else {
+                         reguni(pRExC_state, ender, s, &numlen);
+                         s   += numlen;
+                         len += numlen;
+                    }
+                    len--;
                }
                else
                    REGC(ender, s++);
@@ -3178,20 +3209,28 @@ tryagain:
        break;
     }
 
-    if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) {
+    /* If the encoding pragma is in effect recode the text of
+     * any EXACT-kind nodes. */
+    if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
         STRLEN oldlen = STR_LEN(ret);
         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
-        char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
-        STRLEN newlen = SvCUR(sv);
-        if (!SIZE_ONLY) {
-             DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
-                                   (int)oldlen, STRING(ret), (int)newlen, s));
-             Copy(s, STRING(ret), newlen, char);
-             STR_LEN(ret) += newlen - oldlen;
-             RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
-        } else
-             RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
-        RExC_utf8 = 1;
+
+        if (RExC_utf8)
+             SvUTF8_on(sv);
+        if (sv_utf8_downgrade(sv, TRUE)) {
+             char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+             STRLEN newlen = SvCUR(sv);
+        
+             if (!SIZE_ONLY) {
+                  DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+                                        (int)oldlen, STRING(ret),
+                                        (int)newlen, s));
+                  Copy(s, STRING(ret), newlen, char);
+                  STR_LEN(ret) += newlen - oldlen;
+                  RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+             } else
+                  RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+        }
     }
 
     return(ret);
@@ -3973,9 +4012,45 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                if (prevvalue < value)
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
                                   (UV)prevvalue, (UV)value);
-               else if (prevvalue == value)
+               else if (prevvalue == value) {
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
                                   (UV)value);
+                   if (FOLD) {
+                        U8 tmpbuf [UTF8_MAXLEN+1];
+                        U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+                        STRLEN foldlen;
+                        UV f;
+
+                        uvchr_to_utf8(tmpbuf, value);
+                        to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+                        f = utf8_to_uvchr(foldbuf, 0);
+
+                        /* If folding and foldable and a single
+                         * character, also insert the folded version
+                         * into the charclass. */
+                        if (f != value && foldlen == UNISKIP(f))
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
+
+                        /* If folding and the value is one of the Greek
+                         * sigmas, insert a few more sigmas so that the
+                         * folding rules for the sigmas work right.
+                         * Note that not all the possible combinations
+                         * are handled here: some of them are handled
+                         * by the standard folding rules, and
+                         * some of them (literal or EXACTF cases) are
+                         * handled during runtime in
+                         * regexec.c:S_find_byclass(). */
+                        if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+                        }
+                        else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+                   }
+               }
            }
         }