[REPATCH] Re: [PATCH] Re: socketpair blip on unicos/mk, too
[p5sagit/p5-mst-13.2.git] / regcomp.c
index b061991..e81bc0a 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2985,8 +2985,8 @@ tryagain:
            register char *p;
            char *oldp, *s;
            STRLEN numlen;
-           STRLEN ulen;
-           U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+           STRLEN foldlen;
+           U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
 
             parse_start = RExC_parse - 1;
 
@@ -3131,16 +3131,30 @@ tryagain:
                if (RExC_flags16 & PMf_EXTENDED)
                    p = regwhite(p, RExC_end);
                if (UTF && FOLD) {
-                   toFOLD_uni(ender, tmpbuf, &ulen);
-                   ender = utf8_to_uvchr(tmpbuf, 0);
+                   /* Prime the casefolded buffer. */
+                   ender = toFOLD_uni(ender, tmpbuf, &foldlen);
                }
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
                        p = oldp;
-                   else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                       reguni(pRExC_state, ender, s, &numlen);
-                       s += numlen;
-                       len += numlen;
+                   else if (UTF) {
+                        if (FOLD) {
+                             /* Emit all the Unicode characters. */
+                             for (foldbuf = tmpbuf;
+                                  foldlen;
+                                  foldlen -= numlen) {
+                                  ender = utf8_to_uvchr(foldbuf, &numlen);
+                                  reguni(pRExC_state, ender, s, &numlen);
+                                  s       += numlen;
+                                  len     += numlen;
+                                  foldbuf += numlen;
+                             }
+                        }
+                        else {
+                             reguni(pRExC_state, ender, s, &numlen);
+                             s   += numlen;
+                             len += numlen;
+                        }
                    }
                    else {
                        len++;
@@ -3148,10 +3162,25 @@ tryagain:
                    }
                    break;
                }
-               if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                   reguni(pRExC_state, ender, s, &numlen);
-                   s += numlen;
-                   len += numlen - 1;
+               if (UTF) {
+                    if (FOLD) {
+                         /* Emit all the Unicode characters. */
+                         for (foldbuf = tmpbuf;
+                              foldlen;
+                              foldlen -= numlen) {
+                              ender = utf8_to_uvchr(foldbuf, &numlen);
+                              reguni(pRExC_state, ender, s, &numlen);
+                              s       += numlen;
+                              len     += numlen;
+                              foldbuf += numlen;
+                         }
+                    }
+                    else {
+                         reguni(pRExC_state, ender, s, &numlen);
+                         s   += numlen;
+                         len += numlen;
+                    }
+                    len--;
                }
                else
                    REGC(ender, s++);
@@ -3180,6 +3209,8 @@ tryagain:
        break;
     }
 
+    /* If the encoding pragma is in effect recode the text of
+     * any EXACT-kind nodes. */
     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
         STRLEN oldlen = STR_LEN(ret);
         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
@@ -3396,7 +3427,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
     SV *listsv = Nullsv;
     register char *e;
     UV n;
-    bool optimize_invert = TRUE;
+    bool optimize_invert   = TRUE;
+    AV* unicode_alternate  = 0;
 
     ret = reganode(pRExC_state, ANYOF, 0);
 
@@ -3994,9 +4026,38 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                         to_utf8_fold(tmpbuf, foldbuf, &foldlen);
                         f = utf8_to_uvchr(foldbuf, 0);
 
-                        if (f != value)
-                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
+                        /* If folding and foldable and a single
+                         * character, insert also the folded version
+                         * to the charclass. */
+                        if (f != value) {
+                             if (foldlen == UNISKIP(f))
+                                 Perl_sv_catpvf(aTHX_ listsv,
+                                                "%04"UVxf"\n", f);
+                             else {
+                                 /* Any multicharacter foldings
+                                  * require the following transform:
+                                  * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
+                                  * where E folds into "pq" and F folds
+                                  * into "rst", all other characters
+                                  * fold to single characters. */
+                                 SV *sv;
+
+                                 if (!unicode_alternate)
+                                     unicode_alternate = newAV();
+                                 sv = newSVpvn((char*)foldbuf, foldlen);
+                                 SvUTF8_on(sv);
+                                 av_push(unicode_alternate, sv);
+                             }
+                        }
 
+                        /* If folding and the value is one of the Greek
+                         * sigmas insert a few more sigmas to make the
+                         * folding rules of the sigmas to work right.
+                         * Note that not all the possible combinations
+                         * are handled here: some of them are handled
+                         * by the standard folding rules, and some of
+                         * them (literal or EXACTF cases) are handled
+                         * during runtime in regexec.c:S_find_byclass(). */
                         if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
                              Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
                                             (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
@@ -4053,6 +4114,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 
        av_store(av, 0, listsv);
        av_store(av, 1, NULL);
+       av_store(av, 2, (SV*)unicode_alternate);
        rv = newRV_noinc((SV*)av);
        n = add_data(pRExC_state, 1, "s");
        RExC_rx->data->data[n] = (void*)rv;
@@ -4582,7 +4644,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 
        {
            SV *lv;
-           SV *sw = regclass_swash(o, FALSE, &lv);
+           SV *sw = regclass_swash(o, FALSE, &lv, 0);
        
            if (lv) {
                if (sw) {
@@ -4736,7 +4798,7 @@ Perl_pregfree(pTHX_ struct regexp *r)
                new_comppad = NULL;
                break;
            case 'n':
-               break;
+               break;
            default:
                Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
            }