Add mingw64 support
[p5sagit/p5-mst-13.2.git] / regexec.c
index 83d216b..06fe13a 100644 (file)
--- a/regexec.c
+++ b/regexec.c
 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 
+/* these are unrolled below in the CCC_TRY_XXX defined */
 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 #define LOAD_UTF8_CHARCLASS_MARK()  LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
 
+
+/* 
+   We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+   so that it is possible to override the option here without having to 
+   rebuild the entire core. as we are required to do if we change regcomp.h
+   which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
+#define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
+#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
+#define RE_utf8_perl_word   PL_utf8_alnum
+#define RE_utf8_perl_space  PL_utf8_space
+#define RE_utf8_posix_digit PL_utf8_digit
+#define perl_word  alnum
+#define perl_space space
+#define posix_digit digit
+#else
+#define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
+#define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
+#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
+#define RE_utf8_perl_word   PL_utf8_perl_word
+#define RE_utf8_perl_space  PL_utf8_perl_space
+#define RE_utf8_posix_digit PL_utf8_posix_digit
+#endif
+
+
+#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
+        case NAMEL:                                                              \
+            PL_reg_flags |= RF_tainted;                                                 \
+            /* FALL THROUGH */                                                          \
+        case NAME:                                                                     \
+            if (!nextchr)                                                               \
+                sayNO;                                                                  \
+            if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
+                if (!CAT2(PL_utf8_,CLASS)) {                                            \
+                    bool ok;                                                            \
+                    ENTER;                                                              \
+                    save_re_context();                                                  \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
+                    assert(ok);                                                         \
+                    LEAVE;                                                              \
+                }                                                                       \
+                if (!(OP(scan) == NAME                                                  \
+                    ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)   \
+                    : LCFUNC_utf8((U8*)locinput)))                                      \
+                {                                                                       \
+                    sayNO;                                                              \
+                }                                                                       \
+                locinput += PL_utf8skip[nextchr];                                       \
+                nextchr = UCHARAT(locinput);                                            \
+                break;                                                                  \
+            }                                                                           \
+            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
+                sayNO;                                                                  \
+            nextchr = UCHARAT(++locinput);                                              \
+            break
+
+#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
+        case NAMEL:                                                              \
+            PL_reg_flags |= RF_tainted;                                                 \
+            /* FALL THROUGH */                                                          \
+        case NAME :                                                                     \
+            if (!nextchr && locinput >= PL_regeol)                                      \
+                sayNO;                                                                  \
+            if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
+                if (!CAT2(PL_utf8_,CLASS)) {                                            \
+                    bool ok;                                                            \
+                    ENTER;                                                              \
+                    save_re_context();                                                  \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
+                    assert(ok);                                                         \
+                    LEAVE;                                                              \
+                }                                                                       \
+                if ((OP(scan) == NAME                                                  \
+                    ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)    \
+                    : LCFUNC_utf8((U8*)locinput)))                                      \
+                {                                                                       \
+                    sayNO;                                                              \
+                }                                                                       \
+                locinput += PL_utf8skip[nextchr];                                       \
+                nextchr = UCHARAT(locinput);                                            \
+                break;                                                                  \
+            }                                                                           \
+            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
+                sayNO;                                                                  \
+            nextchr = UCHARAT(++locinput);                                              \
+            break
+
+
+
+
+
 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 
 /* for use after a quantifier and before an EXACT-like node -- japhy */
@@ -863,9 +961,9 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
        {
            /* If flags & SOMETHING - do not do it many times on the same match */
            DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
+           /* XXX Does the destruction order has to change with do_utf8? */
            SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
-           if (do_utf8 ? prog->check_substr : prog->check_utf8)
-               SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
+           SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
            prog->check_substr = prog->check_utf8 = NULL;       /* disable */
            prog->float_substr = prog->float_utf8 = NULL;       /* clear */
            check = NULL;                       /* abort */
@@ -1007,16 +1105,15 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 
 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
-    UV uvc_unfolded = 0;                                                   \
     switch (trie_type) {                                                    \
     case trie_utf8_fold:                                                    \
        if ( foldlen>0 ) {                                                  \
-           uvc_unfolded = uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
+           uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
            foldlen -= len;                                                 \
            uscan += len;                                                   \
            len=0;                                                          \
        } else {                                                            \
-           uvc_unfolded = uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
+           uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
            uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
            foldlen -= UNISKIP( uvc );                                      \
            uscan = foldbuf + UNISKIP( uvc );                               \
@@ -1042,7 +1139,6 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
        uvc = (UV)*uc;                                                      \
        len = 1;                                                            \
     }                                                                       \
-                                                                           \
     if (uvc < 256) {                                                        \
        charid = trie->charmap[ uvc ];                                      \
     }                                                                       \
@@ -1055,9 +1151,6 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
                charid = (U16)SvIV(*svpp);                                  \
        }                                                                   \
     }                                                                       \
-    if (!charid && trie_type == trie_utf8_fold && !UTF) {                  \
-       charid = trie->charmap[uvc_unfolded];                               \
-    }                                                                      \
 } STMT_END
 
 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
@@ -1426,8 +1519,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            break;
        case ALNUM:
            REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_ALNUM(),
-               swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
+               LOAD_UTF8_CHARCLASS_PERL_WORD(),
+               swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
                isALNUM(*s)
            );
        case ALNUML:
@@ -1437,8 +1530,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
        case NALNUM:
            REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_ALNUM(),
-               !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
+               LOAD_UTF8_CHARCLASS_PERL_WORD(),
+               !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
                !isALNUM(*s)
            );
        case NALNUML:
@@ -1448,8 +1541,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
        case SPACE:
            REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_SPACE(),
-               *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
+               LOAD_UTF8_CHARCLASS_PERL_SPACE(),
+               *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
                isSPACE(*s)
            );
        case SPACEL:
@@ -1459,8 +1552,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
        case NSPACE:
            REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_SPACE(),
-               !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
+               LOAD_UTF8_CHARCLASS_PERL_SPACE(),
+               !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
                !isSPACE(*s)
            );
        case NSPACEL:
@@ -1470,8 +1563,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
        case DIGIT:
            REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_DIGIT(),
-               swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
+               LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
+               swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
                isDIGIT(*s)
            );
        case DIGITL:
@@ -1481,8 +1574,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
        case NDIGIT:
            REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_DIGIT(),
-               !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
+               LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
+               !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
                !isDIGIT(*s)
            );
        case NDIGITL:
@@ -3344,85 +3437,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            nextchr = UCHARAT(locinput);
            break;
            }
-       case ANYOF:
-           if (do_utf8) {
-               STRLEN inclasslen = PL_regeol - locinput;
-
-               if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
-                   goto anyof_fail;
-               if (locinput >= PL_regeol)
-                   sayNO;
-               locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
-               nextchr = UCHARAT(locinput);
-               break;
-           }
-           else {
-               if (nextchr < 0)
-                   nextchr = UCHARAT(locinput);
-               if (!REGINCLASS(rex, scan, (U8*)locinput))
-                   goto anyof_fail;
-               if (!nextchr && locinput >= PL_regeol)
-                   sayNO;
-               nextchr = UCHARAT(++locinput);
-               break;
-           }
-       anyof_fail:
-           /* If we might have the case of the German sharp s
-            * in a casefolding Unicode character class. */
-
-           if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
-                locinput += SHARP_S_SKIP;
-                nextchr = UCHARAT(locinput);
-           }
-           else
-                sayNO;
-           break;
-       case ALNUML:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case ALNUM:
-           if (!nextchr)
-               sayNO;
-           if (do_utf8) {
-               LOAD_UTF8_CHARCLASS_ALNUM();
-               if (!(OP(scan) == ALNUM
-                     ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
-                     : isALNUM_LC_utf8((U8*)locinput)))
-               {
-                   sayNO;
-               }
-               locinput += PL_utf8skip[nextchr];
-               nextchr = UCHARAT(locinput);
-               break;
-           }
-           if (!(OP(scan) == ALNUM
-                 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NALNUML:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NALNUM:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           if (do_utf8) {
-               LOAD_UTF8_CHARCLASS_ALNUM();
-               if (OP(scan) == NALNUM
-                   ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
-                   : isALNUM_LC_utf8((U8*)locinput))
-               {
-                   sayNO;
-               }
-               locinput += PL_utf8skip[nextchr];
-               nextchr = UCHARAT(locinput);
-               break;
-           }
-           if (OP(scan) == NALNUM
-               ? isALNUM(nextchr) : isALNUM_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
        case BOUNDL:
        case NBOUNDL:
            PL_reg_flags |= RF_tainted;
@@ -3435,7 +3449,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                    ln = '\n';
                else {
                    const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
-               
+
                    ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
                }
                if (OP(scan) == BOUND || OP(scan) == NBOUND) {
@@ -3464,100 +3478,49 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                                    OP(scan) == BOUNDL))
                    sayNO;
            break;
-       case SPACEL:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case SPACE:
-           if (!nextchr)
-               sayNO;
-           if (do_utf8) {
-               if (UTF8_IS_CONTINUED(nextchr)) {
-                   LOAD_UTF8_CHARCLASS_SPACE();
-                   if (!(OP(scan) == SPACE
-                         ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
-                         : isSPACE_LC_utf8((U8*)locinput)))
-                   {
-                       sayNO;
-                   }
-                   locinput += PL_utf8skip[nextchr];
-                   nextchr = UCHARAT(locinput);
-                   break;
-               }
-           }
-           if (!(OP(scan) == SPACE
-                 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NSPACEL:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NSPACE:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
+       case ANYOF:
            if (do_utf8) {
-               LOAD_UTF8_CHARCLASS_SPACE();
-               if (OP(scan) == NSPACE
-                   ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
-                   : isSPACE_LC_utf8((U8*)locinput))
-               {
+               STRLEN inclasslen = PL_regeol - locinput;
+
+               if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
+                   goto anyof_fail;
+               if (locinput >= PL_regeol)
                    sayNO;
-               }
-               locinput += PL_utf8skip[nextchr];
+               locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
                nextchr = UCHARAT(locinput);
                break;
            }
-           if (OP(scan) == NSPACE
-               ? isSPACE(nextchr) : isSPACE_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case DIGITL:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case DIGIT:
-           if (!nextchr)
-               sayNO;
-           if (do_utf8) {
-               LOAD_UTF8_CHARCLASS_DIGIT();
-               if (!(OP(scan) == DIGIT
-                     ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
-                     : isDIGIT_LC_utf8((U8*)locinput)))
-               {
+           else {
+               if (nextchr < 0)
+                   nextchr = UCHARAT(locinput);
+               if (!REGINCLASS(rex, scan, (U8*)locinput))
+                   goto anyof_fail;
+               if (!nextchr && locinput >= PL_regeol)
                    sayNO;
-               }
-               locinput += PL_utf8skip[nextchr];
-               nextchr = UCHARAT(locinput);
+               nextchr = UCHARAT(++locinput);
                break;
            }
-           if (!(OP(scan) == DIGIT
-                 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NDIGITL:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NDIGIT:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           if (do_utf8) {
-               LOAD_UTF8_CHARCLASS_DIGIT();
-               if (OP(scan) == NDIGIT
-                   ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
-                   : isDIGIT_LC_utf8((U8*)locinput))
-               {
-                   sayNO;
-               }
-               locinput += PL_utf8skip[nextchr];
-               nextchr = UCHARAT(locinput);
-               break;
+       anyof_fail:
+           /* If we might have the case of the German sharp s
+            * in a casefolding Unicode character class. */
+
+           if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
+                locinput += SHARP_S_SKIP;
+                nextchr = UCHARAT(locinput);
            }
-           if (OP(scan) == NDIGIT
-               ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
+           else
+                sayNO;
            break;
+       /* Special char classes - The defines start on line 129 or so */
+       CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
+       CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
+
+       CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
+       CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
+
+       CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
+       CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
+
        case CLUMP:
            if (locinput >= PL_regeol)
                sayNO;
@@ -3787,7 +3750,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                        assert(rx);
                    }
                    if (rx) {
-                       rx = reg_temp_copy(rx);
+                       rx = reg_temp_copy(NULL, rx);
                    }
                    else {
                        U32 pm_flags = 0;