regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
  98
  99 /*
 100  * Forwards.
 101  */
 102
 103 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
 104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 105
 106 #define HOPc(pos,off) \
 107         (char *)(PL_reg_match_utf8 \
 108             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 109             : (U8*)(pos + off))
 110 #define HOPBACKc(pos, off) \
 111         (char*)(PL_reg_match_utf8\
 112             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 113             : (pos - off >= PL_bostr)           \
 114                 ? (U8*)pos - off                \
 115                 : NULL)
 116
 117 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 118 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 119
 120 /* these are unrolled below in the CCC_TRY_XXX defined */
 121 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 122     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 123
 124 /* Doesn't do an assert to verify that is correct */
 125 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 127
 128 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 129 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 130 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 131
 132 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 133         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 134         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 135         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 136             * assert should likely and hopefully fail on an EBCDIC machine */ \
 137         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 138                                                                             \
 139         /* No asserts are done for these, in case called on an early        \
 140             * Unicode version in which they map to nothing */               \
 141         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 142         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 143         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 144         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 148
 149 /*
 150    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 151    so that it is possible to override the option here without having to
 152    rebuild the entire core. as we are required to do if we change regcomp.h
 153    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 154 */
 155 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 156 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 157 #endif
 158
 159 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 160 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 161 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 162 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 163 #define RE_utf8_perl_word   PL_utf8_alnum
 164 #define RE_utf8_perl_space  PL_utf8_space
 165 #define RE_utf8_posix_digit PL_utf8_digit
 166 #define perl_word  alnum
 167 #define perl_space space
 168 #define posix_digit digit
 169 #else
 170 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 171 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 172 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 173 #define RE_utf8_perl_word   PL_utf8_perl_word
 174 #define RE_utf8_perl_space  PL_utf8_perl_space
 175 #define RE_utf8_posix_digit PL_utf8_posix_digit
 176 #endif
 177
 178
 179 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
 180         case NAMEL:                                                              \
 181             PL_reg_flags |= RF_tainted;                                                 \
 182             /* FALL THROUGH */                                                          \
 183         case NAME:                                                                     \
 184             if (!nextchr)                                                               \
 185                 sayNO;                                                                  \
 186             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 187                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 188                     bool ok;                                                            \
 189                     ENTER;                                                              \
 190                     save_re_context();                                                  \
 191                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 192                     assert(ok);                                                         \
 193                     LEAVE;                                                              \
 194                 }                                                                       \
 195                 if (!(OP(scan) == NAME                                                  \
 196                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8))  \
 197                     : LCFUNC_utf8((U8*)locinput)))                                      \
 198                 {                                                                       \
 199                     sayNO;                                                              \
 200                 }                                                                       \
 201                 locinput += PL_utf8skip[nextchr];                                       \
 202                 nextchr = UCHARAT(locinput);                                            \
 203                 break;                                                                  \
 204             }                                                                           \
 205             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
 206                 sayNO;                                                                  \
 207             nextchr = UCHARAT(++locinput);                                              \
 208             break
 209
 210 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
 211         case NAMEL:                                                              \
 212             PL_reg_flags |= RF_tainted;                                                 \
 213             /* FALL THROUGH */                                                          \
 214         case NAME :                                                                     \
 215             if (!nextchr && locinput >= PL_regeol)                                      \
 216                 sayNO;                                                                  \
 217             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 218                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 219                     bool ok;                                                            \
 220                     ENTER;                                                              \
 221                     save_re_context();                                                  \
 222                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 223                     assert(ok);                                                         \
 224                     LEAVE;                                                              \
 225                 }                                                                       \
 226                 if ((OP(scan) == NAME                                                  \
 227                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8))  \
 228                     : LCFUNC_utf8((U8*)locinput)))                                      \
 229                 {                                                                       \
 230                     sayNO;                                                              \
 231                 }                                                                       \
 232                 locinput += PL_utf8skip[nextchr];                                       \
 233                 nextchr = UCHARAT(locinput);                                            \
 234                 break;                                                                  \
 235             }                                                                           \
 236             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
 237                 sayNO;                                                                  \
 238             nextchr = UCHARAT(++locinput);                                              \
 239             break
 240
 241
 242
 243
 244
 245 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 246
 247 /* for use after a quantifier and before an EXACT-like node -- japhy */
 248 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
 249 #define JUMPABLE(rn) (      \
 250     OP(rn) == OPEN ||       \
 251     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 252     OP(rn) == EVAL ||   \
 253     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 254     OP(rn) == PLUS || OP(rn) == MINMOD || \
 255     OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
 256     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 257 )
 258 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 259
 260 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 261
 262 #if 0
 263 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 264    we don't need this definition. */
 265 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 266 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 267 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 268
 269 #else
 270 /* ... so we use this as its faster. */
 271 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 272 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 273 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 274
 275 #endif
 276
 277 /*
 278   Search for mandatory following text node; for lookahead, the text must
 279   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 280 */
 281 #define FIND_NEXT_IMPT(rn) STMT_START { \
 282     while (JUMPABLE(rn)) { \
 283         const OPCODE type = OP(rn); \
 284         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 285             rn = NEXTOPER(NEXTOPER(rn)); \
 286         else if (type == PLUS) \
 287             rn = NEXTOPER(rn); \
 288         else if (type == IFMATCH) \
 289             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 290         else rn += NEXT_OFF(rn); \
 291     } \
 292 } STMT_END
 293
 294
 295 static void restore_pos(pTHX_ void *arg);
 296
 297 #define REGCP_PAREN_ELEMS 4
 298 #define REGCP_OTHER_ELEMS 5
 299 #define REGCP_FRAME_ELEMS 2
 300 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 301  * are needed for the regexp context stack bookkeeping. */
 302
 303 STATIC CHECKPOINT
 304 S_regcppush(pTHX_ I32 parenfloor)
 305 {
 306     dVAR;
 307     const int retval = PL_savestack_ix;
 308     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 309     const unsigned int total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 310     int p;
 311     GET_RE_DEBUG_FLAGS_DECL;
 312
 313     if (paren_elems_to_push < 0)
 314         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 315
 316     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 317
 318     for (p = PL_regsize; p > parenfloor; p--) {
 319 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 320         SSPUSHINT(PL_regoffs[p].end);
 321         SSPUSHINT(PL_regoffs[p].start);
 322         SSPUSHPTR(PL_reg_start_tmp[p]);
 323         SSPUSHINT(p);
 324         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 325           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 326                       (UV)p, (IV)PL_regoffs[p].start,
 327                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 328                       (IV)PL_regoffs[p].end
 329         ));
 330     }
 331 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 332     SSPUSHPTR(PL_regoffs);
 333     SSPUSHINT(PL_regsize);
 334     SSPUSHINT(*PL_reglastparen);
 335     SSPUSHINT(*PL_reglastcloseparen);
 336     SSPUSHPTR(PL_reginput);
 337     SSPUSHINT(total_elems);
 338     SSPUSHUV(SAVEt_REGCONTEXT); /* Magic cookie. */
 339
 340     return retval;
 341 }
 342
 343 /* These are needed since we do not localize EVAL nodes: */
 344 #define REGCP_SET(cp)                                           \
 345     DEBUG_STATE_r(                                              \
 346             PerlIO_printf(Perl_debug_log,                       \
 347                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 348                 (IV)PL_savestack_ix));                          \
 349     cp = PL_savestack_ix
 350
 351 #define REGCP_UNWIND(cp)                                        \
 352     DEBUG_STATE_r(                                              \
 353         if (cp != PL_savestack_ix)                              \
 354             PerlIO_printf(Perl_debug_log,                       \
 355                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 356                 (IV)(cp), (IV)PL_savestack_ix));                \
 357     regcpblow(cp)
 358
 359 STATIC char *
 360 S_regcppop(pTHX_ const regexp *rex)
 361 {
 362     dVAR;
 363     U32 i;
 364     char *input;
 365     GET_RE_DEBUG_FLAGS_DECL;
 366
 367     PERL_ARGS_ASSERT_REGCPPOP;
 368
 369     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 370     i = SSPOPUV;
 371     assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 372     i = SSPOPINT; /* Parentheses elements to pop. */
 373     input = (char *) SSPOPPTR;
 374     *PL_reglastcloseparen = SSPOPINT;
 375     *PL_reglastparen = SSPOPINT;
 376     PL_regsize = SSPOPINT;
 377     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 378
 379     i -= REGCP_OTHER_ELEMS;
 380     /* Now restore the parentheses context. */
 381     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 382         I32 tmps;
 383         U32 paren = (U32)SSPOPINT;
 384         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 385         PL_regoffs[paren].start = SSPOPINT;
 386         tmps = SSPOPINT;
 387         if (paren <= *PL_reglastparen)
 388             PL_regoffs[paren].end = tmps;
 389         DEBUG_BUFFERS_r(
 390             PerlIO_printf(Perl_debug_log,
 391                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 392                           (UV)paren, (IV)PL_regoffs[paren].start,
 393                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 394                           (IV)PL_regoffs[paren].end,
 395                           (paren > *PL_reglastparen ? "(no)" : ""));
 396         );
 397     }
 398     DEBUG_BUFFERS_r(
 399         if (*PL_reglastparen + 1 <= rex->nparens) {
 400             PerlIO_printf(Perl_debug_log,
 401                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 402                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 403         }
 404     );
 405 #if 1
 406     /* It would seem that the similar code in regtry()
 407      * already takes care of this, and in fact it is in
 408      * a better location to since this code can #if 0-ed out
 409      * but the code in regtry() is needed or otherwise tests
 410      * requiring null fields (pat.t#187 and split.t#{13,14}
 411      * (as of patchlevel 7877)  will fail.  Then again,
 412      * this code seems to be necessary or otherwise
 413      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 414      * --jhi updated by dapm */
 415     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 416         if (i > PL_regsize)
 417             PL_regoffs[i].start = -1;
 418         PL_regoffs[i].end = -1;
 419     }
 420 #endif
 421     return input;
 422 }
 423
 424 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 425
 426 /*
 427  * pregexec and friends
 428  */
 429
 430 #ifndef PERL_IN_XSUB_RE
 431 /*
 432  - pregexec - match a regexp against a string
 433  */
 434 I32
 435 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 436          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 437 /* strend: pointer to null at end of string */
 438 /* strbeg: real beginning of string */
 439 /* minend: end of match must be >=minend after stringarg. */
 440 /* nosave: For optimizations. */
 441 {
 442     PERL_ARGS_ASSERT_PREGEXEC;
 443
 444     return
 445         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 446                       nosave ? 0 : REXEC_COPY_STR);
 447 }
 448 #endif
 449
 450 /*
 451  * Need to implement the following flags for reg_anch:
 452  *
 453  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 454  * USE_INTUIT_ML
 455  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 456  * INTUIT_AUTORITATIVE_ML
 457  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 458  * INTUIT_ONCE_ML
 459  *
 460  * Another flag for this function: SECOND_TIME (so that float substrs
 461  * with giant delta may be not rechecked).
 462  */
 463
 464 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 465
 466 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 467    Otherwise, only SvCUR(sv) is used to get strbeg. */
 468
 469 /* XXXX We assume that strpos is strbeg unless sv. */
 470
 471 /* XXXX Some places assume that there is a fixed substring.
 472         An update may be needed if optimizer marks as "INTUITable"
 473         RExen without fixed substrings.  Similarly, it is assumed that
 474         lengths of all the strings are no more than minlen, thus they
 475         cannot come from lookahead.
 476         (Or minlen should take into account lookahead.)
 477   NOTE: Some of this comment is not correct. minlen does now take account
 478   of lookahead/behind. Further research is required. -- demerphq
 479
 480 */
 481
 482 /* A failure to find a constant substring means that there is no need to make
 483    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 484    finding a substring too deep into the string means that less calls to
 485    regtry() should be needed.
 486
 487    REx compiler's optimizer found 4 possible hints:
 488         a) Anchored substring;
 489         b) Fixed substring;
 490         c) Whether we are anchored (beginning-of-line or \G);
 491         d) First node (of those at offset 0) which may distingush positions;
 492    We use a)b)d) and multiline-part of c), and try to find a position in the
 493    string which does not contradict any of them.
 494  */
 495
 496 /* Most of decisions we do here should have been done at compile time.
 497    The nodes of the REx which we used for the search should have been
 498    deleted from the finite automaton. */
 499
 500 char *
 501 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 502                      char *strend, const U32 flags, re_scream_pos_data *data)
 503 {
 504     dVAR;
 505     struct regexp *const prog = (struct regexp *)SvANY(rx);
 506     register I32 start_shift = 0;
 507     /* Should be nonnegative! */
 508     register I32 end_shift   = 0;
 509     register char *s;
 510     register SV *check;
 511     char *strbeg;
 512     char *t;
 513     const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 514     I32 ml_anch;
 515     register char *other_last = NULL;   /* other substr checked before this */
 516     char *check_at = NULL;              /* check substr found at this pos */
 517     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 518     RXi_GET_DECL(prog,progi);
 519 #ifdef DEBUGGING
 520     const char * const i_strpos = strpos;
 521 #endif
 522     GET_RE_DEBUG_FLAGS_DECL;
 523
 524     PERL_ARGS_ASSERT_RE_INTUIT_START;
 525
 526     RX_MATCH_UTF8_set(rx,do_utf8);
 527
 528     if (RX_UTF8(rx)) {
 529         PL_reg_flags |= RF_utf8;
 530     }
 531     DEBUG_EXECUTE_r(
 532         debug_start_match(rx, do_utf8, strpos, strend,
 533             sv ? "Guessing start of match in sv for"
 534                : "Guessing start of match in string for");
 535               );
 536
 537     /* CHR_DIST() would be more correct here but it makes things slow. */
 538     if (prog->minlen > strend - strpos) {
 539         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 540                               "String too short... [re_intuit_start]\n"));
 541         goto fail;
 542     }
 543
 544     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 545     PL_regeol = strend;
 546     if (do_utf8) {
 547         if (!prog->check_utf8 && prog->check_substr)
 548             to_utf8_substr(prog);
 549         check = prog->check_utf8;
 550     } else {
 551         if (!prog->check_substr && prog->check_utf8)
 552             to_byte_substr(prog);
 553         check = prog->check_substr;
 554     }
 555     if (check == &PL_sv_undef) {
 556         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 557                 "Non-utf8 string cannot match utf8 check string\n"));
 558         goto fail;
 559     }
 560     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 561         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 562                      || ( (prog->extflags & RXf_ANCH_BOL)
 563                           && !multiline ) );    /* Check after \n? */
 564
 565         if (!ml_anch) {
 566           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 567                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 568                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 569                && sv && !SvROK(sv)
 570                && (strpos != strbeg)) {
 571               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 572               goto fail;
 573           }
 574           if (prog->check_offset_min == prog->check_offset_max &&
 575               !(prog->extflags & RXf_CANY_SEEN)) {
 576             /* Substring at constant offset from beg-of-str... */
 577             I32 slen;
 578
 579             s = HOP3c(strpos, prog->check_offset_min, strend);
 580
 581             if (SvTAIL(check)) {
 582                 slen = SvCUR(check);    /* >= 1 */
 583
 584                 if ( strend - s > slen || strend - s < slen - 1
 585                      || (strend - s == slen && strend[-1] != '\n')) {
 586                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 587                     goto fail_finish;
 588                 }
 589                 /* Now should match s[0..slen-2] */
 590                 slen--;
 591                 if (slen && (*SvPVX_const(check) != *s
 592                              || (slen > 1
 593                                  && memNE(SvPVX_const(check), s, slen)))) {
 594                   report_neq:
 595                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 596                     goto fail_finish;
 597                 }
 598             }
 599             else if (*SvPVX_const(check) != *s
 600                      || ((slen = SvCUR(check)) > 1
 601                          && memNE(SvPVX_const(check), s, slen)))
 602                 goto report_neq;
 603             check_at = s;
 604             goto success_at_start;
 605           }
 606         }
 607         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 608         s = strpos;
 609         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 610         end_shift = prog->check_end_shift;
 611
 612         if (!ml_anch) {
 613             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 614                                          - (SvTAIL(check) != 0);
 615             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 616
 617             if (end_shift < eshift)
 618                 end_shift = eshift;
 619         }
 620     }
 621     else {                              /* Can match at random position */
 622         ml_anch = 0;
 623         s = strpos;
 624         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 625         end_shift = prog->check_end_shift;
 626
 627         /* end shift should be non negative here */
 628     }
 629
 630 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 631     if (end_shift < 0)
 632         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 633                    (IV)end_shift, RX_PRECOMP(prog));
 634 #endif
 635
 636   restart:
 637     /* Find a possible match in the region s..strend by looking for
 638        the "check" substring in the region corrected by start/end_shift. */
 639
 640     {
 641         I32 srch_start_shift = start_shift;
 642         I32 srch_end_shift = end_shift;
 643         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 644             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 645             srch_start_shift = strbeg - s;
 646         }
 647     DEBUG_OPTIMISE_MORE_r({
 648         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 649             (IV)prog->check_offset_min,
 650             (IV)srch_start_shift,
 651             (IV)srch_end_shift,
 652             (IV)prog->check_end_shift);
 653     });
 654
 655     if (flags & REXEC_SCREAM) {
 656         I32 p = -1;                     /* Internal iterator of scream. */
 657         I32 * const pp = data ? data->scream_pos : &p;
 658
 659         if (PL_screamfirst[BmRARE(check)] >= 0
 660             || ( BmRARE(check) == '\n'
 661                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 662                  && SvTAIL(check) ))
 663             s = screaminstr(sv, check,
 664                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 665         else
 666             goto fail_finish;
 667         /* we may be pointing at the wrong string */
 668         if (s && RXp_MATCH_COPIED(prog))
 669             s = strbeg + (s - SvPVX_const(sv));
 670         if (data)
 671             *data->scream_olds = s;
 672     }
 673     else {
 674         U8* start_point;
 675         U8* end_point;
 676         if (prog->extflags & RXf_CANY_SEEN) {
 677             start_point= (U8*)(s + srch_start_shift);
 678             end_point= (U8*)(strend - srch_end_shift);
 679         } else {
 680             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 681             end_point= HOP3(strend, -srch_end_shift, strbeg);
 682         }
 683         DEBUG_OPTIMISE_MORE_r({
 684             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 685                 (int)(end_point - start_point),
 686                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 687                 start_point);
 688         });
 689
 690         s = fbm_instr( start_point, end_point,
 691                       check, multiline ? FBMrf_MULTILINE : 0);
 692     }
 693     }
 694     /* Update the count-of-usability, remove useless subpatterns,
 695         unshift s.  */
 696
 697     DEBUG_EXECUTE_r({
 698         RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 699             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 700         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 701                           (s ? "Found" : "Did not find"),
 702             (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
 703                 ? "anchored" : "floating"),
 704             quoted,
 705             RE_SV_TAIL(check),
 706             (s ? " at offset " : "...\n") );
 707     });
 708
 709     if (!s)
 710         goto fail_finish;
 711     /* Finish the diagnostic message */
 712     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 713
 714     /* XXX dmq: first branch is for positive lookbehind...
 715        Our check string is offset from the beginning of the pattern.
 716        So we need to do any stclass tests offset forward from that
 717        point. I think. :-(
 718      */
 719
 720
 721
 722     check_at=s;
 723
 724
 725     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 726        Start with the other substr.
 727        XXXX no SCREAM optimization yet - and a very coarse implementation
 728        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 729                 *always* match.  Probably should be marked during compile...
 730        Probably it is right to do no SCREAM here...
 731      */
 732
 733     if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
 734                 : (prog->float_substr && prog->anchored_substr))
 735     {
 736         /* Take into account the "other" substring. */
 737         /* XXXX May be hopelessly wrong for UTF... */
 738         if (!other_last)
 739             other_last = strpos;
 740         if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
 741           do_other_anchored:
 742             {
 743                 char * const last = HOP3c(s, -start_shift, strbeg);
 744                 char *last1, *last2;
 745                 char * const saved_s = s;
 746                 SV* must;
 747
 748                 t = s - prog->check_offset_max;
 749                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 750                     && (!do_utf8
 751                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 752                             && t > strpos)))
 753                     NOOP;
 754                 else
 755                     t = strpos;
 756                 t = HOP3c(t, prog->anchored_offset, strend);
 757                 if (t < other_last)     /* These positions already checked */
 758                     t = other_last;
 759                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 760                 if (last < last1)
 761                     last1 = last;
 762                 /* XXXX It is not documented what units *_offsets are in.
 763                    We assume bytes, but this is clearly wrong.
 764                    Meaning this code needs to be carefully reviewed for errors.
 765                    dmq.
 766                   */
 767
 768                 /* On end-of-str: see comment below. */
 769                 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
 770                 if (must == &PL_sv_undef) {
 771                     s = (char*)NULL;
 772                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 773                 }
 774                 else
 775                     s = fbm_instr(
 776                         (unsigned char*)t,
 777                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 778                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 779                         must,
 780                         multiline ? FBMrf_MULTILINE : 0
 781                     );
 782                 DEBUG_EXECUTE_r({
 783                     RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 784                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 785                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 786                         (s ? "Found" : "Contradicts"),
 787                         quoted, RE_SV_TAIL(must));
 788                 });
 789
 790
 791                 if (!s) {
 792                     if (last1 >= last2) {
 793                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 794                                                 ", giving up...\n"));
 795                         goto fail_finish;
 796                     }
 797                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 798                         ", trying floating at offset %ld...\n",
 799                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 800                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 801                     s = HOP3c(last, 1, strend);
 802                     goto restart;
 803                 }
 804                 else {
 805                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 806                           (long)(s - i_strpos)));
 807                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 808                     other_last = HOP3c(s, 1, strend);
 809                     s = saved_s;
 810                     if (t == strpos)
 811                         goto try_at_start;
 812                     goto try_at_offset;
 813                 }
 814             }
 815         }
 816         else {          /* Take into account the floating substring. */
 817             char *last, *last1;
 818             char * const saved_s = s;
 819             SV* must;
 820
 821             t = HOP3c(s, -start_shift, strbeg);
 822             last1 = last =
 823                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 824             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 825                 last = HOP3c(t, prog->float_max_offset, strend);
 826             s = HOP3c(t, prog->float_min_offset, strend);
 827             if (s < other_last)
 828                 s = other_last;
 829  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 830             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
 831             /* fbm_instr() takes into account exact value of end-of-str
 832                if the check is SvTAIL(ed).  Since false positives are OK,
 833                and end-of-str is not later than strend we are OK. */
 834             if (must == &PL_sv_undef) {
 835                 s = (char*)NULL;
 836                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 837             }
 838             else
 839                 s = fbm_instr((unsigned char*)s,
 840                               (unsigned char*)last + SvCUR(must)
 841                                   - (SvTAIL(must)!=0),
 842                               must, multiline ? FBMrf_MULTILINE : 0);
 843             DEBUG_EXECUTE_r({
 844                 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 845                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 846                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 847                     (s ? "Found" : "Contradicts"),
 848                     quoted, RE_SV_TAIL(must));
 849             });
 850             if (!s) {
 851                 if (last1 == last) {
 852                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 853                                             ", giving up...\n"));
 854                     goto fail_finish;
 855                 }
 856                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 857                     ", trying anchored starting at offset %ld...\n",
 858                     (long)(saved_s + 1 - i_strpos)));
 859                 other_last = last;
 860                 s = HOP3c(t, 1, strend);
 861                 goto restart;
 862             }
 863             else {
 864                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 865                       (long)(s - i_strpos)));
 866                 other_last = s; /* Fix this later. --Hugo */
 867                 s = saved_s;
 868                 if (t == strpos)
 869                     goto try_at_start;
 870                 goto try_at_offset;
 871             }
 872         }
 873     }
 874
 875
 876     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 877
 878     DEBUG_OPTIMISE_MORE_r(
 879         PerlIO_printf(Perl_debug_log,
 880             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 881             (IV)prog->check_offset_min,
 882             (IV)prog->check_offset_max,
 883             (IV)(s-strpos),
 884             (IV)(t-strpos),
 885             (IV)(t-s),
 886             (IV)(strend-strpos)
 887         )
 888     );
 889
 890     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 891         && (!do_utf8
 892             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 893                  && t > strpos)))
 894     {
 895         /* Fixed substring is found far enough so that the match
 896            cannot start at strpos. */
 897       try_at_offset:
 898         if (ml_anch && t[-1] != '\n') {
 899             /* Eventually fbm_*() should handle this, but often
 900                anchored_offset is not 0, so this check will not be wasted. */
 901             /* XXXX In the code below we prefer to look for "^" even in
 902                presence of anchored substrings.  And we search even
 903                beyond the found float position.  These pessimizations
 904                are historical artefacts only.  */
 905           find_anchor:
 906             while (t < strend - prog->minlen) {
 907                 if (*t == '\n') {
 908                     if (t < check_at - prog->check_offset_min) {
 909                         if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
 910                             /* Since we moved from the found position,
 911                                we definitely contradict the found anchored
 912                                substr.  Due to the above check we do not
 913                                contradict "check" substr.
 914                                Thus we can arrive here only if check substr
 915                                is float.  Redo checking for "other"=="fixed".
 916                              */
 917                             strpos = t + 1;
 918                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 919                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 920                             goto do_other_anchored;
 921                         }
 922                         /* We don't contradict the found floating substring. */
 923                         /* XXXX Why not check for STCLASS? */
 924                         s = t + 1;
 925                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 926                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 927                         goto set_useful;
 928                     }
 929                     /* Position contradicts check-string */
 930                     /* XXXX probably better to look for check-string
 931                        than for "\n", so one should lower the limit for t? */
 932                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 933                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 934                     other_last = strpos = s = t + 1;
 935                     goto restart;
 936                 }
 937                 t++;
 938             }
 939             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 940                         PL_colors[0], PL_colors[1]));
 941             goto fail_finish;
 942         }
 943         else {
 944             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 945                         PL_colors[0], PL_colors[1]));
 946         }
 947         s = t;
 948       set_useful:
 949         ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr);    /* hooray/5 */
 950     }
 951     else {
 952         /* The found string does not prohibit matching at strpos,
 953            - no optimization of calling REx engine can be performed,
 954            unless it was an MBOL and we are not after MBOL,
 955            or a future STCLASS check will fail this. */
 956       try_at_start:
 957         /* Even in this situation we may use MBOL flag if strpos is offset
 958            wrt the start of the string. */
 959         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 960             && (strpos != strbeg) && strpos[-1] != '\n'
 961             /* May be due to an implicit anchor of m{.*foo}  */
 962             && !(prog->intflags & PREGf_IMPLICIT))
 963         {
 964             t = strpos;
 965             goto find_anchor;
 966         }
 967         DEBUG_EXECUTE_r( if (ml_anch)
 968             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
 969                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
 970         );
 971       success_at_start:
 972         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
 973             && (do_utf8 ? (
 974                 prog->check_utf8                /* Could be deleted already */
 975                 && --BmUSEFUL(prog->check_utf8) < 0
 976                 && (prog->check_utf8 == prog->float_utf8)
 977             ) : (
 978                 prog->check_substr              /* Could be deleted already */
 979                 && --BmUSEFUL(prog->check_substr) < 0
 980                 && (prog->check_substr == prog->float_substr)
 981             )))
 982         {
 983             /* If flags & SOMETHING - do not do it many times on the same match */
 984             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 985             /* XXX Does the destruction order has to change with do_utf8? */
 986             SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
 987             SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
 988             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
 989             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
 990             check = NULL;                       /* abort */
 991             s = strpos;
 992             /* XXXX This is a remnant of the old implementation.  It
 993                     looks wasteful, since now INTUIT can use many
 994                     other heuristics. */
 995             prog->extflags &= ~RXf_USE_INTUIT;
 996         }
 997         else
 998             s = strpos;
 999     }
1000
1001     /* Last resort... */
1002     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1003     /* trie stclasses are too expensive to use here, we are better off to
1004        leave it to regmatch itself */
1005     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1006         /* minlen == 0 is possible if regstclass is \b or \B,
1007            and the fixed substr is ''$.
1008            Since minlen is already taken into account, s+1 is before strend;
1009            accidentally, minlen >= 1 guaranties no false positives at s + 1
1010            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1011            regstclass does not come from lookahead...  */
1012         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1013            This leaves EXACTF only, which is dealt with in find_byclass().  */
1014         const U8* const str = (U8*)STRING(progi->regstclass);
1015         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1016                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1017                     : 1);
1018         char * endpos;
1019         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1020             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1021         else if (prog->float_substr || prog->float_utf8)
1022             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1023         else
1024             endpos= strend;
1025
1026         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1027                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1028
1029         t = s;
1030         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1031         if (!s) {
1032 #ifdef DEBUGGING
1033             const char *what = NULL;
1034 #endif
1035             if (endpos == strend) {
1036                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1037                                 "Could not match STCLASS...\n") );
1038                 goto fail;
1039             }
1040             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1041                                    "This position contradicts STCLASS...\n") );
1042             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1043                 goto fail;
1044             /* Contradict one of substrings */
1045             if (prog->anchored_substr || prog->anchored_utf8) {
1046                 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1047                     DEBUG_EXECUTE_r( what = "anchored" );
1048                   hop_and_restart:
1049                     s = HOP3c(t, 1, strend);
1050                     if (s + start_shift + end_shift > strend) {
1051                         /* XXXX Should be taken into account earlier? */
1052                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1053                                                "Could not match STCLASS...\n") );
1054                         goto fail;
1055                     }
1056                     if (!check)
1057                         goto giveup;
1058                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1059                                 "Looking for %s substr starting at offset %ld...\n",
1060                                  what, (long)(s + start_shift - i_strpos)) );
1061                     goto restart;
1062                 }
1063                 /* Have both, check_string is floating */
1064                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1065                     goto retry_floating_check;
1066                 /* Recheck anchored substring, but not floating... */
1067                 s = check_at;
1068                 if (!check)
1069                     goto giveup;
1070                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1071                           "Looking for anchored substr starting at offset %ld...\n",
1072                           (long)(other_last - i_strpos)) );
1073                 goto do_other_anchored;
1074             }
1075             /* Another way we could have checked stclass at the
1076                current position only: */
1077             if (ml_anch) {
1078                 s = t = t + 1;
1079                 if (!check)
1080                     goto giveup;
1081                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082                           "Looking for /%s^%s/m starting at offset %ld...\n",
1083                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1084                 goto try_at_offset;
1085             }
1086             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))     /* Could have been deleted */
1087                 goto fail;
1088             /* Check is floating subtring. */
1089           retry_floating_check:
1090             t = check_at - start_shift;
1091             DEBUG_EXECUTE_r( what = "floating" );
1092             goto hop_and_restart;
1093         }
1094         if (t != s) {
1095             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1096                         "By STCLASS: moving %ld --> %ld\n",
1097                                   (long)(t - i_strpos), (long)(s - i_strpos))
1098                    );
1099         }
1100         else {
1101             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1102                                   "Does not contradict STCLASS...\n");
1103                    );
1104         }
1105     }
1106   giveup:
1107     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1108                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1109                           PL_colors[5], (long)(s - i_strpos)) );
1110     return s;
1111
1112   fail_finish:                          /* Substring not found */
1113     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1114         BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1115   fail:
1116     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1117                           PL_colors[4], PL_colors[5]));
1118     return NULL;
1119 }
1120
1121 #define DECL_TRIE_TYPE(scan) \
1122     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1123                     trie_type = (scan->flags != EXACT) \
1124                               ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1125                               : (do_utf8 ? trie_utf8 : trie_plain)
1126
1127 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1128 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1129     switch (trie_type) {                                                    \
1130     case trie_utf8_fold:                                                    \
1131         if ( foldlen>0 ) {                                                  \
1132             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1133             foldlen -= len;                                                 \
1134             uscan += len;                                                   \
1135             len=0;                                                          \
1136         } else {                                                            \
1137             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1138             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1139             foldlen -= UNISKIP( uvc );                                      \
1140             uscan = foldbuf + UNISKIP( uvc );                               \
1141         }                                                                   \
1142         break;                                                              \
1143     case trie_latin_utf8_fold:                                              \
1144         if ( foldlen>0 ) {                                                  \
1145             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1146             foldlen -= len;                                                 \
1147             uscan += len;                                                   \
1148             len=0;                                                          \
1149         } else {                                                            \
1150             len = 1;                                                        \
1151             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1152             foldlen -= UNISKIP( uvc );                                      \
1153             uscan = foldbuf + UNISKIP( uvc );                               \
1154         }                                                                   \
1155         break;                                                              \
1156     case trie_utf8:                                                         \
1157         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1158         break;                                                              \
1159     case trie_plain:                                                        \
1160         uvc = (UV)*uc;                                                      \
1161         len = 1;                                                            \
1162     }                                                                       \
1163     if (uvc < 256) {                                                        \
1164         charid = trie->charmap[ uvc ];                                      \
1165     }                                                                       \
1166     else {                                                                  \
1167         charid = 0;                                                         \
1168         if (widecharmap) {                                                  \
1169             SV** const svpp = hv_fetch(widecharmap,                         \
1170                         (char*)&uvc, sizeof(UV), 0);                        \
1171             if (svpp)                                                       \
1172                 charid = (U16)SvIV(*svpp);                                  \
1173         }                                                                   \
1174     }                                                                       \
1175 } STMT_END
1176
1177 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1178 {                                                      \
1179     char *my_strend= (char *)strend;                   \
1180     if ( (CoNd)                                        \
1181          && (ln == len ||                              \
1182              !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
1183                         m, NULL, ln, cBOOL(UTF)))      \
1184          && (!reginfo || regtry(reginfo, &s)) )        \
1185         goto got_it;                                   \
1186     else {                                             \
1187          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1188          uvchr_to_utf8(tmpbuf, c);                     \
1189          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1190          if ( f != c                                   \
1191               && (f == c1 || f == c2)                  \
1192               && (ln == len ||                         \
1193                 !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
1194                               m, NULL, ln, cBOOL(UTF)))\
1195               && (!reginfo || regtry(reginfo, &s)) )   \
1196               goto got_it;                             \
1197     }                                                  \
1198 }                                                      \
1199 s += len
1200
1201 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1202 STMT_START {                                              \
1203     while (s <= e) {                                      \
1204         if ( (CoNd)                                       \
1205              && (ln == 1 || !(OP(c) == EXACTF             \
1206                               ? ibcmp(s, m, ln)           \
1207                               : ibcmp_locale(s, m, ln)))  \
1208              && (!reginfo || regtry(reginfo, &s)) )        \
1209             goto got_it;                                  \
1210         s++;                                              \
1211     }                                                     \
1212 } STMT_END
1213
1214 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1215 STMT_START {                                          \
1216     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1217         CoDe                                          \
1218         s += uskip;                                   \
1219     }                                                 \
1220 } STMT_END
1221
1222 #define REXEC_FBC_SCAN(CoDe)                          \
1223 STMT_START {                                          \
1224     while (s < strend) {                              \
1225         CoDe                                          \
1226         s++;                                          \
1227     }                                                 \
1228 } STMT_END
1229
1230 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1231 REXEC_FBC_UTF8_SCAN(                                  \
1232     if (CoNd) {                                       \
1233         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1234             goto got_it;                              \
1235         else                                          \
1236             tmp = doevery;                            \
1237     }                                                 \
1238     else                                              \
1239         tmp = 1;                                      \
1240 )
1241
1242 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1243 REXEC_FBC_SCAN(                                       \
1244     if (CoNd) {                                       \
1245         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1246             goto got_it;                              \
1247         else                                          \
1248             tmp = doevery;                            \
1249     }                                                 \
1250     else                                              \
1251         tmp = 1;                                      \
1252 )
1253
1254 #define REXEC_FBC_TRYIT               \
1255 if ((!reginfo || regtry(reginfo, &s))) \
1256     goto got_it
1257
1258 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1259     if (do_utf8) {                                             \
1260         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1261     }                                                          \
1262     else {                                                     \
1263         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1264     }                                                          \
1265     break
1266
1267 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1268     if (do_utf8) {                                             \
1269         UtFpReLoAd;                                            \
1270         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1271     }                                                          \
1272     else {                                                     \
1273         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1274     }                                                          \
1275     break
1276
1277 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1278     PL_reg_flags |= RF_tainted;                                \
1279     if (do_utf8) {                                             \
1280         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1281     }                                                          \
1282     else {                                                     \
1283         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1284     }                                                          \
1285     break
1286
1287 #define DUMP_EXEC_POS(li,s,doutf8) \
1288     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1289
1290 /* We know what class REx starts with.  Try to find this position... */
1291 /* if reginfo is NULL, its a dryrun */
1292 /* annoyingly all the vars in this routine have different names from their counterparts
1293    in regmatch. /grrr */
1294
1295 STATIC char *
1296 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1297     const char *strend, regmatch_info *reginfo)
1298 {
1299         dVAR;
1300         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1301         char *m;
1302         STRLEN ln;
1303         STRLEN lnc;
1304         register STRLEN uskip;
1305         unsigned int c1;
1306         unsigned int c2;
1307         char *e;
1308         register I32 tmp = 1;   /* Scratch variable? */
1309         register const bool do_utf8 = PL_reg_match_utf8;
1310         RXi_GET_DECL(prog,progi);
1311
1312         PERL_ARGS_ASSERT_FIND_BYCLASS;
1313
1314         /* We know what class it must start with. */
1315         switch (OP(c)) {
1316         case ANYOF:
1317             if (do_utf8) {
1318                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1319                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1320                           reginclass(prog, c, (U8*)s, 0, do_utf8) :
1321                           REGINCLASS(prog, c, (U8*)s));
1322             }
1323             else {
1324                  while (s < strend) {
1325                       STRLEN skip = 1;
1326
1327                       if (REGINCLASS(prog, c, (U8*)s) ||
1328                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1329                            /* The assignment of 2 is intentional:
1330                             * for the folded sharp s, the skip is 2. */
1331                            (skip = SHARP_S_SKIP))) {
1332                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1333                                 goto got_it;
1334                            else
1335                                 tmp = doevery;
1336                       }
1337                       else
1338                            tmp = 1;
1339                       s += skip;
1340                  }
1341             }
1342             break;
1343         case CANY:
1344             REXEC_FBC_SCAN(
1345                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1346                     goto got_it;
1347                 else
1348                     tmp = doevery;
1349             );
1350             break;
1351         case EXACTF:
1352             m   = STRING(c);
1353             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1354             lnc = (I32) ln;     /* length to match in characters */
1355             if (UTF) {
1356                 STRLEN ulen1, ulen2;
1357                 U8 *sm = (U8 *) m;
1358                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1359                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1360                 /* used by commented-out code below */
1361                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1362
1363                 /* XXX: Since the node will be case folded at compile
1364                    time this logic is a little odd, although im not
1365                    sure that its actually wrong. --dmq */
1366
1367                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1368                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1369
1370                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1371                    codepoint of the first character in the converted
1372                    form, yet originally we did the extra step.
1373                    No tests fail by commenting this code out however
1374                    so Ive left it out. -- dmq.
1375
1376                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1377                                     0, uniflags);
1378                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1379                                     0, uniflags);
1380                 */
1381
1382                 lnc = 0;
1383                 while (sm < ((U8 *) m + ln)) {
1384                     lnc++;
1385                     sm += UTF8SKIP(sm);
1386                 }
1387             }
1388             else {
1389                 c1 = *(U8*)m;
1390                 c2 = PL_fold[c1];
1391             }
1392             goto do_exactf;
1393         case EXACTFL:
1394             m   = STRING(c);
1395             ln  = STR_LEN(c);
1396             lnc = (I32) ln;
1397             c1 = *(U8*)m;
1398             c2 = PL_fold_locale[c1];
1399           do_exactf:
1400             e = HOP3c(strend, -((I32)lnc), s);
1401
1402             if (!reginfo && e < s)
1403                 e = s;                  /* Due to minlen logic of intuit() */
1404
1405             /* The idea in the EXACTF* cases is to first find the
1406              * first character of the EXACTF* node and then, if
1407              * necessary, case-insensitively compare the full
1408              * text of the node.  The c1 and c2 are the first
1409              * characters (though in Unicode it gets a bit
1410              * more complicated because there are more cases
1411              * than just upper and lower: one needs to use
1412              * the so-called folding case for case-insensitive
1413              * matching (called "loose matching" in Unicode).
1414              * ibcmp_utf8() will do just that. */
1415
1416             if (do_utf8 || UTF) {
1417                 UV c, f;
1418                 U8 tmpbuf [UTF8_MAXBYTES+1];
1419                 STRLEN len = 1;
1420                 STRLEN foldlen;
1421                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1422                 if (c1 == c2) {
1423                     /* Upper and lower of 1st char are equal -
1424                      * probably not a "letter". */
1425                     while (s <= e) {
1426                         if (do_utf8) {
1427                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1428                                            uniflags);
1429                         } else {
1430                             c = *((U8*)s);
1431                         }
1432                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1433                     }
1434                 }
1435                 else {
1436                     while (s <= e) {
1437                         if (do_utf8) {
1438                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1439                                            uniflags);
1440                         } else {
1441                             c = *((U8*)s);
1442                         }
1443
1444                         /* Handle some of the three Greek sigmas cases.
1445                          * Note that not all the possible combinations
1446                          * are handled here: some of them are handled
1447                          * by the standard folding rules, and some of
1448                          * them (the character class or ANYOF cases)
1449                          * are handled during compiletime in
1450                          * regexec.c:S_regclass(). */
1451                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1452                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1453                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1454
1455                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1456                     }
1457                 }
1458             }
1459             else {
1460                 /* Neither pattern nor string are UTF8 */
1461                 if (c1 == c2)
1462                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1463                 else
1464                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1465             }
1466             break;
1467         case BOUNDL:
1468             PL_reg_flags |= RF_tainted;
1469             /* FALL THROUGH */
1470         case BOUND:
1471             if (do_utf8) {
1472                 if (s == PL_bostr)
1473                     tmp = '\n';
1474                 else {
1475                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1476                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1477                 }
1478                 tmp = ((OP(c) == BOUND ?
1479                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1480                 LOAD_UTF8_CHARCLASS_ALNUM();
1481                 REXEC_FBC_UTF8_SCAN(
1482                     if (tmp == !(OP(c) == BOUND ?
1483                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) :
1484                                  isALNUM_LC_utf8((U8*)s)))
1485                     {
1486                         tmp = !tmp;
1487                         REXEC_FBC_TRYIT;
1488                 }
1489                 );
1490             }
1491             else {
1492                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1493                 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1494                 REXEC_FBC_SCAN(
1495                     if (tmp ==
1496                         !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1497                         tmp = !tmp;
1498                         REXEC_FBC_TRYIT;
1499                 }
1500                 );
1501             }
1502             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1503                 goto got_it;
1504             break;
1505         case NBOUNDL:
1506             PL_reg_flags |= RF_tainted;
1507             /* FALL THROUGH */
1508         case NBOUND:
1509             if (do_utf8) {
1510                 if (s == PL_bostr)
1511                     tmp = '\n';
1512                 else {
1513                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1514                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1515                 }
1516                 tmp = ((OP(c) == NBOUND ?
1517                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1518                 LOAD_UTF8_CHARCLASS_ALNUM();
1519                 REXEC_FBC_UTF8_SCAN(
1520                     if (tmp == !(OP(c) == NBOUND ?
1521                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) :
1522                                  isALNUM_LC_utf8((U8*)s)))
1523                         tmp = !tmp;
1524                     else REXEC_FBC_TRYIT;
1525                 );
1526             }
1527             else {
1528                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1529                 tmp = ((OP(c) == NBOUND ?
1530                         isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1531                 REXEC_FBC_SCAN(
1532                     if (tmp ==
1533                         !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1534                         tmp = !tmp;
1535                     else REXEC_FBC_TRYIT;
1536                 );
1537             }
1538             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1539                 goto got_it;
1540             break;
1541         case ALNUM:
1542             REXEC_FBC_CSCAN_PRELOAD(
1543                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1544                 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1545                 isALNUM(*s)
1546             );
1547         case ALNUML:
1548             REXEC_FBC_CSCAN_TAINT(
1549                 isALNUM_LC_utf8((U8*)s),
1550                 isALNUM_LC(*s)
1551             );
1552         case NALNUM:
1553             REXEC_FBC_CSCAN_PRELOAD(
1554                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1555                 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1556                 !isALNUM(*s)
1557             );
1558         case NALNUML:
1559             REXEC_FBC_CSCAN_TAINT(
1560                 !isALNUM_LC_utf8((U8*)s),
1561                 !isALNUM_LC(*s)
1562             );
1563         case SPACE:
1564             REXEC_FBC_CSCAN_PRELOAD(
1565                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1566                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1567                 isSPACE(*s)
1568             );
1569         case SPACEL:
1570             REXEC_FBC_CSCAN_TAINT(
1571                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1572                 isSPACE_LC(*s)
1573             );
1574         case NSPACE:
1575             REXEC_FBC_CSCAN_PRELOAD(
1576                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1577                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1578                 !isSPACE(*s)
1579             );
1580         case NSPACEL:
1581             REXEC_FBC_CSCAN_TAINT(
1582                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1583                 !isSPACE_LC(*s)
1584             );
1585         case DIGIT:
1586             REXEC_FBC_CSCAN_PRELOAD(
1587                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1588                 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1589                 isDIGIT(*s)
1590             );
1591         case DIGITL:
1592             REXEC_FBC_CSCAN_TAINT(
1593                 isDIGIT_LC_utf8((U8*)s),
1594                 isDIGIT_LC(*s)
1595             );
1596         case NDIGIT:
1597             REXEC_FBC_CSCAN_PRELOAD(
1598                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1599                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1600                 !isDIGIT(*s)
1601             );
1602         case NDIGITL:
1603             REXEC_FBC_CSCAN_TAINT(
1604                 !isDIGIT_LC_utf8((U8*)s),
1605                 !isDIGIT_LC(*s)
1606             );
1607         case LNBREAK:
1608             REXEC_FBC_CSCAN(
1609                 is_LNBREAK_utf8(s),
1610                 is_LNBREAK_latin1(s)
1611             );
1612         case VERTWS:
1613             REXEC_FBC_CSCAN(
1614                 is_VERTWS_utf8(s),
1615                 is_VERTWS_latin1(s)
1616             );
1617         case NVERTWS:
1618             REXEC_FBC_CSCAN(
1619                 !is_VERTWS_utf8(s),
1620                 !is_VERTWS_latin1(s)
1621             );
1622         case HORIZWS:
1623             REXEC_FBC_CSCAN(
1624                 is_HORIZWS_utf8(s),
1625                 is_HORIZWS_latin1(s)
1626             );
1627         case NHORIZWS:
1628             REXEC_FBC_CSCAN(
1629                 !is_HORIZWS_utf8(s),
1630                 !is_HORIZWS_latin1(s)
1631             );
1632         case AHOCORASICKC:
1633         case AHOCORASICK:
1634             {
1635                 DECL_TRIE_TYPE(c);
1636                 /* what trie are we using right now */
1637                 reg_ac_data *aho
1638                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1639                 reg_trie_data *trie
1640                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1641                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1642
1643                 const char *last_start = strend - trie->minlen;
1644 #ifdef DEBUGGING
1645                 const char *real_start = s;
1646 #endif
1647                 STRLEN maxlen = trie->maxlen;
1648                 SV *sv_points;
1649                 U8 **points; /* map of where we were in the input string
1650                                 when reading a given char. For ASCII this
1651                                 is unnecessary overhead as the relationship
1652                                 is always 1:1, but for Unicode, especially
1653                                 case folded Unicode this is not true. */
1654                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1655                 U8 *bitmap=NULL;
1656
1657
1658                 GET_RE_DEBUG_FLAGS_DECL;
1659
1660                 /* We can't just allocate points here. We need to wrap it in
1661                  * an SV so it gets freed properly if there is a croak while
1662                  * running the match */
1663                 ENTER;
1664                 SAVETMPS;
1665                 sv_points=newSV(maxlen * sizeof(U8 *));
1666                 SvCUR_set(sv_points,
1667                     maxlen * sizeof(U8 *));
1668                 SvPOK_on(sv_points);
1669                 sv_2mortal(sv_points);
1670                 points=(U8**)SvPV_nolen(sv_points );
1671                 if ( trie_type != trie_utf8_fold
1672                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1673                 {
1674                     if (trie->bitmap)
1675                         bitmap=(U8*)trie->bitmap;
1676                     else
1677                         bitmap=(U8*)ANYOF_BITMAP(c);
1678                 }
1679                 /* this is the Aho-Corasick algorithm modified a touch
1680                    to include special handling for long "unknown char"
1681                    sequences. The basic idea being that we use AC as long
1682                    as we are dealing with a possible matching char, when
1683                    we encounter an unknown char (and we have not encountered
1684                    an accepting state) we scan forward until we find a legal
1685                    starting char.
1686                    AC matching is basically that of trie matching, except
1687                    that when we encounter a failing transition, we fall back
1688                    to the current states "fail state", and try the current char
1689                    again, a process we repeat until we reach the root state,
1690                    state 1, or a legal transition. If we fail on the root state
1691                    then we can either terminate if we have reached an accepting
1692                    state previously, or restart the entire process from the beginning
1693                    if we have not.
1694
1695                  */
1696                 while (s <= last_start) {
1697                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1698                     U8 *uc = (U8*)s;
1699                     U16 charid = 0;
1700                     U32 base = 1;
1701                     U32 state = 1;
1702                     UV uvc = 0;
1703                     STRLEN len = 0;
1704                     STRLEN foldlen = 0;
1705                     U8 *uscan = (U8*)NULL;
1706                     U8 *leftmost = NULL;
1707 #ifdef DEBUGGING
1708                     U32 accepted_word= 0;
1709 #endif
1710                     U32 pointpos = 0;
1711
1712                     while ( state && uc <= (U8*)strend ) {
1713                         int failed=0;
1714                         U32 word = aho->states[ state ].wordnum;
1715
1716                         if( state==1 ) {
1717                             if ( bitmap ) {
1718                                 DEBUG_TRIE_EXECUTE_r(
1719                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1720                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1721                                             (char *)uc, do_utf8 );
1722                                         PerlIO_printf( Perl_debug_log,
1723                                             " Scanning for legal start char...\n");
1724                                     }
1725                                 );
1726                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1727                                     uc++;
1728                                 }
1729                                 s= (char *)uc;
1730                             }
1731                             if (uc >(U8*)last_start) break;
1732                         }
1733
1734                         if ( word ) {
1735                             U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1736                             if (!leftmost || lpos < leftmost) {
1737                                 DEBUG_r(accepted_word=word);
1738                                 leftmost= lpos;
1739                             }
1740                             if (base==0) break;
1741
1742                         }
1743                         points[pointpos++ % maxlen]= uc;
1744                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1745                                              uscan, len, uvc, charid, foldlen,
1746                                              foldbuf, uniflags);
1747                         DEBUG_TRIE_EXECUTE_r({
1748                             dump_exec_pos( (char *)uc, c, strend, real_start,
1749                                 s,   do_utf8 );
1750                             PerlIO_printf(Perl_debug_log,
1751                                 " Charid:%3u CP:%4"UVxf" ",
1752                                  charid, uvc);
1753                         });
1754
1755                         do {
1756 #ifdef DEBUGGING
1757                             word = aho->states[ state ].wordnum;
1758 #endif
1759                             base = aho->states[ state ].trans.base;
1760
1761                             DEBUG_TRIE_EXECUTE_r({
1762                                 if (failed)
1763                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1764                                         s,   do_utf8 );
1765                                 PerlIO_printf( Perl_debug_log,
1766                                     "%sState: %4"UVxf", word=%"UVxf,
1767                                     failed ? " Fail transition to " : "",
1768                                     (UV)state, (UV)word);
1769                             });
1770                             if ( base ) {
1771                                 U32 tmp;
1772                                 if (charid &&
1773                                      (base + charid > trie->uniquecharcount )
1774                                      && (base + charid - 1 - trie->uniquecharcount
1775                                             < trie->lasttrans)
1776                                      && trie->trans[base + charid - 1 -
1777                                             trie->uniquecharcount].check == state
1778                                      && (tmp=trie->trans[base + charid - 1 -
1779                                         trie->uniquecharcount ].next))
1780                                 {
1781                                     DEBUG_TRIE_EXECUTE_r(
1782                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1783                                     state = tmp;
1784                                     break;
1785                                 }
1786                                 else {
1787                                     DEBUG_TRIE_EXECUTE_r(
1788                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1789                                     failed = 1;
1790                                     state = aho->fail[state];
1791                                 }
1792                             }
1793                             else {
1794                                 /* we must be accepting here */
1795                                 DEBUG_TRIE_EXECUTE_r(
1796                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1797                                 failed = 1;
1798                                 break;
1799                             }
1800                         } while(state);
1801                         uc += len;
1802                         if (failed) {
1803                             if (leftmost)
1804                                 break;
1805                             if (!state) state = 1;
1806                         }
1807                     }
1808                     if ( aho->states[ state ].wordnum ) {
1809                         U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1810                         if (!leftmost || lpos < leftmost) {
1811                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1812                             leftmost = lpos;
1813                         }
1814                     }
1815                     if (leftmost) {
1816                         s = (char*)leftmost;
1817                         DEBUG_TRIE_EXECUTE_r({
1818                             PerlIO_printf(
1819                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1820                                 (UV)accepted_word, (IV)(s - real_start)
1821                             );
1822                         });
1823                         if (!reginfo || regtry(reginfo, &s)) {
1824                             FREETMPS;
1825                             LEAVE;
1826                             goto got_it;
1827                         }
1828                         s = HOPc(s,1);
1829                         DEBUG_TRIE_EXECUTE_r({
1830                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1831                         });
1832                     } else {
1833                         DEBUG_TRIE_EXECUTE_r(
1834                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1835                         break;
1836                     }
1837                 }
1838                 FREETMPS;
1839                 LEAVE;
1840             }
1841             break;
1842         default:
1843             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1844             break;
1845         }
1846         return 0;
1847       got_it:
1848         return s;
1849 }
1850
1851
1852 /*
1853  - regexec_flags - match a regexp against a string
1854  */
1855 I32
1856 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1857               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1858 /* strend: pointer to null at end of string */
1859 /* strbeg: real beginning of string */
1860 /* minend: end of match must be >=minend after stringarg. */
1861 /* data: May be used for some additional optimizations.
1862          Currently its only used, with a U32 cast, for transmitting
1863          the ganch offset when doing a /g match. This will change */
1864 /* nosave: For optimizations. */
1865 {
1866     dVAR;
1867     struct regexp *const prog = (struct regexp *)SvANY(rx);
1868     /*register*/ char *s;
1869     register regnode *c;
1870     /*register*/ char *startpos = stringarg;
1871     I32 minlen;         /* must match at least this many chars */
1872     I32 dontbother = 0; /* how many characters not to try at end */
1873     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1874     I32 scream_pos = -1;                /* Internal iterator of scream. */
1875     char *scream_olds = NULL;
1876     const bool do_utf8 = cBOOL(DO_UTF8(sv));
1877     I32 multiline;
1878     RXi_GET_DECL(prog,progi);
1879     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1880     regexp_paren_pair *swap = NULL;
1881     GET_RE_DEBUG_FLAGS_DECL;
1882
1883     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1884     PERL_UNUSED_ARG(data);
1885
1886     /* Be paranoid... */
1887     if (prog == NULL || startpos == NULL) {
1888         Perl_croak(aTHX_ "NULL regexp parameter");
1889         return 0;
1890     }
1891
1892     multiline = prog->extflags & RXf_PMf_MULTILINE;
1893     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1894
1895     RX_MATCH_UTF8_set(rx, do_utf8);
1896     DEBUG_EXECUTE_r(
1897         debug_start_match(rx, do_utf8, startpos, strend,
1898         "Matching");
1899     );
1900
1901     minlen = prog->minlen;
1902
1903     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1904         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1905                               "String too short [regexec_flags]...\n"));
1906         goto phooey;
1907     }
1908
1909
1910     /* Check validity of program. */
1911     if (UCHARAT(progi->program) != REG_MAGIC) {
1912         Perl_croak(aTHX_ "corrupted regexp program");
1913     }
1914
1915     PL_reg_flags = 0;
1916     PL_reg_eval_set = 0;
1917     PL_reg_maxiter = 0;
1918
1919     if (RX_UTF8(rx))
1920         PL_reg_flags |= RF_utf8;
1921
1922     /* Mark beginning of line for ^ and lookbehind. */
1923     reginfo.bol = startpos; /* XXX not used ??? */
1924     PL_bostr  = strbeg;
1925     reginfo.sv = sv;
1926
1927     /* Mark end of line for $ (and such) */
1928     PL_regeol = strend;
1929
1930     /* see how far we have to get to not match where we matched before */
1931     reginfo.till = startpos+minend;
1932
1933     /* If there is a "must appear" string, look for it. */
1934     s = startpos;
1935
1936     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1937         MAGIC *mg;
1938         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
1939             reginfo.ganch = startpos + prog->gofs;
1940             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1941               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1942         } else if (sv && SvTYPE(sv) >= SVt_PVMG
1943                   && SvMAGIC(sv)
1944                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1945                   && mg->mg_len >= 0) {
1946             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
1947             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1948                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1949
1950             if (prog->extflags & RXf_ANCH_GPOS) {
1951                 if (s > reginfo.ganch)
1952                     goto phooey;
1953                 s = reginfo.ganch - prog->gofs;
1954                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1955                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1956                 if (s < strbeg)
1957                     goto phooey;
1958             }
1959         }
1960         else if (data) {
1961             reginfo.ganch = strbeg + PTR2UV(data);
1962             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1963                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1964
1965         } else {                                /* pos() not defined */
1966             reginfo.ganch = strbeg;
1967             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1968                  "GPOS: reginfo.ganch = strbeg\n"));
1969         }
1970     }
1971     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1972         /* We have to be careful. If the previous successful match
1973            was from this regex we don't want a subsequent partially
1974            successful match to clobber the old results.
1975            So when we detect this possibility we add a swap buffer
1976            to the re, and switch the buffer each match. If we fail
1977            we switch it back, otherwise we leave it swapped.
1978         */
1979         swap = prog->offs;
1980         /* do we need a save destructor here for eval dies? */
1981         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1982     }
1983     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1984         re_scream_pos_data d;
1985
1986         d.scream_olds = &scream_olds;
1987         d.scream_pos = &scream_pos;
1988         s = re_intuit_start(rx, sv, s, strend, flags, &d);
1989         if (!s) {
1990             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1991             goto phooey;        /* not present */
1992         }
1993     }
1994
1995
1996
1997     /* Simplest case:  anchored match need be tried only once. */
1998     /*  [unless only anchor is BOL and multiline is set] */
1999     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2000         if (s == startpos && regtry(&reginfo, &startpos))
2001             goto got_it;
2002         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2003                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2004         {
2005             char *end;
2006
2007             if (minlen)
2008                 dontbother = minlen - 1;
2009             end = HOP3c(strend, -dontbother, strbeg) - 1;
2010             /* for multiline we only have to try after newlines */
2011             if (prog->check_substr || prog->check_utf8) {
2012                 if (s == startpos)
2013                     goto after_try;
2014                 while (1) {
2015                     if (regtry(&reginfo, &s))
2016                         goto got_it;
2017                   after_try:
2018                     if (s > end)
2019                         goto phooey;
2020                     if (prog->extflags & RXf_USE_INTUIT) {
2021                         s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2022                         if (!s)
2023                             goto phooey;
2024                     }
2025                     else
2026                         s++;
2027                 }
2028             } else {
2029                 if (s > startpos)
2030                     s--;
2031                 while (s < end) {
2032                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2033                         if (regtry(&reginfo, &s))
2034                             goto got_it;
2035                     }
2036                 }
2037             }
2038         }
2039         goto phooey;
2040     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2041     {
2042         /* the warning about reginfo.ganch being used without intialization
2043            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2044            and we only enter this block when the same bit is set. */
2045         char *tmp_s = reginfo.ganch - prog->gofs;
2046
2047         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2048             goto got_it;
2049         goto phooey;
2050     }
2051
2052     /* Messy cases:  unanchored match. */
2053     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2054         /* we have /x+whatever/ */
2055         /* it must be a one character string (XXXX Except UTF?) */
2056         char ch;
2057 #ifdef DEBUGGING
2058         int did_match = 0;
2059 #endif
2060         if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2061             do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2062         ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2063
2064         if (do_utf8) {
2065             REXEC_FBC_SCAN(
2066                 if (*s == ch) {
2067                     DEBUG_EXECUTE_r( did_match = 1 );
2068                     if (regtry(&reginfo, &s)) goto got_it;
2069                     s += UTF8SKIP(s);
2070                     while (s < strend && *s == ch)
2071                         s += UTF8SKIP(s);
2072                 }
2073             );
2074         }
2075         else {
2076             REXEC_FBC_SCAN(
2077                 if (*s == ch) {
2078                     DEBUG_EXECUTE_r( did_match = 1 );
2079                     if (regtry(&reginfo, &s)) goto got_it;
2080                     s++;
2081                     while (s < strend && *s == ch)
2082                         s++;
2083                 }
2084             );
2085         }
2086         DEBUG_EXECUTE_r(if (!did_match)
2087                 PerlIO_printf(Perl_debug_log,
2088                                   "Did not find anchored character...\n")
2089                );
2090     }
2091     else if (prog->anchored_substr != NULL
2092               || prog->anchored_utf8 != NULL
2093               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2094                   && prog->float_max_offset < strend - s)) {
2095         SV *must;
2096         I32 back_max;
2097         I32 back_min;
2098         char *last;
2099         char *last1;            /* Last position checked before */
2100 #ifdef DEBUGGING
2101         int did_match = 0;
2102 #endif
2103         if (prog->anchored_substr || prog->anchored_utf8) {
2104             if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2105                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2106             must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2107             back_max = back_min = prog->anchored_offset;
2108         } else {
2109             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2110                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2111             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2112             back_max = prog->float_max_offset;
2113             back_min = prog->float_min_offset;
2114         }
2115
2116
2117         if (must == &PL_sv_undef)
2118             /* could not downgrade utf8 check substring, so must fail */
2119             goto phooey;
2120
2121         if (back_min<0) {
2122             last = strend;
2123         } else {
2124             last = HOP3c(strend,        /* Cannot start after this */
2125                   -(I32)(CHR_SVLEN(must)
2126                          - (SvTAIL(must) != 0) + back_min), strbeg);
2127         }
2128         if (s > PL_bostr)
2129             last1 = HOPc(s, -1);
2130         else
2131             last1 = s - 1;      /* bogus */
2132
2133         /* XXXX check_substr already used to find "s", can optimize if
2134            check_substr==must. */
2135         scream_pos = -1;
2136         dontbother = end_shift;
2137         strend = HOPc(strend, -dontbother);
2138         while ( (s <= last) &&
2139                 ((flags & REXEC_SCREAM)
2140                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2141                                     end_shift, &scream_pos, 0))
2142                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2143                                   (unsigned char*)strend, must,
2144                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2145             /* we may be pointing at the wrong string */
2146             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2147                 s = strbeg + (s - SvPVX_const(sv));
2148             DEBUG_EXECUTE_r( did_match = 1 );
2149             if (HOPc(s, -back_max) > last1) {
2150                 last1 = HOPc(s, -back_min);
2151                 s = HOPc(s, -back_max);
2152             }
2153             else {
2154                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2155
2156                 last1 = HOPc(s, -back_min);
2157                 s = t;
2158             }
2159             if (do_utf8) {
2160                 while (s <= last1) {
2161                     if (regtry(&reginfo, &s))
2162                         goto got_it;
2163                     s += UTF8SKIP(s);
2164                 }
2165             }
2166             else {
2167                 while (s <= last1) {
2168                     if (regtry(&reginfo, &s))
2169                         goto got_it;
2170                     s++;
2171                 }
2172             }
2173         }
2174         DEBUG_EXECUTE_r(if (!did_match) {
2175             RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2176                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2177             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2178                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2179                                ? "anchored" : "floating"),
2180                 quoted, RE_SV_TAIL(must));
2181         });
2182         goto phooey;
2183     }
2184     else if ( (c = progi->regstclass) ) {
2185         if (minlen) {
2186             const OPCODE op = OP(progi->regstclass);
2187             /* don't bother with what can't match */
2188             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2189                 strend = HOPc(strend, -(minlen - 1));
2190         }
2191         DEBUG_EXECUTE_r({
2192             SV * const prop = sv_newmortal();
2193             regprop(prog, prop, c);
2194             {
2195                 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2196                     s,strend-s,60);
2197                 PerlIO_printf(Perl_debug_log,
2198                     "Matching stclass %.*s against %s (%d chars)\n",
2199                     (int)SvCUR(prop), SvPVX_const(prop),
2200                      quoted, (int)(strend - s));
2201             }
2202         });
2203         if (find_byclass(prog, c, s, strend, &reginfo))
2204             goto got_it;
2205         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2206     }
2207     else {
2208         dontbother = 0;
2209         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2210             /* Trim the end. */
2211             char *last;
2212             SV* float_real;
2213
2214             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2215                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2216             float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2217
2218             if (flags & REXEC_SCREAM) {
2219                 last = screaminstr(sv, float_real, s - strbeg,
2220                                    end_shift, &scream_pos, 1); /* last one */
2221                 if (!last)
2222                     last = scream_olds; /* Only one occurrence. */
2223                 /* we may be pointing at the wrong string */
2224                 else if (RXp_MATCH_COPIED(prog))
2225                     s = strbeg + (s - SvPVX_const(sv));
2226             }
2227             else {
2228                 STRLEN len;
2229                 const char * const little = SvPV_const(float_real, len);
2230
2231                 if (SvTAIL(float_real)) {
2232                     if (memEQ(strend - len + 1, little, len - 1))
2233                         last = strend - len + 1;
2234                     else if (!multiline)
2235                         last = memEQ(strend - len, little, len)
2236                             ? strend - len : NULL;
2237                     else
2238                         goto find_last;
2239                 } else {
2240                   find_last:
2241                     if (len)
2242                         last = rninstr(s, strend, little, little + len);
2243                     else
2244                         last = strend;  /* matching "$" */
2245                 }
2246             }
2247             if (last == NULL) {
2248                 DEBUG_EXECUTE_r(
2249                     PerlIO_printf(Perl_debug_log,
2250                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2251                         PL_colors[4], PL_colors[5]));
2252                 goto phooey; /* Should not happen! */
2253             }
2254             dontbother = strend - last + prog->float_min_offset;
2255         }
2256         if (minlen && (dontbother < minlen))
2257             dontbother = minlen - 1;
2258         strend -= dontbother;              /* this one's always in bytes! */
2259         /* We don't know much -- general case. */
2260         if (do_utf8) {
2261             for (;;) {
2262                 if (regtry(&reginfo, &s))
2263                     goto got_it;
2264                 if (s >= strend)
2265                     break;
2266                 s += UTF8SKIP(s);
2267             };
2268         }
2269         else {
2270             do {
2271                 if (regtry(&reginfo, &s))
2272                     goto got_it;
2273             } while (s++ < strend);
2274         }
2275     }
2276
2277     /* Failure. */
2278     goto phooey;
2279
2280 got_it:
2281     Safefree(swap);
2282     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2283
2284     if (PL_reg_eval_set)
2285         restore_pos(aTHX_ prog);
2286     if (RXp_PAREN_NAMES(prog))
2287         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2288
2289     /* make sure $`, $&, $', and $digit will work later */
2290     if ( !(flags & REXEC_NOT_FIRST) ) {
2291         RX_MATCH_COPY_FREE(rx);
2292         if (flags & REXEC_COPY_STR) {
2293             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2294 #ifdef PERL_OLD_COPY_ON_WRITE
2295             if ((SvIsCOW(sv)
2296                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2297                 if (DEBUG_C_TEST) {
2298                     PerlIO_printf(Perl_debug_log,
2299                                   "Copy on write: regexp capture, type %d\n",
2300                                   (int) SvTYPE(sv));
2301                 }
2302                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2303                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2304                 assert (SvPOKp(prog->saved_copy));
2305             } else
2306 #endif
2307             {
2308                 RX_MATCH_COPIED_on(rx);
2309                 s = savepvn(strbeg, i);
2310                 prog->subbeg = s;
2311             }
2312             prog->sublen = i;
2313         }
2314         else {
2315             prog->subbeg = strbeg;
2316             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2317         }
2318     }
2319
2320     return 1;
2321
2322 phooey:
2323     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2324                           PL_colors[4], PL_colors[5]));
2325     if (PL_reg_eval_set)
2326         restore_pos(aTHX_ prog);
2327     if (swap) {
2328         /* we failed :-( roll it back */
2329         Safefree(prog->offs);
2330         prog->offs = swap;
2331     }
2332
2333     return 0;
2334 }
2335
2336
2337 /*
2338  - regtry - try match at specific point
2339  */
2340 STATIC I32                      /* 0 failure, 1 success */
2341 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2342 {
2343     dVAR;
2344     CHECKPOINT lastcp;
2345     REGEXP *const rx = reginfo->prog;
2346     regexp *const prog = (struct regexp *)SvANY(rx);
2347     RXi_GET_DECL(prog,progi);
2348     GET_RE_DEBUG_FLAGS_DECL;
2349
2350     PERL_ARGS_ASSERT_REGTRY;
2351
2352     reginfo->cutpoint=NULL;
2353
2354     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2355         MAGIC *mg;
2356
2357         PL_reg_eval_set = RS_init;
2358         DEBUG_EXECUTE_r(DEBUG_s(
2359             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2360                           (IV)(PL_stack_sp - PL_stack_base));
2361             ));
2362         SAVESTACK_CXPOS();
2363         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2364         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2365         SAVETMPS;
2366         /* Apparently this is not needed, judging by wantarray. */
2367         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2368            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2369
2370         if (reginfo->sv) {
2371             /* Make $_ available to executed code. */
2372             if (reginfo->sv != DEFSV) {
2373                 SAVE_DEFSV;
2374                 DEFSV_set(reginfo->sv);
2375             }
2376
2377             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2378                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2379                 /* prepare for quick setting of pos */
2380 #ifdef PERL_OLD_COPY_ON_WRITE
2381                 if (SvIsCOW(reginfo->sv))
2382                     sv_force_normal_flags(reginfo->sv, 0);
2383 #endif
2384                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2385                                  &PL_vtbl_mglob, NULL, 0);
2386                 mg->mg_len = -1;
2387             }
2388             PL_reg_magic    = mg;
2389             PL_reg_oldpos   = mg->mg_len;
2390             SAVEDESTRUCTOR_X(restore_pos, prog);
2391         }
2392         if (!PL_reg_curpm) {
2393             Newxz(PL_reg_curpm, 1, PMOP);
2394 #ifdef USE_ITHREADS
2395             {
2396                 SV* const repointer = &PL_sv_undef;
2397                 /* this regexp is also owned by the new PL_reg_curpm, which
2398                    will try to free it.  */
2399                 av_push(PL_regex_padav, repointer);
2400                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2401                 PL_regex_pad = AvARRAY(PL_regex_padav);
2402             }
2403 #endif
2404         }
2405 #ifdef USE_ITHREADS
2406         /* It seems that non-ithreads works both with and without this code.
2407            So for efficiency reasons it seems best not to have the code
2408            compiled when it is not needed.  */
2409         /* This is safe against NULLs: */
2410         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2411         /* PM_reg_curpm owns a reference to this regexp.  */
2412         ReREFCNT_inc(rx);
2413 #endif
2414         PM_SETRE(PL_reg_curpm, rx);
2415         PL_reg_oldcurpm = PL_curpm;
2416         PL_curpm = PL_reg_curpm;
2417         if (RXp_MATCH_COPIED(prog)) {
2418             /*  Here is a serious problem: we cannot rewrite subbeg,
2419                 since it may be needed if this match fails.  Thus
2420                 $` inside (?{}) could fail... */
2421             PL_reg_oldsaved = prog->subbeg;
2422             PL_reg_oldsavedlen = prog->sublen;
2423 #ifdef PERL_OLD_COPY_ON_WRITE
2424             PL_nrs = prog->saved_copy;
2425 #endif
2426             RXp_MATCH_COPIED_off(prog);
2427         }
2428         else
2429             PL_reg_oldsaved = NULL;
2430         prog->subbeg = PL_bostr;
2431         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2432     }
2433     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2434     prog->offs[0].start = *startpos - PL_bostr;
2435     PL_reginput = *startpos;
2436     PL_reglastparen = &prog->lastparen;
2437     PL_reglastcloseparen = &prog->lastcloseparen;
2438     prog->lastparen = 0;
2439     prog->lastcloseparen = 0;
2440     PL_regsize = 0;
2441     PL_regoffs = prog->offs;
2442     if (PL_reg_start_tmpl <= prog->nparens) {
2443         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2444         if(PL_reg_start_tmp)
2445             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2446         else
2447             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2448     }
2449
2450     /* XXXX What this code is doing here?!!!  There should be no need
2451        to do this again and again, PL_reglastparen should take care of
2452        this!  --ilya*/
2453
2454     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2455      * Actually, the code in regcppop() (which Ilya may be meaning by
2456      * PL_reglastparen), is not needed at all by the test suite
2457      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2458      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2459      * Meanwhile, this code *is* needed for the
2460      * above-mentioned test suite tests to succeed.  The common theme
2461      * on those tests seems to be returning null fields from matches.
2462      * --jhi updated by dapm */
2463 #if 1
2464     if (prog->nparens) {
2465         regexp_paren_pair *pp = PL_regoffs;
2466         register I32 i;
2467         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2468             ++pp;
2469             pp->start = -1;
2470             pp->end = -1;
2471         }
2472     }
2473 #endif
2474     REGCP_SET(lastcp);
2475     if (regmatch(reginfo, progi->program + 1)) {
2476         PL_regoffs[0].end = PL_reginput - PL_bostr;
2477         return 1;
2478     }
2479     if (reginfo->cutpoint)
2480         *startpos= reginfo->cutpoint;
2481     REGCP_UNWIND(lastcp);
2482     return 0;
2483 }
2484
2485
2486 #define sayYES goto yes
2487 #define sayNO goto no
2488 #define sayNO_SILENT goto no_silent
2489
2490 /* we dont use STMT_START/END here because it leads to
2491    "unreachable code" warnings, which are bogus, but distracting. */
2492 #define CACHEsayNO \
2493     if (ST.cache_mask) \
2494        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2495     sayNO
2496
2497 /* this is used to determine how far from the left messages like
2498    'failed...' are printed. It should be set such that messages
2499    are inline with the regop output that created them.
2500 */
2501 #define REPORT_CODE_OFF 32
2502
2503
2504 /* Make sure there is a test for this +1 options in re_tests */
2505 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2506
2507 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2508 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2509
2510 #define SLAB_FIRST(s) (&(s)->states[0])
2511 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2512
2513 /* grab a new slab and return the first slot in it */
2514
2515 STATIC regmatch_state *
2516 S_push_slab(pTHX)
2517 {
2518 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2519     dMY_CXT;
2520 #endif
2521     regmatch_slab *s = PL_regmatch_slab->next;
2522     if (!s) {
2523         Newx(s, 1, regmatch_slab);
2524         s->prev = PL_regmatch_slab;
2525         s->next = NULL;
2526         PL_regmatch_slab->next = s;
2527     }
2528     PL_regmatch_slab = s;
2529     return SLAB_FIRST(s);
2530 }
2531
2532
2533 /* push a new state then goto it */
2534
2535 #define PUSH_STATE_GOTO(state, node) \
2536     scan = node; \
2537     st->resume_state = state; \
2538     goto push_state;
2539
2540 /* push a new state with success backtracking, then goto it */
2541
2542 #define PUSH_YES_STATE_GOTO(state, node) \
2543     scan = node; \
2544     st->resume_state = state; \
2545     goto push_yes_state;
2546
2547
2548
2549 /*
2550
2551 regmatch() - main matching routine
2552
2553 This is basically one big switch statement in a loop. We execute an op,
2554 set 'next' to point the next op, and continue. If we come to a point which
2555 we may need to backtrack to on failure such as (A|B|C), we push a
2556 backtrack state onto the backtrack stack. On failure, we pop the top
2557 state, and re-enter the loop at the state indicated. If there are no more
2558 states to pop, we return failure.
2559
2560 Sometimes we also need to backtrack on success; for example /A+/, where
2561 after successfully matching one A, we need to go back and try to
2562 match another one; similarly for lookahead assertions: if the assertion
2563 completes successfully, we backtrack to the state just before the assertion
2564 and then carry on.  In these cases, the pushed state is marked as
2565 'backtrack on success too'. This marking is in fact done by a chain of
2566 pointers, each pointing to the previous 'yes' state. On success, we pop to
2567 the nearest yes state, discarding any intermediate failure-only states.
2568 Sometimes a yes state is pushed just to force some cleanup code to be
2569 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2570 it to free the inner regex.
2571
2572 Note that failure backtracking rewinds the cursor position, while
2573 success backtracking leaves it alone.
2574
2575 A pattern is complete when the END op is executed, while a subpattern
2576 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2577 ops trigger the "pop to last yes state if any, otherwise return true"
2578 behaviour.
2579
2580 A common convention in this function is to use A and B to refer to the two
2581 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2582 the subpattern to be matched possibly multiple times, while B is the entire
2583 rest of the pattern. Variable and state names reflect this convention.
2584
2585 The states in the main switch are the union of ops and failure/success of
2586 substates associated with with that op.  For example, IFMATCH is the op
2587 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2588 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2589 successfully matched A and IFMATCH_A_fail is a state saying that we have
2590 just failed to match A. Resume states always come in pairs. The backtrack
2591 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2592 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2593 on success or failure.
2594
2595 The struct that holds a backtracking state is actually a big union, with
2596 one variant for each major type of op. The variable st points to the
2597 top-most backtrack struct. To make the code clearer, within each
2598 block of code we #define ST to alias the relevant union.
2599
2600 Here's a concrete example of a (vastly oversimplified) IFMATCH
2601 implementation:
2602
2603     switch (state) {
2604     ....
2605
2606 #define ST st->u.ifmatch
2607
2608     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2609         ST.foo = ...; // some state we wish to save
2610         ...
2611         // push a yes backtrack state with a resume value of
2612         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2613         // first node of A:
2614         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2615         // NOTREACHED
2616
2617     case IFMATCH_A: // we have successfully executed A; now continue with B
2618         next = B;
2619         bar = ST.foo; // do something with the preserved value
2620         break;
2621
2622     case IFMATCH_A_fail: // A failed, so the assertion failed
2623         ...;   // do some housekeeping, then ...
2624         sayNO; // propagate the failure
2625
2626 #undef ST
2627
2628     ...
2629     }
2630
2631 For any old-timers reading this who are familiar with the old recursive
2632 approach, the code above is equivalent to:
2633
2634     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2635     {
2636         int foo = ...
2637         ...
2638         if (regmatch(A)) {
2639             next = B;
2640             bar = foo;
2641             break;
2642         }
2643         ...;   // do some housekeeping, then ...
2644         sayNO; // propagate the failure
2645     }
2646
2647 The topmost backtrack state, pointed to by st, is usually free. If you
2648 want to claim it, populate any ST.foo fields in it with values you wish to
2649 save, then do one of
2650
2651         PUSH_STATE_GOTO(resume_state, node);
2652         PUSH_YES_STATE_GOTO(resume_state, node);
2653
2654 which sets that backtrack state's resume value to 'resume_state', pushes a
2655 new free entry to the top of the backtrack stack, then goes to 'node'.
2656 On backtracking, the free slot is popped, and the saved state becomes the
2657 new free state. An ST.foo field in this new top state can be temporarily
2658 accessed to retrieve values, but once the main loop is re-entered, it
2659 becomes available for reuse.
2660
2661 Note that the depth of the backtrack stack constantly increases during the
2662 left-to-right execution of the pattern, rather than going up and down with
2663 the pattern nesting. For example the stack is at its maximum at Z at the
2664 end of the pattern, rather than at X in the following:
2665
2666     /(((X)+)+)+....(Y)+....Z/
2667
2668 The only exceptions to this are lookahead/behind assertions and the cut,
2669 (?>A), which pop all the backtrack states associated with A before
2670 continuing.
2671
2672 Bascktrack state structs are allocated in slabs of about 4K in size.
2673 PL_regmatch_state and st always point to the currently active state,
2674 and PL_regmatch_slab points to the slab currently containing
2675 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2676 allocated, and is never freed until interpreter destruction. When the slab
2677 is full, a new one is allocated and chained to the end. At exit from
2678 regmatch(), slabs allocated since entry are freed.
2679
2680 */
2681
2682
2683 #define DEBUG_STATE_pp(pp)                                  \
2684     DEBUG_STATE_r({                                         \
2685         DUMP_EXEC_POS(locinput, scan, do_utf8);             \
2686         PerlIO_printf(Perl_debug_log,                       \
2687             "    %*s"pp" %s%s%s%s%s\n",                     \
2688             depth*2, "",                                    \
2689             PL_reg_name[st->resume_state],                     \
2690             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2691             ((st==yes_state) ? "Y" : ""),                   \
2692             ((st==mark_state) ? "M" : ""),                  \
2693             ((st==yes_state||st==mark_state) ? "]" : "")    \
2694         );                                                  \
2695     });
2696
2697
2698 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2699
2700 #ifdef DEBUGGING
2701
2702 STATIC void
2703 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2704     const char *start, const char *end, const char *blurb)
2705 {
2706     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2707
2708     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2709
2710     if (!PL_colorset)
2711             reginitcolors();
2712     {
2713         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2714             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2715
2716         RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2717             start, end - start, 60);
2718
2719         PerlIO_printf(Perl_debug_log,
2720             "%s%s REx%s %s against %s\n",
2721                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2722
2723         if (do_utf8||utf8_pat)
2724             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2725                 utf8_pat ? "pattern" : "",
2726                 utf8_pat && do_utf8 ? " and " : "",
2727                 do_utf8 ? "string" : ""
2728             );
2729     }
2730 }
2731
2732 STATIC void
2733 S_dump_exec_pos(pTHX_ const char *locinput,
2734                       const regnode *scan,
2735                       const char *loc_regeol,
2736                       const char *loc_bostr,
2737                       const char *loc_reg_starttry,
2738                       const bool do_utf8)
2739 {
2740     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2741     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2742     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2743     /* The part of the string before starttry has one color
2744        (pref0_len chars), between starttry and current
2745        position another one (pref_len - pref0_len chars),
2746        after the current position the third one.
2747        We assume that pref0_len <= pref_len, otherwise we
2748        decrease pref0_len.  */
2749     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2750         ? (5 + taill) - l : locinput - loc_bostr;
2751     int pref0_len;
2752
2753     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2754
2755     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2756         pref_len++;
2757     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2758     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2759         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2760               ? (5 + taill) - pref_len : loc_regeol - locinput);
2761     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2762         l--;
2763     if (pref0_len < 0)
2764         pref0_len = 0;
2765     if (pref0_len > pref_len)
2766         pref0_len = pref_len;
2767     {
2768         const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2769
2770         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2771             (locinput - pref_len),pref0_len, 60, 4, 5);
2772
2773         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2774                     (locinput - pref_len + pref0_len),
2775                     pref_len - pref0_len, 60, 2, 3);
2776
2777         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2778                     locinput, loc_regeol - locinput, 10, 0, 1);
2779
2780         const STRLEN tlen=len0+len1+len2;
2781         PerlIO_printf(Perl_debug_log,
2782                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2783                     (IV)(locinput - loc_bostr),
2784                     len0, s0,
2785                     len1, s1,
2786                     (docolor ? "" : "> <"),
2787                     len2, s2,
2788                     (int)(tlen > 19 ? 0 :  19 - tlen),
2789                     "");
2790     }
2791 }
2792
2793 #endif
2794
2795 /* reg_check_named_buff_matched()
2796  * Checks to see if a named buffer has matched. The data array of
2797  * buffer numbers corresponding to the buffer is expected to reside
2798  * in the regexp->data->data array in the slot stored in the ARG() of
2799  * node involved. Note that this routine doesn't actually care about the
2800  * name, that information is not preserved from compilation to execution.
2801  * Returns the index of the leftmost defined buffer with the given name
2802  * or 0 if non of the buffers matched.
2803  */
2804 STATIC I32
2805 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2806 {
2807     I32 n;
2808     RXi_GET_DECL(rex,rexi);
2809     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2810     I32 *nums=(I32*)SvPVX(sv_dat);
2811
2812     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2813
2814     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2815         if ((I32)*PL_reglastparen >= nums[n] &&
2816             PL_regoffs[nums[n]].end != -1)
2817         {
2818             return nums[n];
2819         }
2820     }
2821     return 0;
2822 }
2823
2824
2825 /* free all slabs above current one  - called during LEAVE_SCOPE */
2826
2827 STATIC void
2828 S_clear_backtrack_stack(pTHX_ void *p)
2829 {
2830     regmatch_slab *s = PL_regmatch_slab->next;
2831     PERL_UNUSED_ARG(p);
2832
2833     if (!s)
2834         return;
2835     PL_regmatch_slab->next = NULL;
2836     while (s) {
2837         regmatch_slab * const osl = s;
2838         s = s->next;
2839         Safefree(osl);
2840     }
2841 }
2842
2843
2844 #define SETREX(Re1,Re2) \
2845     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2846     Re1 = (Re2)
2847
2848 STATIC I32                      /* 0 failure, 1 success */
2849 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2850 {
2851 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2852     dMY_CXT;
2853 #endif
2854     dVAR;
2855     register const bool do_utf8 = PL_reg_match_utf8;
2856     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2857     REGEXP *rex_sv = reginfo->prog;
2858     regexp *rex = (struct regexp *)SvANY(rex_sv);
2859     RXi_GET_DECL(rex,rexi);
2860     I32 oldsave;
2861     /* the current state. This is a cached copy of PL_regmatch_state */
2862     register regmatch_state *st;
2863     /* cache heavy used fields of st in registers */
2864     register regnode *scan;
2865     register regnode *next;
2866     register U32 n = 0; /* general value; init to avoid compiler warning */
2867     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2868     register char *locinput = PL_reginput;
2869     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2870
2871     bool result = 0;        /* return value of S_regmatch */
2872     int depth = 0;          /* depth of backtrack stack */
2873     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2874     const U32 max_nochange_depth =
2875         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2876         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2877     regmatch_state *yes_state = NULL; /* state to pop to on success of
2878                                                             subpattern */
2879     /* mark_state piggy backs on the yes_state logic so that when we unwind
2880        the stack on success we can update the mark_state as we go */
2881     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2882     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2883     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2884     U32 state_num;
2885     bool no_final = 0;      /* prevent failure from backtracking? */
2886     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2887     char *startpoint = PL_reginput;
2888     SV *popmark = NULL;     /* are we looking for a mark? */
2889     SV *sv_commit = NULL;   /* last mark name seen in failure */
2890     SV *sv_yes_mark = NULL; /* last mark name we have seen
2891                                during a successfull match */
2892     U32 lastopen = 0;       /* last open we saw */
2893     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2894     SV* const oreplsv = GvSV(PL_replgv);
2895     /* these three flags are set by various ops to signal information to
2896      * the very next op. They have a useful lifetime of exactly one loop
2897      * iteration, and are not preserved or restored by state pushes/pops
2898      */
2899     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2900     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2901     int logical = 0;        /* the following EVAL is:
2902                                 0: (?{...})
2903                                 1: (?(?{...})X|Y)
2904                                 2: (??{...})
2905                                or the following IFMATCH/UNLESSM is:
2906                                 false: plain (?=foo)
2907                                 true:  used as a condition: (?(?=foo))
2908                             */
2909 #ifdef DEBUGGING
2910     GET_RE_DEBUG_FLAGS_DECL;
2911 #endif
2912
2913     PERL_ARGS_ASSERT_REGMATCH;
2914
2915     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2916             PerlIO_printf(Perl_debug_log,"regmatch start\n");
2917     }));
2918     /* on first ever call to regmatch, allocate first slab */
2919     if (!PL_regmatch_slab) {
2920         Newx(PL_regmatch_slab, 1, regmatch_slab);
2921         PL_regmatch_slab->prev = NULL;
2922         PL_regmatch_slab->next = NULL;
2923         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2924     }
2925
2926     oldsave = PL_savestack_ix;
2927     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2928     SAVEVPTR(PL_regmatch_slab);
2929     SAVEVPTR(PL_regmatch_state);
2930
2931     /* grab next free state slot */
2932     st = ++PL_regmatch_state;
2933     if (st >  SLAB_LAST(PL_regmatch_slab))
2934         st = PL_regmatch_state = S_push_slab(aTHX);
2935
2936     /* Note that nextchr is a byte even in UTF */
2937     nextchr = UCHARAT(locinput);
2938     scan = prog;
2939     while (scan != NULL) {
2940
2941         DEBUG_EXECUTE_r( {
2942             SV * const prop = sv_newmortal();
2943             regnode *rnext=regnext(scan);
2944             DUMP_EXEC_POS( locinput, scan, do_utf8 );
2945             regprop(rex, prop, scan);
2946
2947             PerlIO_printf(Perl_debug_log,
2948                     "%3"IVdf":%*s%s(%"IVdf")\n",
2949                     (IV)(scan - rexi->program), depth*2, "",
2950                     SvPVX_const(prop),
2951                     (PL_regkind[OP(scan)] == END || !rnext) ?
2952                         0 : (IV)(rnext - rexi->program));
2953         });
2954
2955         next = scan + NEXT_OFF(scan);
2956         if (next == scan)
2957             next = NULL;
2958         state_num = OP(scan);
2959
2960       reenter_switch:
2961
2962         assert(PL_reglastparen == &rex->lastparen);
2963         assert(PL_reglastcloseparen == &rex->lastcloseparen);
2964         assert(PL_regoffs == rex->offs);
2965
2966         switch (state_num) {
2967         case BOL:
2968             if (locinput == PL_bostr)
2969             {
2970                 /* reginfo->till = reginfo->bol; */
2971                 break;
2972             }
2973             sayNO;
2974         case MBOL:
2975             if (locinput == PL_bostr ||
2976                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2977             {
2978                 break;
2979             }
2980             sayNO;
2981         case SBOL:
2982             if (locinput == PL_bostr)
2983                 break;
2984             sayNO;
2985         case GPOS:
2986             if (locinput == reginfo->ganch)
2987                 break;
2988             sayNO;
2989
2990         case KEEPS:
2991             /* update the startpoint */
2992             st->u.keeper.val = PL_regoffs[0].start;
2993             PL_reginput = locinput;
2994             PL_regoffs[0].start = locinput - PL_bostr;
2995             PUSH_STATE_GOTO(KEEPS_next, next);
2996             /*NOT-REACHED*/
2997         case KEEPS_next_fail:
2998             /* rollback the start point change */
2999             PL_regoffs[0].start = st->u.keeper.val;
3000             sayNO_SILENT;
3001             /*NOT-REACHED*/
3002         case EOL:
3003                 goto seol;
3004         case MEOL:
3005             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3006                 sayNO;
3007             break;
3008         case SEOL:
3009           seol:
3010             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3011                 sayNO;
3012             if (PL_regeol - locinput > 1)
3013                 sayNO;
3014             break;
3015         case EOS:
3016             if (PL_regeol != locinput)
3017                 sayNO;
3018             break;
3019         case SANY:
3020             if (!nextchr && locinput >= PL_regeol)
3021                 sayNO;
3022             if (do_utf8) {
3023                 locinput += PL_utf8skip[nextchr];
3024                 if (locinput > PL_regeol)
3025                     sayNO;
3026                 nextchr = UCHARAT(locinput);
3027             }
3028             else
3029                 nextchr = UCHARAT(++locinput);
3030             break;
3031         case CANY:
3032             if (!nextchr && locinput >= PL_regeol)
3033                 sayNO;
3034             nextchr = UCHARAT(++locinput);
3035             break;
3036         case REG_ANY:
3037             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3038                 sayNO;
3039             if (do_utf8) {
3040                 locinput += PL_utf8skip[nextchr];
3041                 if (locinput > PL_regeol)
3042                     sayNO;
3043                 nextchr = UCHARAT(locinput);
3044             }
3045             else
3046                 nextchr = UCHARAT(++locinput);
3047             break;
3048
3049 #undef  ST
3050 #define ST st->u.trie
3051         case TRIEC:
3052             /* In this case the charclass data is available inline so
3053                we can fail fast without a lot of extra overhead.
3054              */
3055             if (scan->flags == EXACT || !do_utf8) {
3056                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3057                     DEBUG_EXECUTE_r(
3058                         PerlIO_printf(Perl_debug_log,
3059                                   "%*s  %sfailed to match trie start class...%s\n",
3060                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3061                     );
3062                     sayNO_SILENT;
3063                     /* NOTREACHED */
3064                 }
3065             }
3066             /* FALL THROUGH */
3067         case TRIE:
3068             {
3069                 /* what type of TRIE am I? (utf8 makes this contextual) */
3070                 DECL_TRIE_TYPE(scan);
3071
3072                 /* what trie are we using right now */
3073                 reg_trie_data * const trie
3074                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3075                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3076                 U32 state = trie->startstate;
3077
3078                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3079                     !TRIE_BITMAP_TEST(trie,*locinput)
3080                 ) {
3081                     if (trie->states[ state ].wordnum) {
3082                          DEBUG_EXECUTE_r(
3083                             PerlIO_printf(Perl_debug_log,
3084                                           "%*s  %smatched empty string...%s\n",
3085                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3086                         );
3087                         break;
3088                     } else {
3089                         DEBUG_EXECUTE_r(
3090                             PerlIO_printf(Perl_debug_log,
3091                                           "%*s  %sfailed to match trie start class...%s\n",
3092                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3093                         );
3094                         sayNO_SILENT;
3095                    }
3096                 }
3097
3098             {
3099                 U8 *uc = ( U8* )locinput;
3100
3101                 STRLEN len = 0;
3102                 STRLEN foldlen = 0;
3103                 U8 *uscan = (U8*)NULL;
3104                 STRLEN bufflen=0;
3105                 SV *sv_accept_buff = NULL;
3106                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3107
3108                 ST.accepted = 0; /* how many accepting states we have seen */
3109                 ST.B = next;
3110                 ST.jump = trie->jump;
3111                 ST.me = scan;
3112                 /*
3113                    traverse the TRIE keeping track of all accepting states
3114                    we transition through until we get to a failing node.
3115                 */
3116
3117                 while ( state && uc <= (U8*)PL_regeol ) {
3118                     U32 base = trie->states[ state ].trans.base;
3119                     UV uvc = 0;
3120                     U16 charid;
3121                     /* We use charid to hold the wordnum as we don't use it
3122                        for charid until after we have done the wordnum logic.
3123                        We define an alias just so that the wordnum logic reads
3124                        more naturally. */
3125
3126 #define got_wordnum charid
3127                     got_wordnum = trie->states[ state ].wordnum;
3128
3129                     if ( got_wordnum ) {
3130                         if ( ! ST.accepted ) {
3131                             ENTER;
3132                             SAVETMPS; /* XXX is this necessary? dmq */
3133                             bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3134                             sv_accept_buff=newSV(bufflen *
3135                                             sizeof(reg_trie_accepted) - 1);
3136                             SvCUR_set(sv_accept_buff, 0);
3137                             SvPOK_on(sv_accept_buff);
3138                             sv_2mortal(sv_accept_buff);
3139                             SAVETMPS;
3140                             ST.accept_buff =
3141                                 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3142                         }
3143                         do {
3144                             if (ST.accepted >= bufflen) {
3145                                 bufflen *= 2;
3146                                 ST.accept_buff =(reg_trie_accepted*)
3147                                     SvGROW(sv_accept_buff,
3148                                         bufflen * sizeof(reg_trie_accepted));
3149                             }
3150                             SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3151                                 + sizeof(reg_trie_accepted));
3152
3153
3154                             ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3155                             ST.accept_buff[ST.accepted].endpos = uc;
3156                             ++ST.accepted;
3157                         } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3158                     }
3159 #undef got_wordnum
3160
3161                     DEBUG_TRIE_EXECUTE_r({
3162                                 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3163                                 PerlIO_printf( Perl_debug_log,
3164                                     "%*s  %sState: %4"UVxf" Accepted: %4"UVxf" ",
3165                                     2+depth * 2, "", PL_colors[4],
3166                                     (UV)state, (UV)ST.accepted );
3167                     });
3168
3169                     if ( base ) {
3170                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3171                                              uscan, len, uvc, charid, foldlen,
3172                                              foldbuf, uniflags);
3173
3174                         if (charid &&
3175                              (base + charid > trie->uniquecharcount )
3176                              && (base + charid - 1 - trie->uniquecharcount
3177                                     < trie->lasttrans)
3178                              && trie->trans[base + charid - 1 -
3179                                     trie->uniquecharcount].check == state)
3180                         {
3181                             state = trie->trans[base + charid - 1 -
3182                                 trie->uniquecharcount ].next;
3183                         }
3184                         else {
3185                             state = 0;
3186                         }
3187                         uc += len;
3188
3189                     }
3190                     else {
3191                         state = 0;
3192                     }
3193                     DEBUG_TRIE_EXECUTE_r(
3194                         PerlIO_printf( Perl_debug_log,
3195                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3196                             charid, uvc, (UV)state, PL_colors[5] );
3197                     );
3198                 }
3199                 if (!ST.accepted )
3200                    sayNO;
3201
3202                 DEBUG_EXECUTE_r(
3203                     PerlIO_printf( Perl_debug_log,
3204                         "%*s  %sgot %"IVdf" possible matches%s\n",
3205                         REPORT_CODE_OFF + depth * 2, "",
3206                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3207                 );
3208             }}
3209             goto trie_first_try; /* jump into the fail handler */
3210             /* NOTREACHED */
3211         case TRIE_next_fail: /* we failed - try next alterative */
3212             if ( ST.jump) {
3213                 REGCP_UNWIND(ST.cp);
3214                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3215                     PL_regoffs[n].end = -1;
3216                 *PL_reglastparen = n;
3217             }
3218           trie_first_try:
3219             if (do_cutgroup) {
3220                 do_cutgroup = 0;
3221                 no_final = 0;
3222             }
3223
3224             if ( ST.jump) {
3225                 ST.lastparen = *PL_reglastparen;
3226                 REGCP_SET(ST.cp);
3227             }
3228             if ( ST.accepted == 1 ) {
3229                 /* only one choice left - just continue */
3230                 DEBUG_EXECUTE_r({
3231                     AV *const trie_words
3232                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3233                     SV ** const tmp = av_fetch( trie_words,
3234                         ST.accept_buff[ 0 ].wordnum-1, 0 );
3235                     SV *sv= tmp ? sv_newmortal() : NULL;
3236
3237                     PerlIO_printf( Perl_debug_log,
3238                         "%*s  %sonly one match left: #%d <%s>%s\n",
3239                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3240                         ST.accept_buff[ 0 ].wordnum,
3241                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3242                                 PL_colors[0], PL_colors[1],
3243                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3244                             )
3245                         : "not compiled under -Dr",
3246                         PL_colors[5] );
3247                 });
3248                 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3249                 /* in this case we free tmps/leave before we call regmatch
3250                    as we wont be using accept_buff again. */
3251
3252                 locinput = PL_reginput;
3253                 nextchr = UCHARAT(locinput);
3254                 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3255                     scan = ST.B;
3256                 else
3257                     scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3258                 if (!has_cutgroup) {
3259                     FREETMPS;
3260                     LEAVE;
3261                 } else {
3262                     ST.accepted--;
3263                     PUSH_YES_STATE_GOTO(TRIE_next, scan);
3264                 }
3265
3266                 continue; /* execute rest of RE */
3267             }
3268
3269             if ( !ST.accepted-- ) {
3270                 DEBUG_EXECUTE_r({
3271                     PerlIO_printf( Perl_debug_log,
3272                         "%*s  %sTRIE failed...%s\n",
3273                         REPORT_CODE_OFF+depth*2, "",
3274                         PL_colors[4],
3275                         PL_colors[5] );
3276                 });
3277                 FREETMPS;
3278                 LEAVE;
3279                 sayNO_SILENT;
3280                 /*NOTREACHED*/
3281             }
3282
3283             /*
3284                There are at least two accepting states left.  Presumably
3285                the number of accepting states is going to be low,
3286                typically two. So we simply scan through to find the one
3287                with lowest wordnum.  Once we find it, we swap the last
3288                state into its place and decrement the size. We then try to
3289                match the rest of the pattern at the point where the word
3290                ends. If we succeed, control just continues along the
3291                regex; if we fail we return here to try the next accepting
3292                state
3293              */
3294
3295             {
3296                 U32 best = 0;
3297                 U32 cur;
3298                 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3299                     DEBUG_TRIE_EXECUTE_r(
3300                         PerlIO_printf( Perl_debug_log,
3301                             "%*s  %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3302                             REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3303                             (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3304                             ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3305                     );
3306
3307                     if (ST.accept_buff[cur].wordnum <
3308                             ST.accept_buff[best].wordnum)
3309                         best = cur;
3310                 }
3311
3312                 DEBUG_EXECUTE_r({
3313                     AV *const trie_words
3314                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3315                     SV ** const tmp = av_fetch( trie_words,
3316                         ST.accept_buff[ best ].wordnum - 1, 0 );
3317                     regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3318                                     ST.B :
3319                                     ST.me + ST.jump[ST.accept_buff[best].wordnum];
3320                     SV *sv= tmp ? sv_newmortal() : NULL;
3321
3322                     PerlIO_printf( Perl_debug_log,
3323                         "%*s  %strying alternation #%d <%s> at node #%d %s\n",
3324                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3325                         ST.accept_buff[best].wordnum,
3326                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3327                                 PL_colors[0], PL_colors[1],
3328                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3329                             ) : "not compiled under -Dr",
3330                             REG_NODE_NUM(nextop),
3331                         PL_colors[5] );
3332                 });
3333
3334                 if ( best<ST.accepted ) {
3335                     reg_trie_accepted tmp = ST.accept_buff[ best ];
3336                     ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3337                     ST.accept_buff[ ST.accepted ] = tmp;
3338                     best = ST.accepted;
3339                 }
3340                 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3341                 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3342                     scan = ST.B;
3343                 } else {
3344                     scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3345                 }
3346                 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3347                 /* NOTREACHED */
3348             }
3349             /* NOTREACHED */
3350         case TRIE_next:
3351             /* we dont want to throw this away, see bug 57042*/
3352             if (oreplsv != GvSV(PL_replgv))
3353                 sv_setsv(oreplsv, GvSV(PL_replgv));
3354             FREETMPS;
3355             LEAVE;
3356             sayYES;
3357 #undef  ST
3358
3359         case EXACT: {
3360             char *s = STRING(scan);
3361             ln = STR_LEN(scan);
3362             if (do_utf8 != UTF) {
3363                 /* The target and the pattern have differing utf8ness. */
3364                 char *l = locinput;
3365                 const char * const e = s + ln;
3366
3367                 if (do_utf8) {
3368                     /* The target is utf8, the pattern is not utf8. */
3369                     while (s < e) {
3370                         STRLEN ulen;
3371                         if (l >= PL_regeol)
3372                              sayNO;
3373                         if (NATIVE_TO_UNI(*(U8*)s) !=
3374                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3375                                             uniflags))
3376                              sayNO;
3377                         l += ulen;
3378                         s ++;
3379                     }
3380                 }
3381                 else {
3382                     /* The target is not utf8, the pattern is utf8. */
3383                     while (s < e) {
3384                         STRLEN ulen;
3385                         if (l >= PL_regeol)
3386                             sayNO;
3387                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3388                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3389                                            uniflags))
3390                             sayNO;
3391                         s += ulen;
3392                         l ++;
3393                     }
3394                 }
3395                 locinput = l;
3396                 nextchr = UCHARAT(locinput);
3397                 break;
3398             }
3399             /* The target and the pattern have the same utf8ness. */
3400             /* Inline the first character, for speed. */
3401             if (UCHARAT(s) != nextchr)
3402                 sayNO;
3403             if (PL_regeol - locinput < ln)
3404                 sayNO;
3405             if (ln > 1 && memNE(s, locinput, ln))
3406                 sayNO;
3407             locinput += ln;
3408             nextchr = UCHARAT(locinput);
3409             break;
3410             }
3411         case EXACTFL:
3412             PL_reg_flags |= RF_tainted;
3413             /* FALL THROUGH */
3414         case EXACTF: {
3415             char * const s = STRING(scan);
3416             ln = STR_LEN(scan);
3417
3418             if (do_utf8 || UTF) {
3419               /* Either target or the pattern are utf8. */
3420                 const char * const l = locinput;
3421                 char *e = PL_regeol;
3422
3423                 if (ibcmp_utf8(s, 0,  ln, cBOOL(UTF),
3424                                l, &e, 0,  do_utf8)) {
3425                      /* One more case for the sharp s:
3426                       * pack("U0U*", 0xDF) =~ /ss/i,
3427                       * the 0xC3 0x9F are the UTF-8
3428                       * byte sequence for the U+00DF. */
3429
3430                      if (!(do_utf8 &&
3431                            toLOWER(s[0]) == 's' &&
3432                            ln >= 2 &&
3433                            toLOWER(s[1]) == 's' &&
3434                            (U8)l[0] == 0xC3 &&
3435                            e - l >= 2 &&
3436                            (U8)l[1] == 0x9F))
3437                           sayNO;
3438                 }
3439                 locinput = e;
3440                 nextchr = UCHARAT(locinput);
3441                 break;
3442             }
3443
3444             /* Neither the target and the pattern are utf8. */
3445
3446             /* Inline the first character, for speed. */
3447             if (UCHARAT(s) != nextchr &&
3448                 UCHARAT(s) != ((OP(scan) == EXACTF)
3449                                ? PL_fold : PL_fold_locale)[nextchr])
3450                 sayNO;
3451             if (PL_regeol - locinput < ln)
3452                 sayNO;
3453             if (ln > 1 && (OP(scan) == EXACTF
3454                            ? ibcmp(s, locinput, ln)
3455                            : ibcmp_locale(s, locinput, ln)))
3456                 sayNO;
3457             locinput += ln;
3458             nextchr = UCHARAT(locinput);
3459             break;
3460             }
3461         case BOUNDL:
3462         case NBOUNDL:
3463             PL_reg_flags |= RF_tainted;
3464             /* FALL THROUGH */
3465         case BOUND:
3466         case NBOUND:
3467             /* was last char in word? */
3468             if (do_utf8) {
3469                 if (locinput == PL_bostr)
3470                     ln = '\n';
3471                 else {
3472                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3473
3474                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3475                 }
3476                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3477                     ln = isALNUM_uni(ln);
3478                     LOAD_UTF8_CHARCLASS_ALNUM();
3479                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3480                 }
3481                 else {
3482                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3483                     n = isALNUM_LC_utf8((U8*)locinput);
3484                 }
3485             }
3486             else {
3487                 ln = (locinput != PL_bostr) ?
3488                     UCHARAT(locinput - 1) : '\n';
3489                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3490                     ln = isALNUM(ln);
3491                     n = isALNUM(nextchr);
3492                 }
3493                 else {
3494                     ln = isALNUM_LC(ln);
3495                     n = isALNUM_LC(nextchr);
3496                 }
3497             }
3498             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3499                                     OP(scan) == BOUNDL))
3500                     sayNO;
3501             break;
3502         case ANYOF:
3503             if (do_utf8) {
3504                 STRLEN inclasslen = PL_regeol - locinput;
3505
3506                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3507                     goto anyof_fail;
3508                 if (locinput >= PL_regeol)
3509                     sayNO;
3510                 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3511                 nextchr = UCHARAT(locinput);
3512                 break;
3513             }
3514             else {
3515                 if (nextchr < 0)
3516                     nextchr = UCHARAT(locinput);
3517                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3518                     goto anyof_fail;
3519                 if (!nextchr && locinput >= PL_regeol)
3520                     sayNO;
3521                 nextchr = UCHARAT(++locinput);
3522                 break;
3523             }
3524         anyof_fail:
3525             /* If we might have the case of the German sharp s
3526              * in a casefolding Unicode character class. */
3527
3528             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3529                  locinput += SHARP_S_SKIP;
3530                  nextchr = UCHARAT(locinput);
3531             }
3532             else
3533                  sayNO;
3534             break;
3535         /* Special char classes - The defines start on line 129 or so */
3536         CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3537         CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3538
3539         CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3540         CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3541
3542         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3543         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3544
3545         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3546                        a Unicode extended Grapheme Cluster */
3547             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3548               extended Grapheme Cluster is:
3549
3550                CR LF
3551                | Prepend* Begin Extend*
3552                | .
3553
3554                Begin is (Hangul-syllable | ! Control)
3555                Extend is (Grapheme_Extend | Spacing_Mark)
3556                Control is [ GCB_Control CR LF ]
3557
3558                The discussion below shows how the code for CLUMP is derived
3559                from this regex.  Note that most of these concepts are from
3560                property values of the Grapheme Cluster Boundary (GCB) property.
3561                No code point can have multiple property values for a given
3562                property.  Thus a code point in Prepend can't be in Control, but
3563                it must be in !Control.  This is why Control above includes
3564                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3565                property separately, and so can't be in GCB_Control, even though
3566                they logically are controls.  Control is not the same as gc=cc,
3567                but includes format and other characters as well.
3568
3569                The Unicode definition of Hangul-syllable is:
3570                    L+
3571                    | (L* ( ( V | LV ) V* | LVT ) T*)
3572                    | T+
3573                   )
3574                Each of these is a value for the GCB property, and hence must be
3575                disjoint, so the order they are tested is immaterial, so the
3576                above can safely be changed to
3577                    T+
3578                    | L+
3579                    | (L* ( LVT | ( V | LV ) V*) T*)
3580
3581                The last two terms can be combined like this:
3582                    L* ( L
3583                         | (( LVT | ( V | LV ) V*) T*))
3584
3585                And refactored into this:
3586                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3587
3588                That means that if we have seen any L's at all we can quit
3589                there, but if the next character is a LVT, a V or and LV we
3590                should keep going.
3591
3592                There is a subtlety with Prepend* which showed up in testing.
3593                Note that the Begin, and only the Begin is required in:
3594                 | Prepend* Begin Extend*
3595                Also, Begin contains '! Control'.  A Prepend must be a '!
3596                Control', which means it must be a Begin.  What it comes down to
3597                is that if we match Prepend* and then find no suitable Begin
3598                afterwards, that if we backtrack the last Prepend, that one will
3599                be a suitable Begin.
3600             */
3601
3602             if (locinput >= PL_regeol)
3603                 sayNO;
3604             if  (! do_utf8) {
3605
3606                 /* Match either CR LF  or '.', as all the other possibilities
3607                  * require utf8 */
3608                 locinput++;         /* Match the . or CR */
3609                 if (nextchr == '\r'
3610                     && locinput < PL_regeol
3611                     && UCHARAT(locinput) == '\n') locinput++;
3612             }
3613             else {
3614
3615                 /* Utf8: See if is ( CR LF ); already know that locinput <
3616                  * PL_regeol, so locinput+1 is in bounds */
3617                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3618                     locinput += 2;
3619                 }
3620                 else {
3621                     /* In case have to backtrack to beginning, then match '.' */
3622                     char *starting = locinput;
3623
3624                     /* In case have to backtrack the last prepend */
3625                     char *previous_prepend = 0;
3626
3627                     LOAD_UTF8_CHARCLASS_GCB();
3628
3629                     /* Match (prepend)* */
3630                     while (locinput < PL_regeol
3631                            && swash_fetch(PL_utf8_X_prepend,
3632                                           (U8*)locinput, do_utf8))
3633                     {
3634                         previous_prepend = locinput;
3635                         locinput += UTF8SKIP(locinput);
3636                     }
3637
3638                     /* As noted above, if we matched a prepend character, but
3639                      * the next thing won't match, back off the last prepend we
3640                      * matched, as it is guaranteed to match the begin */
3641                     if (previous_prepend
3642                         && (locinput >=  PL_regeol
3643                             || ! swash_fetch(PL_utf8_X_begin,
3644                                              (U8*)locinput, do_utf8)))
3645                     {
3646                         locinput = previous_prepend;
3647                     }
3648
3649                     /* Note that here we know PL_regeol > locinput, as we
3650                      * tested that upon input to this switch case, and if we
3651                      * moved locinput forward, we tested the result just above
3652                      * and it either passed, or we backed off so that it will
3653                      * now pass */
3654                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3655
3656                         /* Here did not match the required 'Begin' in the
3657                          * second term.  So just match the very first
3658                          * character, the '.' of the final term of the regex */
3659                         locinput = starting + UTF8SKIP(starting);
3660                     } else {
3661
3662                         /* Here is the beginning of a character that can have
3663                          * an extender.  It is either a hangul syllable, or a
3664                          * non-control */
3665                         if (swash_fetch(PL_utf8_X_non_hangul,
3666                                         (U8*)locinput, do_utf8))
3667                         {
3668
3669                             /* Here not a Hangul syllable, must be a
3670                              * ('!  * Control') */
3671                             locinput += UTF8SKIP(locinput);
3672                         } else {
3673
3674                             /* Here is a Hangul syllable.  It can be composed
3675                              * of several individual characters.  One
3676                              * possibility is T+ */
3677                             if (swash_fetch(PL_utf8_X_T,
3678                                             (U8*)locinput, do_utf8))
3679                             {
3680                                 while (locinput < PL_regeol
3681                                         && swash_fetch(PL_utf8_X_T,
3682                                                         (U8*)locinput, do_utf8))
3683                                 {
3684                                     locinput += UTF8SKIP(locinput);
3685                                 }
3686                             } else {
3687
3688                                 /* Here, not T+, but is a Hangul.  That means
3689                                  * it is one of the others: L, LV, LVT or V,
3690                                  * and matches:
3691                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3692
3693                                 /* Match L*           */
3694                                 while (locinput < PL_regeol
3695                                         && swash_fetch(PL_utf8_X_L,
3696                                                         (U8*)locinput, do_utf8))
3697                                 {
3698                                     locinput += UTF8SKIP(locinput);
3699                                 }
3700
3701                                 /* Here, have exhausted L*.  If the next
3702                                  * character is not an LV, LVT nor V, it means
3703                                  * we had to have at least one L, so matches L+
3704                                  * in the original equation, we have a complete
3705                                  * hangul syllable.  Are done. */
3706
3707                                 if (locinput < PL_regeol
3708                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3709                                                     (U8*)locinput, do_utf8))
3710                                 {
3711
3712                                     /* Otherwise keep going.  Must be LV, LVT
3713                                      * or V.  See if LVT */
3714                                     if (swash_fetch(PL_utf8_X_LVT,
3715                                                     (U8*)locinput, do_utf8))
3716                                     {
3717                                         locinput += UTF8SKIP(locinput);
3718                                     } else {
3719
3720                                         /* Must be  V or LV.  Take it, then
3721                                          * match V*     */
3722                                         locinput += UTF8SKIP(locinput);
3723                                         while (locinput < PL_regeol
3724                                                 && swash_fetch(PL_utf8_X_V,
3725                                                          (U8*)locinput, do_utf8))
3726                                         {
3727                                             locinput += UTF8SKIP(locinput);
3728                                         }
3729                                     }
3730
3731                                     /* And any of LV, LVT, or V can be followed
3732                                      * by T*            */
3733                                     while (locinput < PL_regeol
3734                                            && swash_fetch(PL_utf8_X_T,
3735                                                            (U8*)locinput,
3736                                                            do_utf8))
3737                                     {
3738                                         locinput += UTF8SKIP(locinput);
3739                                     }
3740                                 }
3741                             }
3742                         }
3743
3744                         /* Match any extender */
3745                         while (locinput < PL_regeol
3746                                 && swash_fetch(PL_utf8_X_extend,
3747                                                 (U8*)locinput, do_utf8))
3748                         {
3749                             locinput += UTF8SKIP(locinput);
3750                         }
3751                     }
3752                 }
3753                 if (locinput > PL_regeol) sayNO;
3754             }
3755             nextchr = UCHARAT(locinput);
3756             break;
3757
3758         case NREFFL:
3759         {
3760             char *s;
3761             char type;
3762             PL_reg_flags |= RF_tainted;
3763             /* FALL THROUGH */
3764         case NREF:
3765         case NREFF:
3766             type = OP(scan);
3767             n = reg_check_named_buff_matched(rex,scan);
3768
3769             if ( n ) {
3770                 type = REF + ( type - NREF );
3771                 goto do_ref;
3772             } else {
3773                 sayNO;
3774             }
3775             /* unreached */
3776         case REFFL:
3777             PL_reg_flags |= RF_tainted;
3778             /* FALL THROUGH */
3779         case REF:
3780         case REFF:
3781             n = ARG(scan);  /* which paren pair */
3782             type = OP(scan);
3783           do_ref:
3784             ln = PL_regoffs[n].start;
3785             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3786             if (*PL_reglastparen < n || ln == -1)
3787                 sayNO;                  /* Do not match unless seen CLOSEn. */
3788             if (ln == PL_regoffs[n].end)
3789                 break;
3790
3791             s = PL_bostr + ln;
3792             if (do_utf8 && type != REF) {       /* REF can do byte comparison */
3793                 char *l = locinput;
3794                 const char *e = PL_bostr + PL_regoffs[n].end;
3795                 /*
3796                  * Note that we can't do the "other character" lookup trick as
3797                  * in the 8-bit case (no pun intended) because in Unicode we
3798                  * have to map both upper and title case to lower case.
3799                  */
3800                 if (type == REFF) {
3801                     while (s < e) {
3802                         STRLEN ulen1, ulen2;
3803                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3804                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3805
3806                         if (l >= PL_regeol)
3807                             sayNO;
3808                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3809                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3810                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3811                             sayNO;
3812                         s += ulen1;
3813                         l += ulen2;
3814                     }
3815                 }
3816                 locinput = l;
3817                 nextchr = UCHARAT(locinput);
3818                 break;
3819             }
3820
3821             /* Inline the first character, for speed. */
3822             if (UCHARAT(s) != nextchr &&
3823                 (type == REF ||
3824                  (UCHARAT(s) != (type == REFF
3825                                   ? PL_fold : PL_fold_locale)[nextchr])))
3826                 sayNO;
3827             ln = PL_regoffs[n].end - ln;
3828             if (locinput + ln > PL_regeol)
3829                 sayNO;
3830             if (ln > 1 && (type == REF
3831                            ? memNE(s, locinput, ln)
3832                            : (type == REFF
3833                               ? ibcmp(s, locinput, ln)
3834                               : ibcmp_locale(s, locinput, ln))))
3835                 sayNO;
3836             locinput += ln;
3837             nextchr = UCHARAT(locinput);
3838             break;
3839         }
3840         case NOTHING:
3841         case TAIL:
3842             break;
3843         case BACK:
3844             break;
3845
3846 #undef  ST
3847 #define ST st->u.eval
3848         {
3849             SV *ret;
3850             REGEXP *re_sv;
3851             regexp *re;
3852             regexp_internal *rei;
3853             regnode *startpoint;
3854
3855         case GOSTART:
3856         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3857             if (cur_eval && cur_eval->locinput==locinput) {
3858                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3859                     Perl_croak(aTHX_ "Infinite recursion in regex");
3860                 if ( ++nochange_depth > max_nochange_depth )
3861                     Perl_croak(aTHX_
3862                         "Pattern subroutine nesting without pos change"
3863                         " exceeded limit in regex");
3864             } else {
3865                 nochange_depth = 0;
3866             }
3867             re_sv = rex_sv;
3868             re = rex;
3869             rei = rexi;
3870             (void)ReREFCNT_inc(rex_sv);
3871             if (OP(scan)==GOSUB) {
3872                 startpoint = scan + ARG2L(scan);
3873                 ST.close_paren = ARG(scan);
3874             } else {
3875                 startpoint = rei->program+1;
3876                 ST.close_paren = 0;
3877             }
3878             goto eval_recurse_doit;
3879             /* NOTREACHED */
3880         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
3881             if (cur_eval && cur_eval->locinput==locinput) {
3882                 if ( ++nochange_depth > max_nochange_depth )
3883                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3884             } else {
3885                 nochange_depth = 0;
3886             }
3887             {
3888                 /* execute the code in the {...} */
3889                 dSP;
3890                 SV ** const before = SP;
3891                 OP_4tree * const oop = PL_op;
3892                 COP * const ocurcop = PL_curcop;
3893                 PAD *old_comppad;
3894                 char *saved_regeol = PL_regeol;
3895
3896                 n = ARG(scan);
3897                 PL_op = (OP_4tree*)rexi->data->data[n];
3898                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3899                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3900                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3901                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3902
3903                 if (sv_yes_mark) {
3904                     SV *sv_mrk = get_sv("REGMARK", 1);
3905                     sv_setsv(sv_mrk, sv_yes_mark);
3906                 }
3907
3908                 CALLRUNOPS(aTHX);                       /* Scalar context. */
3909                 SPAGAIN;
3910                 if (SP == before)
3911                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
3912                 else {
3913                     ret = POPs;
3914                     PUTBACK;
3915                 }
3916
3917                 PL_op = oop;
3918                 PAD_RESTORE_LOCAL(old_comppad);
3919                 PL_curcop = ocurcop;
3920                 PL_regeol = saved_regeol;
3921                 if (!logical) {
3922                     /* /(?{...})/ */
3923                     sv_setsv(save_scalar(PL_replgv), ret);
3924                     break;
3925                 }
3926             }
3927             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3928                 logical = 0;
3929                 {
3930                     /* extract RE object from returned value; compiling if
3931                      * necessary */
3932                     MAGIC *mg = NULL;
3933                     REGEXP *rx = NULL;
3934
3935                     if (SvROK(ret)) {
3936                         SV *const sv = SvRV(ret);
3937
3938                         if (SvTYPE(sv) == SVt_REGEXP) {
3939                             rx = (REGEXP*) sv;
3940                         } else if (SvSMAGICAL(sv)) {
3941                             mg = mg_find(sv, PERL_MAGIC_qr);
3942                             assert(mg);
3943                         }
3944                     } else if (SvTYPE(ret) == SVt_REGEXP) {
3945                         rx = (REGEXP*) ret;
3946                     } else if (SvSMAGICAL(ret)) {
3947                         if (SvGMAGICAL(ret)) {
3948                             /* I don't believe that there is ever qr magic
3949                                here.  */
3950                             assert(!mg_find(ret, PERL_MAGIC_qr));
3951                             sv_unmagic(ret, PERL_MAGIC_qr);
3952                         }
3953                         else {
3954                             mg = mg_find(ret, PERL_MAGIC_qr);
3955                             /* testing suggests mg only ends up non-NULL for
3956                                scalars who were upgraded and compiled in the
3957                                else block below. In turn, this is only
3958                                triggered in the "postponed utf8 string" tests
3959                                in t/op/pat.t  */
3960                         }
3961                     }
3962
3963                     if (mg) {
3964                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3965                         assert(rx);
3966                     }
3967                     if (rx) {
3968                         rx = reg_temp_copy(NULL, rx);
3969                     }
3970                     else {
3971                         U32 pm_flags = 0;
3972                         const I32 osize = PL_regsize;
3973
3974                         if (DO_UTF8(ret)) {
3975                             assert (SvUTF8(ret));
3976                         } else if (SvUTF8(ret)) {
3977                             /* Not doing UTF-8, despite what the SV says. Is
3978                                this only if we're trapped in use 'bytes'?  */
3979                             /* Make a copy of the octet sequence, but without
3980                                the flag on, as the compiler now honours the
3981                                SvUTF8 flag on ret.  */
3982                             STRLEN len;
3983                             const char *const p = SvPV(ret, len);
3984                             ret = newSVpvn_flags(p, len, SVs_TEMP);
3985                         }
3986                         rx = CALLREGCOMP(ret, pm_flags);
3987                         if (!(SvFLAGS(ret)
3988                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3989                                  | SVs_GMG))) {
3990                             /* This isn't a first class regexp. Instead, it's
3991                                caching a regexp onto an existing, Perl visible
3992                                scalar.  */
3993                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3994                         }
3995                         PL_regsize = osize;
3996                     }
3997                     re_sv = rx;
3998                     re = (struct regexp *)SvANY(rx);
3999                 }
4000                 RXp_MATCH_COPIED_off(re);
4001                 re->subbeg = rex->subbeg;
4002                 re->sublen = rex->sublen;
4003                 rei = RXi_GET(re);
4004                 DEBUG_EXECUTE_r(
4005                     debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
4006                         "Matching embedded");
4007                 );
4008                 startpoint = rei->program + 1;
4009                 ST.close_paren = 0; /* only used for GOSUB */
4010                 /* borrowed from regtry */
4011                 if (PL_reg_start_tmpl <= re->nparens) {
4012                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4013                     if(PL_reg_start_tmp)
4014                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4015                     else
4016                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4017                 }
4018
4019         eval_recurse_doit: /* Share code with GOSUB below this line */
4020                 /* run the pattern returned from (??{...}) */
4021                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4022                 REGCP_SET(ST.lastcp);
4023
4024                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4025
4026                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4027                 PL_reglastparen = &re->lastparen;
4028                 PL_reglastcloseparen = &re->lastcloseparen;
4029                 re->lastparen = 0;
4030                 re->lastcloseparen = 0;
4031
4032                 PL_reginput = locinput;
4033                 PL_regsize = 0;
4034
4035                 /* XXXX This is too dramatic a measure... */
4036                 PL_reg_maxiter = 0;
4037
4038                 ST.toggle_reg_flags = PL_reg_flags;
4039                 if (RX_UTF8(re_sv))
4040                     PL_reg_flags |= RF_utf8;
4041                 else
4042                     PL_reg_flags &= ~RF_utf8;
4043                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4044
4045                 ST.prev_rex = rex_sv;
4046                 ST.prev_curlyx = cur_curlyx;
4047                 SETREX(rex_sv,re_sv);
4048                 rex = re;
4049                 rexi = rei;
4050                 cur_curlyx = NULL;
4051                 ST.B = next;
4052                 ST.prev_eval = cur_eval;
4053                 cur_eval = st;
4054                 /* now continue from first node in postoned RE */
4055                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4056                 /* NOTREACHED */
4057             }
4058             /* logical is 1,   /(?(?{...})X|Y)/ */
4059             sw = cBOOL(SvTRUE(ret));
4060             logical = 0;
4061             break;
4062         }
4063
4064         case EVAL_AB: /* cleanup after a successful (??{A})B */
4065             /* note: this is called twice; first after popping B, then A */
4066             PL_reg_flags ^= ST.toggle_reg_flags;
4067             ReREFCNT_dec(rex_sv);
4068             SETREX(rex_sv,ST.prev_rex);
4069             rex = (struct regexp *)SvANY(rex_sv);
4070             rexi = RXi_GET(rex);
4071             regcpblow(ST.cp);
4072             cur_eval = ST.prev_eval;
4073             cur_curlyx = ST.prev_curlyx;
4074
4075             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4076             PL_reglastparen = &rex->lastparen;
4077             PL_reglastcloseparen = &rex->lastcloseparen;
4078             /* also update PL_regoffs */
4079             PL_regoffs = rex->offs;
4080
4081             /* XXXX This is too dramatic a measure... */
4082             PL_reg_maxiter = 0;
4083             if ( nochange_depth )
4084                 nochange_depth--;
4085             sayYES;
4086
4087
4088         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4089             /* note: this is called twice; first after popping B, then A */
4090             PL_reg_flags ^= ST.toggle_reg_flags;
4091             ReREFCNT_dec(rex_sv);
4092             SETREX(rex_sv,ST.prev_rex);
4093             rex = (struct regexp *)SvANY(rex_sv);
4094             rexi = RXi_GET(rex);
4095             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4096             PL_reglastparen = &rex->lastparen;
4097             PL_reglastcloseparen = &rex->lastcloseparen;
4098
4099             PL_reginput = locinput;
4100             REGCP_UNWIND(ST.lastcp);
4101             regcppop(rex);
4102             cur_eval = ST.prev_eval;
4103             cur_curlyx = ST.prev_curlyx;
4104             /* XXXX This is too dramatic a measure... */
4105             PL_reg_maxiter = 0;
4106             if ( nochange_depth )
4107                 nochange_depth--;
4108             sayNO_SILENT;
4109 #undef ST
4110
4111         case OPEN:
4112             n = ARG(scan);  /* which paren pair */
4113             PL_reg_start_tmp[n] = locinput;
4114             if (n > PL_regsize)
4115                 PL_regsize = n;
4116             lastopen = n;
4117             break;
4118         case CLOSE:
4119             n = ARG(scan);  /* which paren pair */
4120             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4121             PL_regoffs[n].end = locinput - PL_bostr;
4122             /*if (n > PL_regsize)
4123                 PL_regsize = n;*/
4124             if (n > *PL_reglastparen)
4125                 *PL_reglastparen = n;
4126             *PL_reglastcloseparen = n;
4127             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4128                 goto fake_end;
4129             }
4130             break;
4131         case ACCEPT:
4132             if (ARG(scan)){
4133                 regnode *cursor;
4134                 for (cursor=scan;
4135                      cursor && OP(cursor)!=END;
4136                      cursor=regnext(cursor))
4137                 {
4138                     if ( OP(cursor)==CLOSE ){
4139                         n = ARG(cursor);
4140                         if ( n <= lastopen ) {
4141                             PL_regoffs[n].start
4142                                 = PL_reg_start_tmp[n] - PL_bostr;
4143                             PL_regoffs[n].end = locinput - PL_bostr;
4144                             /*if (n > PL_regsize)
4145                             PL_regsize = n;*/
4146                             if (n > *PL_reglastparen)
4147                                 *PL_reglastparen = n;
4148                             *PL_reglastcloseparen = n;
4149                             if ( n == ARG(scan) || (cur_eval &&
4150                                 cur_eval->u.eval.close_paren == n))
4151                                 break;
4152                         }
4153                     }
4154                 }
4155             }
4156             goto fake_end;
4157             /*NOTREACHED*/
4158         case GROUPP:
4159             n = ARG(scan);  /* which paren pair */
4160             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4161             break;
4162         case NGROUPP:
4163             /* reg_check_named_buff_matched returns 0 for no match */
4164             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4165             break;
4166         case INSUBP:
4167             n = ARG(scan);
4168             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4169             break;
4170         case DEFINEP:
4171             sw = 0;
4172             break;
4173         case IFTHEN:
4174             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4175             if (sw)
4176                 next = NEXTOPER(NEXTOPER(scan));
4177             else {
4178                 next = scan + ARG(scan);
4179                 if (OP(next) == IFTHEN) /* Fake one. */
4180                     next = NEXTOPER(NEXTOPER(next));
4181             }
4182             break;
4183         case LOGICAL:
4184             logical = scan->flags;
4185             break;
4186
4187 /*******************************************************************
4188
4189 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4190 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4191 STAR/PLUS/CURLY/CURLYN are used instead.)
4192
4193 A*B is compiled as <CURLYX><A><WHILEM><B>
4194
4195 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4196 state, which contains the current count, initialised to -1. It also sets
4197 cur_curlyx to point to this state, with any previous value saved in the
4198 state block.
4199
4200 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4201 since the pattern may possibly match zero times (i.e. it's a while {} loop
4202 rather than a do {} while loop).
4203
4204 Each entry to WHILEM represents a successful match of A. The count in the
4205 CURLYX block is incremented, another WHILEM state is pushed, and execution
4206 passes to A or B depending on greediness and the current count.
4207
4208 For example, if matching against the string a1a2a3b (where the aN are
4209 substrings that match /A/), then the match progresses as follows: (the
4210 pushed states are interspersed with the bits of strings matched so far):
4211
4212     <CURLYX cnt=-1>
4213     <CURLYX cnt=0><WHILEM>
4214     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4215     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4216     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4217     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4218
4219 (Contrast this with something like CURLYM, which maintains only a single
4220 backtrack state:
4221
4222     <CURLYM cnt=0> a1
4223     a1 <CURLYM cnt=1> a2
4224     a1 a2 <CURLYM cnt=2> a3
4225     a1 a2 a3 <CURLYM cnt=3> b
4226 )
4227
4228 Each WHILEM state block marks a point to backtrack to upon partial failure
4229 of A or B, and also contains some minor state data related to that
4230 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4231 overall state, such as the count, and pointers to the A and B ops.
4232
4233 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4234 must always point to the *current* CURLYX block, the rules are:
4235
4236 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4237 and set cur_curlyx to point the new block.
4238
4239 When popping the CURLYX block after a successful or unsuccessful match,
4240 restore the previous cur_curlyx.
4241
4242 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4243 to the outer one saved in the CURLYX block.
4244
4245 When popping the WHILEM block after a successful or unsuccessful B match,
4246 restore the previous cur_curlyx.
4247
4248 Here's an example for the pattern (AI* BI)*BO
4249 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4250
4251 cur_
4252 curlyx backtrack stack
4253 ------ ---------------
4254 NULL
4255 CO     <CO prev=NULL> <WO>
4256 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4257 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4258 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4259
4260 At this point the pattern succeeds, and we work back down the stack to
4261 clean up, restoring as we go:
4262
4263 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4264 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4265 CO     <CO prev=NULL> <WO>
4266 NULL
4267
4268 *******************************************************************/
4269
4270 #define ST st->u.curlyx
4271
4272         case CURLYX:    /* start of /A*B/  (for complex A) */
4273         {
4274             /* No need to save/restore up to this paren */
4275             I32 parenfloor = scan->flags;
4276
4277             assert(next); /* keep Coverity happy */
4278             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4279                 next += ARG(next);
4280
4281             /* XXXX Probably it is better to teach regpush to support
4282                parenfloor > PL_regsize... */
4283             if (parenfloor > (I32)*PL_reglastparen)
4284                 parenfloor = *PL_reglastparen; /* Pessimization... */
4285
4286             ST.prev_curlyx= cur_curlyx;
4287             cur_curlyx = st;
4288             ST.cp = PL_savestack_ix;
4289
4290             /* these fields contain the state of the current curly.
4291              * they are accessed by subsequent WHILEMs */
4292             ST.parenfloor = parenfloor;
4293             ST.min = ARG1(scan);
4294             ST.max = ARG2(scan);
4295             ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4296             ST.B = next;
4297             ST.minmod = minmod;
4298             minmod = 0;
4299             ST.count = -1;      /* this will be updated by WHILEM */
4300             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4301
4302             PL_reginput = locinput;
4303             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4304             /* NOTREACHED */
4305         }
4306
4307         case CURLYX_end: /* just finished matching all of A*B */
4308             cur_curlyx = ST.prev_curlyx;
4309             sayYES;
4310             /* NOTREACHED */
4311
4312         case CURLYX_end_fail: /* just failed to match all of A*B */
4313             regcpblow(ST.cp);
4314             cur_curlyx = ST.prev_curlyx;
4315             sayNO;
4316             /* NOTREACHED */
4317
4318
4319 #undef ST
4320 #define ST st->u.whilem
4321
4322         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4323         {
4324             /* see the discussion above about CURLYX/WHILEM */
4325             I32 n;
4326             assert(cur_curlyx); /* keep Coverity happy */
4327             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4328             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4329             ST.cache_offset = 0;
4330             ST.cache_mask = 0;
4331
4332             PL_reginput = locinput;
4333
4334             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4335                   "%*s  whilem: matched %ld out of %ld..%ld\n",
4336                   REPORT_CODE_OFF+depth*2, "", (long)n,
4337                   (long)cur_curlyx->u.curlyx.min,
4338                   (long)cur_curlyx->u.curlyx.max)
4339             );
4340
4341             /* First just match a string of min A's. */
4342
4343             if (n < cur_curlyx->u.curlyx.min) {
4344                 cur_curlyx->u.curlyx.lastloc = locinput;
4345                 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4346                 /* NOTREACHED */
4347             }
4348
4349             /* If degenerate A matches "", assume A done. */
4350
4351             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4352                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4353                    "%*s  whilem: empty match detected, trying continuation...\n",
4354                    REPORT_CODE_OFF+depth*2, "")
4355                 );
4356                 goto do_whilem_B_max;
4357             }
4358
4359             /* super-linear cache processing */
4360
4361             if (scan->flags) {
4362
4363                 if (!PL_reg_maxiter) {
4364                     /* start the countdown: Postpone detection until we
4365                      * know the match is not *that* much linear. */
4366                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4367                     /* possible overflow for long strings and many CURLYX's */
4368                     if (PL_reg_maxiter < 0)
4369                         PL_reg_maxiter = I32_MAX;
4370                     PL_reg_leftiter = PL_reg_maxiter;
4371                 }
4372
4373                 if (PL_reg_leftiter-- == 0) {
4374                     /* initialise cache */
4375                     const I32 size = (PL_reg_maxiter + 7)/8;
4376                     if (PL_reg_poscache) {
4377                         if ((I32)PL_reg_poscache_size < size) {
4378                             Renew(PL_reg_poscache, size, char);
4379                             PL_reg_poscache_size = size;
4380                         }
4381                         Zero(PL_reg_poscache, size, char);
4382                     }
4383                     else {
4384                         PL_reg_poscache_size = size;
4385                         Newxz(PL_reg_poscache, size, char);
4386                     }
4387                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4388       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4389                               PL_colors[4], PL_colors[5])
4390                     );
4391                 }
4392
4393                 if (PL_reg_leftiter < 0) {
4394                     /* have we already failed at this position? */
4395                     I32 offset, mask;
4396                     offset  = (scan->flags & 0xf) - 1
4397                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4398                     mask    = 1 << (offset % 8);
4399                     offset /= 8;
4400                     if (PL_reg_poscache[offset] & mask) {
4401                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4402                             "%*s  whilem: (cache) already tried at this position...\n",
4403                             REPORT_CODE_OFF+depth*2, "")
4404                         );
4405                         sayNO; /* cache records failure */
4406                     }
4407                     ST.cache_offset = offset;
4408                     ST.cache_mask   = mask;
4409                 }
4410             }
4411
4412             /* Prefer B over A for minimal matching. */
4413
4414             if (cur_curlyx->u.curlyx.minmod) {
4415                 ST.save_curlyx = cur_curlyx;
4416                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4417                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4418                 REGCP_SET(ST.lastcp);
4419                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4420                 /* NOTREACHED */
4421             }
4422
4423             /* Prefer A over B for maximal matching. */
4424
4425             if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4426                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4427                 cur_curlyx->u.curlyx.lastloc = locinput;
4428                 REGCP_SET(ST.lastcp);
4429                 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4430                 /* NOTREACHED */
4431             }
4432             goto do_whilem_B_max;
4433         }
4434         /* NOTREACHED */
4435
4436         case WHILEM_B_min: /* just matched B in a minimal match */
4437         case WHILEM_B_max: /* just matched B in a maximal match */
4438             cur_curlyx = ST.save_curlyx;
4439             sayYES;
4440             /* NOTREACHED */
4441
4442         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4443             cur_curlyx = ST.save_curlyx;
4444             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4445             cur_curlyx->u.curlyx.count--;
4446             CACHEsayNO;
4447             /* NOTREACHED */
4448
4449         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4450             REGCP_UNWIND(ST.lastcp);
4451             regcppop(rex);
4452             /* FALL THROUGH */
4453         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4454             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4455             cur_curlyx->u.curlyx.count--;
4456             CACHEsayNO;
4457             /* NOTREACHED */
4458
4459         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4460             REGCP_UNWIND(ST.lastcp);
4461             regcppop(rex);      /* Restore some previous $<digit>s? */
4462             PL_reginput = locinput;
4463             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4464                 "%*s  whilem: failed, trying continuation...\n",
4465                 REPORT_CODE_OFF+depth*2, "")
4466             );
4467           do_whilem_B_max:
4468             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4469                 && ckWARN(WARN_REGEXP)
4470                 && !(PL_reg_flags & RF_warned))
4471             {
4472                 PL_reg_flags |= RF_warned;
4473                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4474                      "Complex regular subexpression recursion",
4475                      REG_INFTY - 1);
4476             }
4477
4478             /* now try B */
4479             ST.save_curlyx = cur_curlyx;
4480             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4481             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4482             /* NOTREACHED */
4483
4484         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4485             cur_curlyx = ST.save_curlyx;
4486             REGCP_UNWIND(ST.lastcp);
4487             regcppop(rex);
4488
4489             if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4490                 /* Maximum greed exceeded */
4491                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4492                     && ckWARN(WARN_REGEXP)
4493                     && !(PL_reg_flags & RF_warned))
4494                 {
4495                     PL_reg_flags |= RF_warned;
4496                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4497                         "%s limit (%d) exceeded",
4498                         "Complex regular subexpression recursion",
4499                         REG_INFTY - 1);
4500                 }
4501                 cur_curlyx->u.curlyx.count--;
4502                 CACHEsayNO;
4503             }
4504
4505             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4506                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4507             );
4508             /* Try grabbing another A and see if it helps. */
4509             PL_reginput = locinput;
4510             cur_curlyx->u.curlyx.lastloc = locinput;
4511             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4512             REGCP_SET(ST.lastcp);
4513             PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4514             /* NOTREACHED */
4515
4516 #undef  ST
4517 #define ST st->u.branch
4518
4519         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4520             next = scan + ARG(scan);
4521             if (next == scan)
4522                 next = NULL;
4523             scan = NEXTOPER(scan);
4524             /* FALL THROUGH */
4525
4526         case BRANCH:        /*  /(...|A|...)/ */
4527             scan = NEXTOPER(scan); /* scan now points to inner node */
4528             ST.lastparen = *PL_reglastparen;
4529             ST.next_branch = next;
4530             REGCP_SET(ST.cp);
4531             PL_reginput = locinput;
4532
4533             /* Now go into the branch */
4534             if (has_cutgroup) {
4535                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4536             } else {
4537                 PUSH_STATE_GOTO(BRANCH_next, scan);
4538             }
4539             /* NOTREACHED */
4540         case CUTGROUP:
4541             PL_reginput = locinput;
4542             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4543                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4544             PUSH_STATE_GOTO(CUTGROUP_next,next);
4545             /* NOTREACHED */
4546         case CUTGROUP_next_fail:
4547             do_cutgroup = 1;
4548             no_final = 1;
4549             if (st->u.mark.mark_name)
4550                 sv_commit = st->u.mark.mark_name;
4551             sayNO;
4552             /* NOTREACHED */
4553         case BRANCH_next:
4554             sayYES;
4555             /* NOTREACHED */
4556         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4557             if (do_cutgroup) {
4558                 do_cutgroup = 0;
4559                 no_final = 0;
4560             }
4561             REGCP_UNWIND(ST.cp);
4562             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4563                 PL_regoffs[n].end = -1;
4564             *PL_reglastparen = n;
4565             /*dmq: *PL_reglastcloseparen = n; */
4566             scan = ST.next_branch;
4567             /* no more branches? */
4568             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4569                 DEBUG_EXECUTE_r({
4570                     PerlIO_printf( Perl_debug_log,
4571                         "%*s  %sBRANCH failed...%s\n",
4572                         REPORT_CODE_OFF+depth*2, "",
4573                         PL_colors[4],
4574                         PL_colors[5] );
4575                 });
4576                 sayNO_SILENT;
4577             }
4578             continue; /* execute next BRANCH[J] op */
4579             /* NOTREACHED */
4580
4581         case MINMOD:
4582             minmod = 1;
4583             break;
4584
4585 #undef  ST
4586 #define ST st->u.curlym
4587
4588         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4589
4590             /* This is an optimisation of CURLYX that enables us to push
4591              * only a single backtracking state, no matter how many matches
4592              * there are in {m,n}. It relies on the pattern being constant
4593              * length, with no parens to influence future backrefs
4594              */
4595
4596             ST.me = scan;
4597             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4598
4599             /* if paren positive, emulate an OPEN/CLOSE around A */
4600             if (ST.me->flags) {
4601                 U32 paren = ST.me->flags;
4602                 if (paren > PL_regsize)
4603                     PL_regsize = paren;
4604                 if (paren > *PL_reglastparen)
4605                     *PL_reglastparen = paren;
4606                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4607             }
4608             ST.A = scan;
4609             ST.B = next;
4610             ST.alen = 0;
4611             ST.count = 0;
4612             ST.minmod = minmod;
4613             minmod = 0;
4614             ST.c1 = CHRTEST_UNINIT;
4615             REGCP_SET(ST.cp);
4616
4617             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4618                 goto curlym_do_B;
4619
4620           curlym_do_A: /* execute the A in /A{m,n}B/  */
4621             PL_reginput = locinput;
4622             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4623             /* NOTREACHED */
4624
4625         case CURLYM_A: /* we've just matched an A */
4626             locinput = st->locinput;
4627             nextchr = UCHARAT(locinput);
4628
4629             ST.count++;
4630             /* after first match, determine A's length: u.curlym.alen */
4631             if (ST.count == 1) {
4632                 if (PL_reg_match_utf8) {
4633                     char *s = locinput;
4634                     while (s < PL_reginput) {
4635                         ST.alen++;
4636                         s += UTF8SKIP(s);
4637                     }
4638                 }
4639                 else {
4640                     ST.alen = PL_reginput - locinput;
4641                 }
4642                 if (ST.alen == 0)
4643                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4644             }
4645             DEBUG_EXECUTE_r(
4646                 PerlIO_printf(Perl_debug_log,
4647                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4648                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4649                           (IV) ST.count, (IV)ST.alen)
4650             );
4651
4652             locinput = PL_reginput;
4653
4654             if (cur_eval && cur_eval->u.eval.close_paren &&
4655                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4656                 goto fake_end;
4657
4658             {
4659                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4660                 if ( max == REG_INFTY || ST.count < max )
4661                     goto curlym_do_A; /* try to match another A */
4662             }
4663             goto curlym_do_B; /* try to match B */
4664
4665         case CURLYM_A_fail: /* just failed to match an A */
4666             REGCP_UNWIND(ST.cp);
4667
4668             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4669                 || (cur_eval && cur_eval->u.eval.close_paren &&
4670                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4671                 sayNO;
4672
4673           curlym_do_B: /* execute the B in /A{m,n}B/  */
4674             PL_reginput = locinput;
4675             if (ST.c1 == CHRTEST_UNINIT) {
4676                 /* calculate c1 and c2 for possible match of 1st char
4677                  * following curly */
4678                 ST.c1 = ST.c2 = CHRTEST_VOID;
4679                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4680                     regnode *text_node = ST.B;
4681                     if (! HAS_TEXT(text_node))
4682                         FIND_NEXT_IMPT(text_node);
4683                     /* this used to be
4684
4685                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4686
4687                         But the former is redundant in light of the latter.
4688
4689                         if this changes back then the macro for
4690                         IS_TEXT and friends need to change.
4691                      */
4692                     if (PL_regkind[OP(text_node)] == EXACT)
4693                     {
4694
4695                         ST.c1 = (U8)*STRING(text_node);
4696                         ST.c2 =
4697                             (IS_TEXTF(text_node))
4698                             ? PL_fold[ST.c1]
4699                             : (IS_TEXTFL(text_node))
4700                                 ? PL_fold_locale[ST.c1]
4701                                 : ST.c1;
4702                     }
4703                 }
4704             }
4705
4706             DEBUG_EXECUTE_r(
4707                 PerlIO_printf(Perl_debug_log,
4708                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4709                     (int)(REPORT_CODE_OFF+(depth*2)),
4710                     "", (IV)ST.count)
4711                 );
4712             if (ST.c1 != CHRTEST_VOID
4713                     && UCHARAT(PL_reginput) != ST.c1
4714                     && UCHARAT(PL_reginput) != ST.c2)
4715             {
4716                 /* simulate B failing */
4717                 DEBUG_OPTIMISE_r(
4718                     PerlIO_printf(Perl_debug_log,
4719                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4720                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4721                         (IV)ST.c1,(IV)ST.c2
4722                 ));
4723                 state_num = CURLYM_B_fail;
4724                 goto reenter_switch;
4725             }
4726
4727             if (ST.me->flags) {
4728                 /* mark current A as captured */
4729                 I32 paren = ST.me->flags;
4730                 if (ST.count) {
4731                     PL_regoffs[paren].start
4732                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4733                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4734                     /*dmq: *PL_reglastcloseparen = paren; */
4735                 }
4736                 else
4737                     PL_regoffs[paren].end = -1;
4738                 if (cur_eval && cur_eval->u.eval.close_paren &&
4739                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4740                 {
4741                     if (ST.count)
4742                         goto fake_end;
4743                     else
4744                         sayNO;
4745                 }
4746             }
4747
4748             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4749             /* NOTREACHED */
4750
4751         case CURLYM_B_fail: /* just failed to match a B */
4752             REGCP_UNWIND(ST.cp);
4753             if (ST.minmod) {
4754                 I32 max = ARG2(ST.me);
4755                 if (max != REG_INFTY && ST.count == max)
4756                     sayNO;
4757                 goto curlym_do_A; /* try to match a further A */
4758             }
4759             /* backtrack one A */
4760             if (ST.count == ARG1(ST.me) /* min */)
4761                 sayNO;
4762             ST.count--;
4763             locinput = HOPc(locinput, -ST.alen);
4764             goto curlym_do_B; /* try to match B */
4765
4766 #undef ST
4767 #define ST st->u.curly
4768
4769 #define CURLY_SETPAREN(paren, success) \
4770     if (paren) { \
4771         if (success) { \
4772             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4773             PL_regoffs[paren].end = locinput - PL_bostr; \
4774             *PL_reglastcloseparen = paren; \
4775         } \
4776         else \
4777             PL_regoffs[paren].end = -1; \
4778     }
4779
4780         case STAR:              /*  /A*B/ where A is width 1 */
4781             ST.paren = 0;
4782             ST.min = 0;
4783             ST.max = REG_INFTY;
4784             scan = NEXTOPER(scan);
4785             goto repeat;
4786         case PLUS:              /*  /A+B/ where A is width 1 */
4787             ST.paren = 0;
4788             ST.min = 1;
4789             ST.max = REG_INFTY;
4790             scan = NEXTOPER(scan);
4791             goto repeat;
4792         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4793             ST.paren = scan->flags;     /* Which paren to set */
4794             if (ST.paren > PL_regsize)
4795                 PL_regsize = ST.paren;
4796             if (ST.paren > *PL_reglastparen)
4797                 *PL_reglastparen = ST.paren;
4798             ST.min = ARG1(scan);  /* min to match */
4799             ST.max = ARG2(scan);  /* max to match */
4800             if (cur_eval && cur_eval->u.eval.close_paren &&
4801                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4802                 ST.min=1;
4803                 ST.max=1;
4804             }
4805             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4806             goto repeat;
4807         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4808             ST.paren = 0;
4809             ST.min = ARG1(scan);  /* min to match */
4810             ST.max = ARG2(scan);  /* max to match */
4811             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4812           repeat:
4813             /*
4814             * Lookahead to avoid useless match attempts
4815             * when we know what character comes next.
4816             *
4817             * Used to only do .*x and .*?x, but now it allows
4818             * for )'s, ('s and (?{ ... })'s to be in the way
4819             * of the quantifier and the EXACT-like node.  -- japhy
4820             */
4821
4822             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4823                 sayNO;
4824             if (HAS_TEXT(next) || JUMPABLE(next)) {
4825                 U8 *s;
4826                 regnode *text_node = next;
4827
4828                 if (! HAS_TEXT(text_node))
4829                     FIND_NEXT_IMPT(text_node);
4830
4831                 if (! HAS_TEXT(text_node))
4832                     ST.c1 = ST.c2 = CHRTEST_VOID;
4833                 else {
4834                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4835                         ST.c1 = ST.c2 = CHRTEST_VOID;
4836                         goto assume_ok_easy;
4837                     }
4838                     else
4839                         s = (U8*)STRING(text_node);
4840
4841                     /*  Currently we only get here when
4842
4843                         PL_rekind[OP(text_node)] == EXACT
4844
4845                         if this changes back then the macro for IS_TEXT and
4846                         friends need to change. */
4847                     if (!UTF) {
4848                         ST.c2 = ST.c1 = *s;
4849                         if (IS_TEXTF(text_node))
4850                             ST.c2 = PL_fold[ST.c1];
4851                         else if (IS_TEXTFL(text_node))
4852                             ST.c2 = PL_fold_locale[ST.c1];
4853                     }
4854                     else { /* UTF */
4855                         if (IS_TEXTF(text_node)) {
4856                              STRLEN ulen1, ulen2;
4857                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4858                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4859
4860                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4861                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4862 #ifdef EBCDIC
4863                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4864                                                     ckWARN(WARN_UTF8) ?
4865                                                     0 : UTF8_ALLOW_ANY);
4866                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4867                                                     ckWARN(WARN_UTF8) ?
4868                                                     0 : UTF8_ALLOW_ANY);
4869 #else
4870                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4871                                                     uniflags);
4872                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4873                                                     uniflags);
4874 #endif
4875                         }
4876                         else {
4877                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4878                                                      uniflags);
4879                         }
4880                     }
4881                 }
4882             }
4883             else
4884                 ST.c1 = ST.c2 = CHRTEST_VOID;
4885         assume_ok_easy:
4886
4887             ST.A = scan;
4888             ST.B = next;
4889             PL_reginput = locinput;
4890             if (minmod) {
4891                 minmod = 0;
4892                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4893                     sayNO;
4894                 ST.count = ST.min;
4895                 locinput = PL_reginput;
4896                 REGCP_SET(ST.cp);
4897                 if (ST.c1 == CHRTEST_VOID)
4898                     goto curly_try_B_min;
4899
4900                 ST.oldloc = locinput;
4901
4902                 /* set ST.maxpos to the furthest point along the
4903                  * string that could possibly match */
4904                 if  (ST.max == REG_INFTY) {
4905                     ST.maxpos = PL_regeol - 1;
4906                     if (do_utf8)
4907                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4908                             ST.maxpos--;
4909                 }
4910                 else if (do_utf8) {
4911                     int m = ST.max - ST.min;
4912                     for (ST.maxpos = locinput;
4913                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4914                         ST.maxpos += UTF8SKIP(ST.maxpos);
4915                 }
4916                 else {
4917                     ST.maxpos = locinput + ST.max - ST.min;
4918                     if (ST.maxpos >= PL_regeol)
4919                         ST.maxpos = PL_regeol - 1;
4920                 }
4921                 goto curly_try_B_min_known;
4922
4923             }
4924             else {
4925                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4926                 locinput = PL_reginput;
4927                 if (ST.count < ST.min)
4928                     sayNO;
4929                 if ((ST.count > ST.min)
4930                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4931                 {
4932                     /* A{m,n} must come at the end of the string, there's
4933                      * no point in backing off ... */
4934                     ST.min = ST.count;
4935                     /* ...except that $ and \Z can match before *and* after
4936                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
4937                        We may back off by one in this case. */
4938                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4939                         ST.min--;
4940                 }
4941                 REGCP_SET(ST.cp);
4942                 goto curly_try_B_max;
4943             }
4944             /* NOTREACHED */
4945
4946
4947         case CURLY_B_min_known_fail:
4948             /* failed to find B in a non-greedy match where c1,c2 valid */
4949             if (ST.paren && ST.count)
4950                 PL_regoffs[ST.paren].end = -1;
4951
4952             PL_reginput = locinput;     /* Could be reset... */
4953             REGCP_UNWIND(ST.cp);
4954             /* Couldn't or didn't -- move forward. */
4955             ST.oldloc = locinput;
4956             if (do_utf8)
4957                 locinput += UTF8SKIP(locinput);
4958             else
4959                 locinput++;
4960             ST.count++;
4961           curly_try_B_min_known:
4962              /* find the next place where 'B' could work, then call B */
4963             {
4964                 int n;
4965                 if (do_utf8) {
4966                     n = (ST.oldloc == locinput) ? 0 : 1;
4967                     if (ST.c1 == ST.c2) {
4968                         STRLEN len;
4969                         /* set n to utf8_distance(oldloc, locinput) */
4970                         while (locinput <= ST.maxpos &&
4971                                utf8n_to_uvchr((U8*)locinput,
4972                                               UTF8_MAXBYTES, &len,
4973                                               uniflags) != (UV)ST.c1) {
4974                             locinput += len;
4975                             n++;
4976                         }
4977                     }
4978                     else {
4979                         /* set n to utf8_distance(oldloc, locinput) */
4980                         while (locinput <= ST.maxpos) {
4981                             STRLEN len;
4982                             const UV c = utf8n_to_uvchr((U8*)locinput,
4983                                                   UTF8_MAXBYTES, &len,
4984                                                   uniflags);
4985                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
4986                                 break;
4987                             locinput += len;
4988                             n++;
4989                         }
4990                     }
4991                 }
4992                 else {
4993                     if (ST.c1 == ST.c2) {
4994                         while (locinput <= ST.maxpos &&
4995                                UCHARAT(locinput) != ST.c1)
4996                             locinput++;
4997                     }
4998                     else {
4999                         while (locinput <= ST.maxpos
5000                                && UCHARAT(locinput) != ST.c1
5001                                && UCHARAT(locinput) != ST.c2)
5002                             locinput++;
5003                     }
5004                     n = locinput - ST.oldloc;
5005                 }
5006                 if (locinput > ST.maxpos)
5007                     sayNO;
5008                 /* PL_reginput == oldloc now */
5009                 if (n) {
5010                     ST.count += n;
5011                     if (regrepeat(rex, ST.A, n, depth) < n)
5012                         sayNO;
5013                 }
5014                 PL_reginput = locinput;
5015                 CURLY_SETPAREN(ST.paren, ST.count);
5016                 if (cur_eval && cur_eval->u.eval.close_paren &&
5017                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5018                     goto fake_end;
5019                 }
5020                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5021             }
5022             /* NOTREACHED */
5023
5024
5025         case CURLY_B_min_fail:
5026             /* failed to find B in a non-greedy match where c1,c2 invalid */
5027             if (ST.paren && ST.count)
5028                 PL_regoffs[ST.paren].end = -1;
5029
5030             REGCP_UNWIND(ST.cp);
5031             /* failed -- move forward one */
5032             PL_reginput = locinput;
5033             if (regrepeat(rex, ST.A, 1, depth)) {
5034                 ST.count++;
5035                 locinput = PL_reginput;
5036                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5037                         ST.count > 0)) /* count overflow ? */
5038                 {
5039                   curly_try_B_min:
5040                     CURLY_SETPAREN(ST.paren, ST.count);
5041                     if (cur_eval && cur_eval->u.eval.close_paren &&
5042                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5043                         goto fake_end;
5044                     }
5045                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5046                 }
5047             }
5048             sayNO;
5049             /* NOTREACHED */
5050
5051
5052         curly_try_B_max:
5053             /* a successful greedy match: now try to match B */
5054             if (cur_eval && cur_eval->u.eval.close_paren &&
5055                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5056                 goto fake_end;
5057             }
5058             {
5059                 UV c = 0;
5060                 if (ST.c1 != CHRTEST_VOID)
5061                     c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5062                                            UTF8_MAXBYTES, 0, uniflags)
5063                                 : (UV) UCHARAT(PL_reginput);
5064                 /* If it could work, try it. */
5065                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5066                     CURLY_SETPAREN(ST.paren, ST.count);
5067                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5068                     /* NOTREACHED */
5069                 }
5070             }
5071             /* FALL THROUGH */
5072         case CURLY_B_max_fail:
5073             /* failed to find B in a greedy match */
5074             if (ST.paren && ST.count)
5075                 PL_regoffs[ST.paren].end = -1;
5076
5077             REGCP_UNWIND(ST.cp);
5078             /*  back up. */
5079             if (--ST.count < ST.min)
5080                 sayNO;
5081             PL_reginput = locinput = HOPc(locinput, -1);
5082             goto curly_try_B_max;
5083
5084 #undef ST
5085
5086         case END:
5087             fake_end:
5088             if (cur_eval) {
5089                 /* we've just finished A in /(??{A})B/; now continue with B */
5090                 I32 tmpix;
5091                 st->u.eval.toggle_reg_flags
5092                             = cur_eval->u.eval.toggle_reg_flags;
5093                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5094
5095                 st->u.eval.prev_rex = rex_sv;           /* inner */
5096                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5097                 rex = (struct regexp *)SvANY(rex_sv);
5098                 rexi = RXi_GET(rex);
5099                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5100                 ReREFCNT_inc(rex_sv);
5101                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5102
5103                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5104                 PL_reglastparen = &rex->lastparen;
5105                 PL_reglastcloseparen = &rex->lastcloseparen;
5106
5107                 REGCP_SET(st->u.eval.lastcp);
5108                 PL_reginput = locinput;
5109
5110                 /* Restore parens of the outer rex without popping the
5111                  * savestack */
5112                 tmpix = PL_savestack_ix;
5113                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5114                 regcppop(rex);
5115                 PL_savestack_ix = tmpix;
5116
5117                 st->u.eval.prev_eval = cur_eval;
5118                 cur_eval = cur_eval->u.eval.prev_eval;
5119                 DEBUG_EXECUTE_r(
5120                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5121                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5122                 if ( nochange_depth )
5123                     nochange_depth--;
5124
5125                 PUSH_YES_STATE_GOTO(EVAL_AB,
5126                         st->u.eval.prev_eval->u.eval.B); /* match B */
5127             }
5128
5129             if (locinput < reginfo->till) {
5130                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5131                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5132                                       PL_colors[4],
5133                                       (long)(locinput - PL_reg_starttry),
5134                                       (long)(reginfo->till - PL_reg_starttry),
5135                                       PL_colors[5]));
5136
5137                 sayNO_SILENT;           /* Cannot match: too short. */
5138             }
5139             PL_reginput = locinput;     /* put where regtry can find it */
5140             sayYES;                     /* Success! */
5141
5142         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5143             DEBUG_EXECUTE_r(
5144             PerlIO_printf(Perl_debug_log,
5145                 "%*s  %ssubpattern success...%s\n",
5146                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5147             PL_reginput = locinput;     /* put where regtry can find it */
5148             sayYES;                     /* Success! */
5149
5150 #undef  ST
5151 #define ST st->u.ifmatch
5152
5153         case SUSPEND:   /* (?>A) */
5154             ST.wanted = 1;
5155             PL_reginput = locinput;
5156             goto do_ifmatch;
5157
5158         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5159             ST.wanted = 0;
5160             goto ifmatch_trivial_fail_test;
5161
5162         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5163             ST.wanted = 1;
5164           ifmatch_trivial_fail_test:
5165             if (scan->flags) {
5166                 char * const s = HOPBACKc(locinput, scan->flags);
5167                 if (!s) {
5168                     /* trivial fail */
5169                     if (logical) {
5170                         logical = 0;
5171                         sw = 1 - cBOOL(ST.wanted);
5172                     }
5173                     else if (ST.wanted)
5174                         sayNO;
5175                     next = scan + ARG(scan);
5176                     if (next == scan)
5177                         next = NULL;
5178                     break;
5179                 }
5180                 PL_reginput = s;
5181             }
5182             else
5183                 PL_reginput = locinput;
5184
5185           do_ifmatch:
5186             ST.me = scan;
5187             ST.logical = logical;
5188             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5189
5190             /* execute body of (?...A) */
5191             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5192             /* NOTREACHED */
5193
5194         case IFMATCH_A_fail: /* body of (?...A) failed */
5195             ST.wanted = !ST.wanted;
5196             /* FALL THROUGH */
5197
5198         case IFMATCH_A: /* body of (?...A) succeeded */
5199             if (ST.logical) {
5200                 sw = cBOOL(ST.wanted);
5201             }
5202             else if (!ST.wanted)
5203                 sayNO;
5204
5205             if (OP(ST.me) == SUSPEND)
5206                 locinput = PL_reginput;
5207             else {
5208                 locinput = PL_reginput = st->locinput;
5209                 nextchr = UCHARAT(locinput);
5210             }
5211             scan = ST.me + ARG(ST.me);
5212             if (scan == ST.me)
5213                 scan = NULL;
5214             continue; /* execute B */
5215
5216 #undef ST
5217
5218         case LONGJMP:
5219             next = scan + ARG(scan);
5220             if (next == scan)
5221                 next = NULL;
5222             break;
5223         case COMMIT:
5224             reginfo->cutpoint = PL_regeol;
5225             /* FALLTHROUGH */
5226         case PRUNE:
5227             PL_reginput = locinput;
5228             if (!scan->flags)
5229                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5230             PUSH_STATE_GOTO(COMMIT_next,next);
5231             /* NOTREACHED */
5232         case COMMIT_next_fail:
5233             no_final = 1;
5234             /* FALLTHROUGH */
5235         case OPFAIL:
5236             sayNO;
5237             /* NOTREACHED */
5238
5239 #define ST st->u.mark
5240         case MARKPOINT:
5241             ST.prev_mark = mark_state;
5242             ST.mark_name = sv_commit = sv_yes_mark
5243                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5244             mark_state = st;
5245             ST.mark_loc = PL_reginput = locinput;
5246             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5247             /* NOTREACHED */
5248         case MARKPOINT_next:
5249             mark_state = ST.prev_mark;
5250             sayYES;
5251             /* NOTREACHED */
5252         case MARKPOINT_next_fail:
5253             if (popmark && sv_eq(ST.mark_name,popmark))
5254             {
5255                 if (ST.mark_loc > startpoint)
5256                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5257                 popmark = NULL; /* we found our mark */
5258                 sv_commit = ST.mark_name;
5259
5260                 DEBUG_EXECUTE_r({
5261                         PerlIO_printf(Perl_debug_log,
5262                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5263                             REPORT_CODE_OFF+depth*2, "",
5264                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5265                 });
5266             }
5267             mark_state = ST.prev_mark;
5268             sv_yes_mark = mark_state ?
5269                 mark_state->u.mark.mark_name : NULL;
5270             sayNO;
5271             /* NOTREACHED */
5272         case SKIP:
5273             PL_reginput = locinput;
5274             if (scan->flags) {
5275                 /* (*SKIP) : if we fail we cut here*/
5276                 ST.mark_name = NULL;
5277                 ST.mark_loc = locinput;
5278                 PUSH_STATE_GOTO(SKIP_next,next);
5279             } else {
5280                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5281                    otherwise do nothing.  Meaning we need to scan
5282                  */
5283                 regmatch_state *cur = mark_state;
5284                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5285
5286                 while (cur) {
5287                     if ( sv_eq( cur->u.mark.mark_name,
5288                                 find ) )
5289                     {
5290                         ST.mark_name = find;
5291                         PUSH_STATE_GOTO( SKIP_next, next );
5292                     }
5293                     cur = cur->u.mark.prev_mark;
5294                 }
5295             }
5296             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5297             break;
5298         case SKIP_next_fail:
5299             if (ST.mark_name) {
5300                 /* (*CUT:NAME) - Set up to search for the name as we
5301                    collapse the stack*/
5302                 popmark = ST.mark_name;
5303             } else {
5304                 /* (*CUT) - No name, we cut here.*/
5305                 if (ST.mark_loc > startpoint)
5306                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5307                 /* but we set sv_commit to latest mark_name if there
5308                    is one so they can test to see how things lead to this
5309                    cut */
5310                 if (mark_state)
5311                     sv_commit=mark_state->u.mark.mark_name;
5312             }
5313             no_final = 1;
5314             sayNO;
5315             /* NOTREACHED */
5316 #undef ST
5317         case FOLDCHAR:
5318             n = ARG(scan);
5319             if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5320                 locinput += ln;
5321             } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5322                 sayNO;
5323             } else  {
5324                 U8 folded[UTF8_MAXBYTES_CASE+1];
5325                 STRLEN foldlen;
5326                 const char * const l = locinput;
5327                 char *e = PL_regeol;
5328                 to_uni_fold(n, folded, &foldlen);
5329
5330                 if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
5331                                l, &e, 0,  do_utf8)) {
5332                         sayNO;
5333                 }
5334                 locinput = e;
5335             }
5336             nextchr = UCHARAT(locinput);
5337             break;
5338         case LNBREAK:
5339             if ((n=is_LNBREAK(locinput,do_utf8))) {
5340                 locinput += n;
5341                 nextchr = UCHARAT(locinput);
5342             } else
5343                 sayNO;
5344             break;
5345
5346 #define CASE_CLASS(nAmE)                              \
5347         case nAmE:                                    \
5348             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5349                 locinput += n;                        \
5350                 nextchr = UCHARAT(locinput);          \
5351             } else                                    \
5352                 sayNO;                                \
5353             break;                                    \
5354         case N##nAmE:                                 \
5355             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5356                 sayNO;                                \
5357             } else {                                  \
5358                 locinput += UTF8SKIP(locinput);       \
5359                 nextchr = UCHARAT(locinput);          \
5360             }                                         \
5361             break
5362
5363         CASE_CLASS(VERTWS);
5364         CASE_CLASS(HORIZWS);
5365 #undef CASE_CLASS
5366
5367         default:
5368             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5369                           PTR2UV(scan), OP(scan));
5370             Perl_croak(aTHX_ "regexp memory corruption");
5371
5372         } /* end switch */
5373
5374         /* switch break jumps here */
5375         scan = next; /* prepare to execute the next op and ... */
5376         continue;    /* ... jump back to the top, reusing st */
5377         /* NOTREACHED */
5378
5379       push_yes_state:
5380         /* push a state that backtracks on success */
5381         st->u.yes.prev_yes_state = yes_state;
5382         yes_state = st;
5383         /* FALL THROUGH */
5384       push_state:
5385         /* push a new regex state, then continue at scan  */
5386         {
5387             regmatch_state *newst;
5388
5389             DEBUG_STACK_r({
5390                 regmatch_state *cur = st;
5391                 regmatch_state *curyes = yes_state;
5392                 int curd = depth;
5393                 regmatch_slab *slab = PL_regmatch_slab;
5394                 for (;curd > -1;cur--,curd--) {
5395                     if (cur < SLAB_FIRST(slab)) {
5396                         slab = slab->prev;
5397                         cur = SLAB_LAST(slab);
5398                     }
5399                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5400                         REPORT_CODE_OFF + 2 + depth * 2,"",
5401                         curd, PL_reg_name[cur->resume_state],
5402                         (curyes == cur) ? "yes" : ""
5403                     );
5404                     if (curyes == cur)
5405                         curyes = cur->u.yes.prev_yes_state;
5406                 }
5407             } else
5408                 DEBUG_STATE_pp("push")
5409             );
5410             depth++;
5411             st->locinput = locinput;
5412             newst = st+1;
5413             if (newst >  SLAB_LAST(PL_regmatch_slab))
5414                 newst = S_push_slab(aTHX);
5415             PL_regmatch_state = newst;
5416
5417             locinput = PL_reginput;
5418             nextchr = UCHARAT(locinput);
5419             st = newst;
5420             continue;
5421             /* NOTREACHED */
5422         }
5423     }
5424
5425     /*
5426     * We get here only if there's trouble -- normally "case END" is
5427     * the terminating point.
5428     */
5429     Perl_croak(aTHX_ "corrupted regexp pointers");
5430     /*NOTREACHED*/
5431     sayNO;
5432
5433 yes:
5434     if (yes_state) {
5435         /* we have successfully completed a subexpression, but we must now
5436          * pop to the state marked by yes_state and continue from there */
5437         assert(st != yes_state);
5438 #ifdef DEBUGGING
5439         while (st != yes_state) {
5440             st--;
5441             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5442                 PL_regmatch_slab = PL_regmatch_slab->prev;
5443                 st = SLAB_LAST(PL_regmatch_slab);
5444             }
5445             DEBUG_STATE_r({
5446                 if (no_final) {
5447                     DEBUG_STATE_pp("pop (no final)");
5448                 } else {
5449                     DEBUG_STATE_pp("pop (yes)");
5450                 }
5451             });
5452             depth--;
5453         }
5454 #else
5455         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5456             || yes_state > SLAB_LAST(PL_regmatch_slab))
5457         {
5458             /* not in this slab, pop slab */
5459             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5460             PL_regmatch_slab = PL_regmatch_slab->prev;
5461             st = SLAB_LAST(PL_regmatch_slab);
5462         }
5463         depth -= (st - yes_state);
5464 #endif
5465         st = yes_state;
5466         yes_state = st->u.yes.prev_yes_state;
5467         PL_regmatch_state = st;
5468
5469         if (no_final) {
5470             locinput= st->locinput;
5471             nextchr = UCHARAT(locinput);
5472         }
5473         state_num = st->resume_state + no_final;
5474         goto reenter_switch;
5475     }
5476
5477     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5478                           PL_colors[4], PL_colors[5]));
5479
5480     if (PL_reg_eval_set) {
5481         /* each successfully executed (?{...}) block does the equivalent of
5482          *   local $^R = do {...}
5483          * When popping the save stack, all these locals would be undone;
5484          * bypass this by setting the outermost saved $^R to the latest
5485          * value */
5486         if (oreplsv != GvSV(PL_replgv))
5487             sv_setsv(oreplsv, GvSV(PL_replgv));
5488     }
5489     result = 1;
5490     goto final_exit;
5491
5492 no:
5493     DEBUG_EXECUTE_r(
5494         PerlIO_printf(Perl_debug_log,
5495             "%*s  %sfailed...%s\n",
5496             REPORT_CODE_OFF+depth*2, "",
5497             PL_colors[4], PL_colors[5])
5498         );
5499
5500 no_silent:
5501     if (no_final) {
5502         if (yes_state) {
5503             goto yes;
5504         } else {
5505             goto final_exit;
5506         }
5507     }
5508     if (depth) {
5509         /* there's a previous state to backtrack to */
5510         st--;
5511         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5512             PL_regmatch_slab = PL_regmatch_slab->prev;
5513             st = SLAB_LAST(PL_regmatch_slab);
5514         }
5515         PL_regmatch_state = st;
5516         locinput= st->locinput;
5517         nextchr = UCHARAT(locinput);
5518
5519         DEBUG_STATE_pp("pop");
5520         depth--;
5521         if (yes_state == st)
5522             yes_state = st->u.yes.prev_yes_state;
5523
5524         state_num = st->resume_state + 1; /* failure = success + 1 */
5525         goto reenter_switch;
5526     }
5527     result = 0;
5528
5529   final_exit:
5530     if (rex->intflags & PREGf_VERBARG_SEEN) {
5531         SV *sv_err = get_sv("REGERROR", 1);
5532         SV *sv_mrk = get_sv("REGMARK", 1);
5533         if (result) {
5534             sv_commit = &PL_sv_no;
5535             if (!sv_yes_mark)
5536                 sv_yes_mark = &PL_sv_yes;
5537         } else {
5538             if (!sv_commit)
5539                 sv_commit = &PL_sv_yes;
5540             sv_yes_mark = &PL_sv_no;
5541         }
5542         sv_setsv(sv_err, sv_commit);
5543         sv_setsv(sv_mrk, sv_yes_mark);
5544     }
5545
5546     /* clean up; in particular, free all slabs above current one */
5547     LEAVE_SCOPE(oldsave);
5548
5549     return result;
5550 }
5551
5552 /*
5553  - regrepeat - repeatedly match something simple, report how many
5554  */
5555 /*
5556  * [This routine now assumes that it will only match on things of length 1.
5557  * That was true before, but now we assume scan - reginput is the count,
5558  * rather than incrementing count on every character.  [Er, except utf8.]]
5559  */
5560 STATIC I32
5561 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5562 {
5563     dVAR;
5564     register char *scan;
5565     register I32 c;
5566     register char *loceol = PL_regeol;
5567     register I32 hardcount = 0;
5568     register bool do_utf8 = PL_reg_match_utf8;
5569 #ifndef DEBUGGING
5570     PERL_UNUSED_ARG(depth);
5571 #endif
5572
5573     PERL_ARGS_ASSERT_REGREPEAT;
5574
5575     scan = PL_reginput;
5576     if (max == REG_INFTY)
5577         max = I32_MAX;
5578     else if (max < loceol - scan)
5579         loceol = scan + max;
5580     switch (OP(p)) {
5581     case REG_ANY:
5582         if (do_utf8) {
5583             loceol = PL_regeol;
5584             while (scan < loceol && hardcount < max && *scan != '\n') {
5585                 scan += UTF8SKIP(scan);
5586                 hardcount++;
5587             }
5588         } else {
5589             while (scan < loceol && *scan != '\n')
5590                 scan++;
5591         }
5592         break;
5593     case SANY:
5594         if (do_utf8) {
5595             loceol = PL_regeol;
5596             while (scan < loceol && hardcount < max) {
5597                 scan += UTF8SKIP(scan);
5598                 hardcount++;
5599             }
5600         }
5601         else
5602             scan = loceol;
5603         break;
5604     case CANY:
5605         scan = loceol;
5606         break;
5607     case EXACT:         /* length of string is 1 */
5608         c = (U8)*STRING(p);
5609         while (scan < loceol && UCHARAT(scan) == c)
5610             scan++;
5611         break;
5612     case EXACTF:        /* length of string is 1 */
5613         c = (U8)*STRING(p);
5614         while (scan < loceol &&
5615                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5616             scan++;
5617         break;
5618     case EXACTFL:       /* length of string is 1 */
5619         PL_reg_flags |= RF_tainted;
5620         c = (U8)*STRING(p);
5621         while (scan < loceol &&
5622                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5623             scan++;
5624         break;
5625     case ANYOF:
5626         if (do_utf8) {
5627             loceol = PL_regeol;
5628             while (hardcount < max && scan < loceol &&
5629                    reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5630                 scan += UTF8SKIP(scan);
5631                 hardcount++;
5632             }
5633         } else {
5634             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5635                 scan++;
5636         }
5637         break;
5638     case ALNUM:
5639         if (do_utf8) {
5640             loceol = PL_regeol;
5641             LOAD_UTF8_CHARCLASS_ALNUM();
5642             while (hardcount < max && scan < loceol &&
5643                    swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5644                 scan += UTF8SKIP(scan);
5645                 hardcount++;
5646             }
5647         } else {
5648             while (scan < loceol && isALNUM(*scan))
5649                 scan++;
5650         }
5651         break;
5652     case ALNUML:
5653         PL_reg_flags |= RF_tainted;
5654         if (do_utf8) {
5655             loceol = PL_regeol;
5656             while (hardcount < max && scan < loceol &&
5657                    isALNUM_LC_utf8((U8*)scan)) {
5658                 scan += UTF8SKIP(scan);
5659                 hardcount++;
5660             }
5661         } else {
5662             while (scan < loceol && isALNUM_LC(*scan))
5663                 scan++;
5664         }
5665         break;
5666     case NALNUM:
5667         if (do_utf8) {
5668             loceol = PL_regeol;
5669             LOAD_UTF8_CHARCLASS_ALNUM();
5670             while (hardcount < max && scan < loceol &&
5671                    !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5672                 scan += UTF8SKIP(scan);
5673                 hardcount++;
5674             }
5675         } else {
5676             while (scan < loceol && !isALNUM(*scan))
5677                 scan++;
5678         }
5679         break;
5680     case NALNUML:
5681         PL_reg_flags |= RF_tainted;
5682         if (do_utf8) {
5683             loceol = PL_regeol;
5684             while (hardcount < max && scan < loceol &&
5685                    !isALNUM_LC_utf8((U8*)scan)) {
5686                 scan += UTF8SKIP(scan);
5687                 hardcount++;
5688             }
5689         } else {
5690             while (scan < loceol && !isALNUM_LC(*scan))
5691                 scan++;
5692         }
5693         break;
5694     case SPACE:
5695         if (do_utf8) {
5696             loceol = PL_regeol;
5697             LOAD_UTF8_CHARCLASS_SPACE();
5698             while (hardcount < max && scan < loceol &&
5699                    (*scan == ' ' ||
5700                     swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5701                 scan += UTF8SKIP(scan);
5702                 hardcount++;
5703             }
5704         } else {
5705             while (scan < loceol && isSPACE(*scan))
5706                 scan++;
5707         }
5708         break;
5709     case SPACEL:
5710         PL_reg_flags |= RF_tainted;
5711         if (do_utf8) {
5712             loceol = PL_regeol;
5713             while (hardcount < max && scan < loceol &&
5714                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5715                 scan += UTF8SKIP(scan);
5716                 hardcount++;
5717             }
5718         } else {
5719             while (scan < loceol && isSPACE_LC(*scan))
5720                 scan++;
5721         }
5722         break;
5723     case NSPACE:
5724         if (do_utf8) {
5725             loceol = PL_regeol;
5726             LOAD_UTF8_CHARCLASS_SPACE();
5727             while (hardcount < max && scan < loceol &&
5728                    !(*scan == ' ' ||
5729                      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5730                 scan += UTF8SKIP(scan);
5731                 hardcount++;
5732             }
5733         } else {
5734             while (scan < loceol && !isSPACE(*scan))
5735                 scan++;
5736         }
5737         break;
5738     case NSPACEL:
5739         PL_reg_flags |= RF_tainted;
5740         if (do_utf8) {
5741             loceol = PL_regeol;
5742             while (hardcount < max && scan < loceol &&
5743                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5744                 scan += UTF8SKIP(scan);
5745                 hardcount++;
5746             }
5747         } else {
5748             while (scan < loceol && !isSPACE_LC(*scan))
5749                 scan++;
5750         }
5751         break;
5752     case DIGIT:
5753         if (do_utf8) {
5754             loceol = PL_regeol;
5755             LOAD_UTF8_CHARCLASS_DIGIT();
5756             while (hardcount < max && scan < loceol &&
5757                    swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5758                 scan += UTF8SKIP(scan);
5759                 hardcount++;
5760             }
5761         } else {
5762             while (scan < loceol && isDIGIT(*scan))
5763                 scan++;
5764         }
5765         break;
5766     case NDIGIT:
5767         if (do_utf8) {
5768             loceol = PL_regeol;
5769             LOAD_UTF8_CHARCLASS_DIGIT();
5770             while (hardcount < max && scan < loceol &&
5771                    !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5772                 scan += UTF8SKIP(scan);
5773                 hardcount++;
5774             }
5775         } else {
5776             while (scan < loceol && !isDIGIT(*scan))
5777                 scan++;
5778         }
5779     case LNBREAK:
5780         if (do_utf8) {
5781             loceol = PL_regeol;
5782             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5783                 scan += c;
5784                 hardcount++;
5785             }
5786         } else {
5787             /*
5788               LNBREAK can match two latin chars, which is ok,
5789               because we have a null terminated string, but we
5790               have to use hardcount in this situation
5791             */
5792             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5793                 scan+=c;
5794                 hardcount++;
5795             }
5796         }
5797         break;
5798     case HORIZWS:
5799         if (do_utf8) {
5800             loceol = PL_regeol;
5801             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5802                 scan += c;
5803                 hardcount++;
5804             }
5805         } else {
5806             while (scan < loceol && is_HORIZWS_latin1(scan))
5807                 scan++;
5808         }
5809         break;
5810     case NHORIZWS:
5811         if (do_utf8) {
5812             loceol = PL_regeol;
5813             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5814                 scan += UTF8SKIP(scan);
5815                 hardcount++;
5816             }
5817         } else {
5818             while (scan < loceol && !is_HORIZWS_latin1(scan))
5819                 scan++;
5820
5821         }
5822         break;
5823     case VERTWS:
5824         if (do_utf8) {
5825             loceol = PL_regeol;
5826             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5827                 scan += c;
5828                 hardcount++;
5829             }
5830         } else {
5831             while (scan < loceol && is_VERTWS_latin1(scan))
5832                 scan++;
5833
5834         }
5835         break;
5836     case NVERTWS:
5837         if (do_utf8) {
5838             loceol = PL_regeol;
5839             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5840                 scan += UTF8SKIP(scan);
5841                 hardcount++;
5842             }
5843         } else {
5844             while (scan < loceol && !is_VERTWS_latin1(scan))
5845                 scan++;
5846
5847         }
5848         break;
5849
5850     default:            /* Called on something of 0 width. */
5851         break;          /* So match right here or not at all. */
5852     }
5853
5854     if (hardcount)
5855         c = hardcount;
5856     else
5857         c = scan - PL_reginput;
5858     PL_reginput = scan;
5859
5860     DEBUG_r({
5861         GET_RE_DEBUG_FLAGS_DECL;
5862         DEBUG_EXECUTE_r({
5863             SV * const prop = sv_newmortal();
5864             regprop(prog, prop, p);
5865             PerlIO_printf(Perl_debug_log,
5866                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
5867                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5868         });
5869     });
5870
5871     return(c);
5872 }
5873
5874
5875 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5876 /*
5877 - regclass_swash - prepare the utf8 swash
5878 */
5879
5880 SV *
5881 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5882 {
5883     dVAR;
5884     SV *sw  = NULL;
5885     SV *si  = NULL;
5886     SV *alt = NULL;
5887     RXi_GET_DECL(prog,progi);
5888     const struct reg_data * const data = prog ? progi->data : NULL;
5889
5890     PERL_ARGS_ASSERT_REGCLASS_SWASH;
5891
5892     if (data && data->count) {
5893         const U32 n = ARG(node);
5894
5895         if (data->what[n] == 's') {
5896             SV * const rv = MUTABLE_SV(data->data[n]);
5897             AV * const av = MUTABLE_AV(SvRV(rv));
5898             SV **const ary = AvARRAY(av);
5899             SV **a, **b;
5900
5901             /* See the end of regcomp.c:S_regclass() for
5902              * documentation of these array elements. */
5903
5904             si = *ary;
5905             a  = SvROK(ary[1]) ? &ary[1] : NULL;
5906             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5907
5908             if (a)
5909                 sw = *a;
5910             else if (si && doinit) {
5911                 sw = swash_init("utf8", "", si, 1, 0);
5912                 (void)av_store(av, 1, sw);
5913             }
5914             if (b)
5915                 alt = *b;
5916         }
5917     }
5918
5919     if (listsvp)
5920         *listsvp = si;
5921     if (altsvp)
5922         *altsvp  = alt;
5923
5924     return sw;
5925 }
5926 #endif
5927
5928 /*
5929  - reginclass - determine if a character falls into a character class
5930
5931   The n is the ANYOF regnode, the p is the target string, lenp
5932   is pointer to the maximum length of how far to go in the p
5933   (if the lenp is zero, UTF8SKIP(p) is used),
5934   do_utf8 tells whether the target string is in UTF-8.
5935
5936  */
5937
5938 STATIC bool
5939 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5940 {
5941     dVAR;
5942     const char flags = ANYOF_FLAGS(n);
5943     bool match = FALSE;
5944     UV c = *p;
5945     STRLEN len = 0;
5946     STRLEN plen;
5947
5948     PERL_ARGS_ASSERT_REGINCLASS;
5949
5950     if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5951         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5952                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5953                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5954                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5955                  * UTF8_ALLOW_FFFF */
5956         if (len == (STRLEN)-1)
5957             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5958     }
5959
5960     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5961     if (do_utf8 || (flags & ANYOF_UNICODE)) {
5962         if (lenp)
5963             *lenp = 0;
5964         if (do_utf8 && !ANYOF_RUNTIME(n)) {
5965             if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5966                 match = TRUE;
5967         }
5968         if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5969             match = TRUE;
5970         if (!match) {
5971             AV *av;
5972             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5973
5974             if (sw) {
5975                 U8 * utf8_p;
5976                 if (do_utf8) {
5977                     utf8_p = (U8 *) p;
5978                 } else {
5979                     STRLEN len = 1;
5980                     utf8_p = bytes_to_utf8(p, &len);
5981                 }
5982                 if (swash_fetch(sw, utf8_p, 1))
5983                     match = TRUE;
5984                 else if (flags & ANYOF_FOLD) {
5985                     if (!match && lenp && av) {
5986                         I32 i;
5987                         for (i = 0; i <= av_len(av); i++) {
5988                             SV* const sv = *av_fetch(av, i, FALSE);
5989                             STRLEN len;
5990                             const char * const s = SvPV_const(sv, len);
5991                             if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
5992                                 *lenp = len;
5993                                 match = TRUE;
5994                                 break;
5995                             }
5996                         }
5997                     }
5998                     if (!match) {
5999                         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6000
6001                         STRLEN tmplen;
6002                         to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6003                         if (swash_fetch(sw, tmpbuf, 1))
6004                             match = TRUE;
6005                     }
6006                 }
6007
6008                 /* If we allocated a string above, free it */
6009                 if (! do_utf8) Safefree(utf8_p);
6010             }
6011         }
6012         if (match && lenp && *lenp == 0)
6013             *lenp = UNISKIP(NATIVE_TO_UNI(c));
6014     }
6015     if (!match && c < 256) {
6016         if (ANYOF_BITMAP_TEST(n, c))
6017             match = TRUE;
6018         else if (flags & ANYOF_FOLD) {
6019             U8 f;
6020
6021             if (flags & ANYOF_LOCALE) {
6022                 PL_reg_flags |= RF_tainted;
6023                 f = PL_fold_locale[c];
6024             }
6025             else
6026                 f = PL_fold[c];
6027             if (f != c && ANYOF_BITMAP_TEST(n, f))
6028                 match = TRUE;
6029         }
6030
6031         if (!match && (flags & ANYOF_CLASS)) {
6032             PL_reg_flags |= RF_tainted;
6033             if (
6034                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6035                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6036                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6037                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6038                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6039                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6040                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6041                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6042                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6043                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6044                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6045                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6046                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6047                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6048                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6049                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6050                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6051                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6052                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6053                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6054                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6055                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6056                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6057                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6058                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6059                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6060                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6061                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6062                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6063                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6064                 ) /* How's that for a conditional? */
6065             {
6066                 match = TRUE;
6067             }
6068         }
6069     }
6070
6071     return (flags & ANYOF_INVERT) ? !match : match;
6072 }
6073
6074 STATIC U8 *
6075 S_reghop3(U8 *s, I32 off, const U8* lim)
6076 {
6077     dVAR;
6078
6079     PERL_ARGS_ASSERT_REGHOP3;
6080
6081     if (off >= 0) {
6082         while (off-- && s < lim) {
6083             /* XXX could check well-formedness here */
6084             s += UTF8SKIP(s);
6085         }
6086     }
6087     else {
6088         while (off++ && s > lim) {
6089             s--;
6090             if (UTF8_IS_CONTINUED(*s)) {
6091                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6092                     s--;
6093             }
6094             /* XXX could check well-formedness here */
6095         }
6096     }
6097     return s;
6098 }
6099
6100 #ifdef XXX_dmq
6101 /* there are a bunch of places where we use two reghop3's that should
6102    be replaced with this routine. but since thats not done yet
6103    we ifdef it out - dmq
6104 */
6105 STATIC U8 *
6106 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6107 {
6108     dVAR;
6109
6110     PERL_ARGS_ASSERT_REGHOP4;
6111
6112     if (off >= 0) {
6113         while (off-- && s < rlim) {
6114             /* XXX could check well-formedness here */
6115             s += UTF8SKIP(s);
6116         }
6117     }
6118     else {
6119         while (off++ && s > llim) {
6120             s--;
6121             if (UTF8_IS_CONTINUED(*s)) {
6122                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6123                     s--;
6124             }
6125             /* XXX could check well-formedness here */
6126         }
6127     }
6128     return s;
6129 }
6130 #endif
6131
6132 STATIC U8 *
6133 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6134 {
6135     dVAR;
6136
6137     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6138
6139     if (off >= 0) {
6140         while (off-- && s < lim) {
6141             /* XXX could check well-formedness here */
6142             s += UTF8SKIP(s);
6143         }
6144         if (off >= 0)
6145             return NULL;
6146     }
6147     else {
6148         while (off++ && s > lim) {
6149             s--;
6150             if (UTF8_IS_CONTINUED(*s)) {
6151                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6152                     s--;
6153             }
6154             /* XXX could check well-formedness here */
6155         }
6156         if (off <= 0)
6157             return NULL;
6158     }
6159     return s;
6160 }
6161
6162 static void
6163 restore_pos(pTHX_ void *arg)
6164 {
6165     dVAR;
6166     regexp * const rex = (regexp *)arg;
6167     if (PL_reg_eval_set) {
6168         if (PL_reg_oldsaved) {
6169             rex->subbeg = PL_reg_oldsaved;
6170             rex->sublen = PL_reg_oldsavedlen;
6171 #ifdef PERL_OLD_COPY_ON_WRITE
6172             rex->saved_copy = PL_nrs;
6173 #endif
6174             RXp_MATCH_COPIED_on(rex);
6175         }
6176         PL_reg_magic->mg_len = PL_reg_oldpos;
6177         PL_reg_eval_set = 0;
6178         PL_curpm = PL_reg_oldcurpm;
6179     }
6180 }
6181
6182 STATIC void
6183 S_to_utf8_substr(pTHX_ register regexp *prog)
6184 {
6185     int i = 1;
6186
6187     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6188
6189     do {
6190         if (prog->substrs->data[i].substr
6191             && !prog->substrs->data[i].utf8_substr) {
6192             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6193             prog->substrs->data[i].utf8_substr = sv;
6194             sv_utf8_upgrade(sv);
6195             if (SvVALID(prog->substrs->data[i].substr)) {
6196                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6197                 if (flags & FBMcf_TAIL) {
6198                     /* Trim the trailing \n that fbm_compile added last
6199                        time.  */
6200                     SvCUR_set(sv, SvCUR(sv) - 1);
6201                     /* Whilst this makes the SV technically "invalid" (as its
6202                        buffer is no longer followed by "\0") when fbm_compile()
6203                        adds the "\n" back, a "\0" is restored.  */
6204                 }
6205                 fbm_compile(sv, flags);
6206             }
6207             if (prog->substrs->data[i].substr == prog->check_substr)
6208                 prog->check_utf8 = sv;
6209         }
6210     } while (i--);
6211 }
6212
6213 STATIC void
6214 S_to_byte_substr(pTHX_ register regexp *prog)
6215 {
6216     dVAR;
6217     int i = 1;
6218
6219     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6220
6221     do {
6222         if (prog->substrs->data[i].utf8_substr
6223             && !prog->substrs->data[i].substr) {
6224             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6225             if (sv_utf8_downgrade(sv, TRUE)) {
6226                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6227                     const U8 flags
6228                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6229                     if (flags & FBMcf_TAIL) {
6230                         /* Trim the trailing \n that fbm_compile added last
6231                            time.  */
6232                         SvCUR_set(sv, SvCUR(sv) - 1);
6233                     }
6234                     fbm_compile(sv, flags);
6235                 }
6236             } else {
6237                 SvREFCNT_dec(sv);
6238                 sv = &PL_sv_undef;
6239             }
6240             prog->substrs->data[i].substr = sv;
6241             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6242                 prog->check_substr = sv;
6243         }
6244     } while (i--);
6245 }
6246
6247 /*
6248  * Local variables:
6249  * c-indentation-style: bsd
6250  * c-basic-offset: 4
6251  * indent-tabs-mode: t
6252  * End:
6253  *
6254  * ex: set ts=8 sts=4 sw=4 noet:
6255  */