regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
  98
  99 /*
 100  * Forwards.
 101  */
 102
 103 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
 104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 105
 106 #define HOPc(pos,off) \
 107         (char *)(PL_reg_match_utf8 \
 108             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 109             : (U8*)(pos + off))
 110 #define HOPBACKc(pos, off) \
 111         (char*)(PL_reg_match_utf8\
 112             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 113             : (pos - off >= PL_bostr)           \
 114                 ? (U8*)pos - off                \
 115                 : NULL)
 116
 117 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 118 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 119
 120 /* these are unrolled below in the CCC_TRY_XXX defined */
 121 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 122     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 123
 124 /* Doesn't do an assert to verify that is correct */
 125 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 127
 128 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 129 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 130 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 131
 132 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 133         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 134         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 135         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 136             * assert should likely and hopefully fail on an EBCDIC machine */ \
 137         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 138                                                                             \
 139         /* No asserts are done for these, in case called on an early        \
 140             * Unicode version in which they map to nothing */               \
 141         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 142         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 143         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 144         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 148
 149 /*
 150    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 151    so that it is possible to override the option here without having to
 152    rebuild the entire core. as we are required to do if we change regcomp.h
 153    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 154 */
 155 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 156 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 157 #endif
 158
 159 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 160 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 161 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 162 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 163 #define RE_utf8_perl_word   PL_utf8_alnum
 164 #define RE_utf8_perl_space  PL_utf8_space
 165 #define RE_utf8_posix_digit PL_utf8_digit
 166 #define perl_word  alnum
 167 #define perl_space space
 168 #define posix_digit digit
 169 #else
 170 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 171 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 172 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 173 #define RE_utf8_perl_word   PL_utf8_perl_word
 174 #define RE_utf8_perl_space  PL_utf8_perl_space
 175 #define RE_utf8_posix_digit PL_utf8_posix_digit
 176 #endif
 177
 178
 179 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
 180         case NAMEL:                                                              \
 181             PL_reg_flags |= RF_tainted;                                                 \
 182             /* FALL THROUGH */                                                          \
 183         case NAME:                                                                     \
 184             if (!nextchr)                                                               \
 185                 sayNO;                                                                  \
 186             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 187                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 188                     bool ok;                                                            \
 189                     ENTER;                                                              \
 190                     save_re_context();                                                  \
 191                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 192                     assert(ok);                                                         \
 193                     LEAVE;                                                              \
 194                 }                                                                       \
 195                 if (!(OP(scan) == NAME                                                  \
 196                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8))  \
 197                     : LCFUNC_utf8((U8*)locinput)))                                      \
 198                 {                                                                       \
 199                     sayNO;                                                              \
 200                 }                                                                       \
 201                 locinput += PL_utf8skip[nextchr];                                       \
 202                 nextchr = UCHARAT(locinput);                                            \
 203                 break;                                                                  \
 204             }                                                                           \
 205             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
 206                 sayNO;                                                                  \
 207             nextchr = UCHARAT(++locinput);                                              \
 208             break
 209
 210 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
 211         case NAMEL:                                                              \
 212             PL_reg_flags |= RF_tainted;                                                 \
 213             /* FALL THROUGH */                                                          \
 214         case NAME :                                                                     \
 215             if (!nextchr && locinput >= PL_regeol)                                      \
 216                 sayNO;                                                                  \
 217             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 218                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 219                     bool ok;                                                            \
 220                     ENTER;                                                              \
 221                     save_re_context();                                                  \
 222                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 223                     assert(ok);                                                         \
 224                     LEAVE;                                                              \
 225                 }                                                                       \
 226                 if ((OP(scan) == NAME                                                  \
 227                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8))  \
 228                     : LCFUNC_utf8((U8*)locinput)))                                      \
 229                 {                                                                       \
 230                     sayNO;                                                              \
 231                 }                                                                       \
 232                 locinput += PL_utf8skip[nextchr];                                       \
 233                 nextchr = UCHARAT(locinput);                                            \
 234                 break;                                                                  \
 235             }                                                                           \
 236             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
 237                 sayNO;                                                                  \
 238             nextchr = UCHARAT(++locinput);                                              \
 239             break
 240
 241
 242
 243
 244
 245 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 246
 247 /* for use after a quantifier and before an EXACT-like node -- japhy */
 248 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
 249 #define JUMPABLE(rn) (      \
 250     OP(rn) == OPEN ||       \
 251     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 252     OP(rn) == EVAL ||   \
 253     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 254     OP(rn) == PLUS || OP(rn) == MINMOD || \
 255     OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
 256     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 257 )
 258 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 259
 260 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 261
 262 #if 0
 263 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 264    we don't need this definition. */
 265 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 266 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 267 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 268
 269 #else
 270 /* ... so we use this as its faster. */
 271 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 272 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 273 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 274
 275 #endif
 276
 277 /*
 278   Search for mandatory following text node; for lookahead, the text must
 279   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 280 */
 281 #define FIND_NEXT_IMPT(rn) STMT_START { \
 282     while (JUMPABLE(rn)) { \
 283         const OPCODE type = OP(rn); \
 284         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 285             rn = NEXTOPER(NEXTOPER(rn)); \
 286         else if (type == PLUS) \
 287             rn = NEXTOPER(rn); \
 288         else if (type == IFMATCH) \
 289             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 290         else rn += NEXT_OFF(rn); \
 291     } \
 292 } STMT_END
 293
 294
 295 static void restore_pos(pTHX_ void *arg);
 296
 297 #define REGCP_PAREN_ELEMS 4
 298 #define REGCP_OTHER_ELEMS 5
 299 #define REGCP_FRAME_ELEMS 1
 300 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 301  * are needed for the regexp context stack bookkeeping. */
 302
 303 STATIC CHECKPOINT
 304 S_regcppush(pTHX_ I32 parenfloor)
 305 {
 306     dVAR;
 307     const int retval = PL_savestack_ix;
 308     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 309     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 310     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 311     int p;
 312     GET_RE_DEBUG_FLAGS_DECL;
 313
 314     if (paren_elems_to_push < 0)
 315         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 316
 317     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 318         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 319                    " out of range (%d-%d)", total_elems, PL_regsize, parenfloor);
 320
 321     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 322
 323     for (p = PL_regsize; p > parenfloor; p--) {
 324 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 325         SSPUSHINT(PL_regoffs[p].end);
 326         SSPUSHINT(PL_regoffs[p].start);
 327         SSPUSHPTR(PL_reg_start_tmp[p]);
 328         SSPUSHINT(p);
 329         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 330           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 331                       (UV)p, (IV)PL_regoffs[p].start,
 332                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 333                       (IV)PL_regoffs[p].end
 334         ));
 335     }
 336 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 337     SSPUSHPTR(PL_regoffs);
 338     SSPUSHINT(PL_regsize);
 339     SSPUSHINT(*PL_reglastparen);
 340     SSPUSHINT(*PL_reglastcloseparen);
 341     SSPUSHPTR(PL_reginput);
 342     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 343
 344     return retval;
 345 }
 346
 347 /* These are needed since we do not localize EVAL nodes: */
 348 #define REGCP_SET(cp)                                           \
 349     DEBUG_STATE_r(                                              \
 350             PerlIO_printf(Perl_debug_log,                       \
 351                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 352                 (IV)PL_savestack_ix));                          \
 353     cp = PL_savestack_ix
 354
 355 #define REGCP_UNWIND(cp)                                        \
 356     DEBUG_STATE_r(                                              \
 357         if (cp != PL_savestack_ix)                              \
 358             PerlIO_printf(Perl_debug_log,                       \
 359                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 360                 (IV)(cp), (IV)PL_savestack_ix));                \
 361     regcpblow(cp)
 362
 363 STATIC char *
 364 S_regcppop(pTHX_ const regexp *rex)
 365 {
 366     dVAR;
 367     UV i;
 368     char *input;
 369     GET_RE_DEBUG_FLAGS_DECL;
 370
 371     PERL_ARGS_ASSERT_REGCPPOP;
 372
 373     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 374     i = SSPOPUV;
 375     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 376     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 377     input = (char *) SSPOPPTR;
 378     *PL_reglastcloseparen = SSPOPINT;
 379     *PL_reglastparen = SSPOPINT;
 380     PL_regsize = SSPOPINT;
 381     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 382
 383     i -= REGCP_OTHER_ELEMS;
 384     /* Now restore the parentheses context. */
 385     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 386         I32 tmps;
 387         U32 paren = (U32)SSPOPINT;
 388         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 389         PL_regoffs[paren].start = SSPOPINT;
 390         tmps = SSPOPINT;
 391         if (paren <= *PL_reglastparen)
 392             PL_regoffs[paren].end = tmps;
 393         DEBUG_BUFFERS_r(
 394             PerlIO_printf(Perl_debug_log,
 395                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 396                           (UV)paren, (IV)PL_regoffs[paren].start,
 397                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 398                           (IV)PL_regoffs[paren].end,
 399                           (paren > *PL_reglastparen ? "(no)" : ""));
 400         );
 401     }
 402     DEBUG_BUFFERS_r(
 403         if (*PL_reglastparen + 1 <= rex->nparens) {
 404             PerlIO_printf(Perl_debug_log,
 405                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 406                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 407         }
 408     );
 409 #if 1
 410     /* It would seem that the similar code in regtry()
 411      * already takes care of this, and in fact it is in
 412      * a better location to since this code can #if 0-ed out
 413      * but the code in regtry() is needed or otherwise tests
 414      * requiring null fields (pat.t#187 and split.t#{13,14}
 415      * (as of patchlevel 7877)  will fail.  Then again,
 416      * this code seems to be necessary or otherwise
 417      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 418      * --jhi updated by dapm */
 419     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 420         if (i > PL_regsize)
 421             PL_regoffs[i].start = -1;
 422         PL_regoffs[i].end = -1;
 423     }
 424 #endif
 425     return input;
 426 }
 427
 428 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 429
 430 /*
 431  * pregexec and friends
 432  */
 433
 434 #ifndef PERL_IN_XSUB_RE
 435 /*
 436  - pregexec - match a regexp against a string
 437  */
 438 I32
 439 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 440          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 441 /* strend: pointer to null at end of string */
 442 /* strbeg: real beginning of string */
 443 /* minend: end of match must be >=minend after stringarg. */
 444 /* nosave: For optimizations. */
 445 {
 446     PERL_ARGS_ASSERT_PREGEXEC;
 447
 448     return
 449         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 450                       nosave ? 0 : REXEC_COPY_STR);
 451 }
 452 #endif
 453
 454 /*
 455  * Need to implement the following flags for reg_anch:
 456  *
 457  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 458  * USE_INTUIT_ML
 459  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 460  * INTUIT_AUTORITATIVE_ML
 461  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 462  * INTUIT_ONCE_ML
 463  *
 464  * Another flag for this function: SECOND_TIME (so that float substrs
 465  * with giant delta may be not rechecked).
 466  */
 467
 468 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 469
 470 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 471    Otherwise, only SvCUR(sv) is used to get strbeg. */
 472
 473 /* XXXX We assume that strpos is strbeg unless sv. */
 474
 475 /* XXXX Some places assume that there is a fixed substring.
 476         An update may be needed if optimizer marks as "INTUITable"
 477         RExen without fixed substrings.  Similarly, it is assumed that
 478         lengths of all the strings are no more than minlen, thus they
 479         cannot come from lookahead.
 480         (Or minlen should take into account lookahead.)
 481   NOTE: Some of this comment is not correct. minlen does now take account
 482   of lookahead/behind. Further research is required. -- demerphq
 483
 484 */
 485
 486 /* A failure to find a constant substring means that there is no need to make
 487    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 488    finding a substring too deep into the string means that less calls to
 489    regtry() should be needed.
 490
 491    REx compiler's optimizer found 4 possible hints:
 492         a) Anchored substring;
 493         b) Fixed substring;
 494         c) Whether we are anchored (beginning-of-line or \G);
 495         d) First node (of those at offset 0) which may distingush positions;
 496    We use a)b)d) and multiline-part of c), and try to find a position in the
 497    string which does not contradict any of them.
 498  */
 499
 500 /* Most of decisions we do here should have been done at compile time.
 501    The nodes of the REx which we used for the search should have been
 502    deleted from the finite automaton. */
 503
 504 char *
 505 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 506                      char *strend, const U32 flags, re_scream_pos_data *data)
 507 {
 508     dVAR;
 509     struct regexp *const prog = (struct regexp *)SvANY(rx);
 510     register I32 start_shift = 0;
 511     /* Should be nonnegative! */
 512     register I32 end_shift   = 0;
 513     register char *s;
 514     register SV *check;
 515     char *strbeg;
 516     char *t;
 517     const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 518     I32 ml_anch;
 519     register char *other_last = NULL;   /* other substr checked before this */
 520     char *check_at = NULL;              /* check substr found at this pos */
 521     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 522     RXi_GET_DECL(prog,progi);
 523 #ifdef DEBUGGING
 524     const char * const i_strpos = strpos;
 525 #endif
 526     GET_RE_DEBUG_FLAGS_DECL;
 527
 528     PERL_ARGS_ASSERT_RE_INTUIT_START;
 529
 530     RX_MATCH_UTF8_set(rx,do_utf8);
 531
 532     if (RX_UTF8(rx)) {
 533         PL_reg_flags |= RF_utf8;
 534     }
 535     DEBUG_EXECUTE_r(
 536         debug_start_match(rx, do_utf8, strpos, strend,
 537             sv ? "Guessing start of match in sv for"
 538                : "Guessing start of match in string for");
 539               );
 540
 541     /* CHR_DIST() would be more correct here but it makes things slow. */
 542     if (prog->minlen > strend - strpos) {
 543         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 544                               "String too short... [re_intuit_start]\n"));
 545         goto fail;
 546     }
 547
 548     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 549     PL_regeol = strend;
 550     if (do_utf8) {
 551         if (!prog->check_utf8 && prog->check_substr)
 552             to_utf8_substr(prog);
 553         check = prog->check_utf8;
 554     } else {
 555         if (!prog->check_substr && prog->check_utf8)
 556             to_byte_substr(prog);
 557         check = prog->check_substr;
 558     }
 559     if (check == &PL_sv_undef) {
 560         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 561                 "Non-utf8 string cannot match utf8 check string\n"));
 562         goto fail;
 563     }
 564     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 565         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 566                      || ( (prog->extflags & RXf_ANCH_BOL)
 567                           && !multiline ) );    /* Check after \n? */
 568
 569         if (!ml_anch) {
 570           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 571                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 572                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 573                && sv && !SvROK(sv)
 574                && (strpos != strbeg)) {
 575               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 576               goto fail;
 577           }
 578           if (prog->check_offset_min == prog->check_offset_max &&
 579               !(prog->extflags & RXf_CANY_SEEN)) {
 580             /* Substring at constant offset from beg-of-str... */
 581             I32 slen;
 582
 583             s = HOP3c(strpos, prog->check_offset_min, strend);
 584
 585             if (SvTAIL(check)) {
 586                 slen = SvCUR(check);    /* >= 1 */
 587
 588                 if ( strend - s > slen || strend - s < slen - 1
 589                      || (strend - s == slen && strend[-1] != '\n')) {
 590                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 591                     goto fail_finish;
 592                 }
 593                 /* Now should match s[0..slen-2] */
 594                 slen--;
 595                 if (slen && (*SvPVX_const(check) != *s
 596                              || (slen > 1
 597                                  && memNE(SvPVX_const(check), s, slen)))) {
 598                   report_neq:
 599                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 600                     goto fail_finish;
 601                 }
 602             }
 603             else if (*SvPVX_const(check) != *s
 604                      || ((slen = SvCUR(check)) > 1
 605                          && memNE(SvPVX_const(check), s, slen)))
 606                 goto report_neq;
 607             check_at = s;
 608             goto success_at_start;
 609           }
 610         }
 611         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 612         s = strpos;
 613         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 614         end_shift = prog->check_end_shift;
 615
 616         if (!ml_anch) {
 617             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 618                                          - (SvTAIL(check) != 0);
 619             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 620
 621             if (end_shift < eshift)
 622                 end_shift = eshift;
 623         }
 624     }
 625     else {                              /* Can match at random position */
 626         ml_anch = 0;
 627         s = strpos;
 628         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 629         end_shift = prog->check_end_shift;
 630
 631         /* end shift should be non negative here */
 632     }
 633
 634 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 635     if (end_shift < 0)
 636         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 637                    (IV)end_shift, RX_PRECOMP(prog));
 638 #endif
 639
 640   restart:
 641     /* Find a possible match in the region s..strend by looking for
 642        the "check" substring in the region corrected by start/end_shift. */
 643
 644     {
 645         I32 srch_start_shift = start_shift;
 646         I32 srch_end_shift = end_shift;
 647         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 648             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 649             srch_start_shift = strbeg - s;
 650         }
 651     DEBUG_OPTIMISE_MORE_r({
 652         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 653             (IV)prog->check_offset_min,
 654             (IV)srch_start_shift,
 655             (IV)srch_end_shift,
 656             (IV)prog->check_end_shift);
 657     });
 658
 659     if (flags & REXEC_SCREAM) {
 660         I32 p = -1;                     /* Internal iterator of scream. */
 661         I32 * const pp = data ? data->scream_pos : &p;
 662
 663         if (PL_screamfirst[BmRARE(check)] >= 0
 664             || ( BmRARE(check) == '\n'
 665                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 666                  && SvTAIL(check) ))
 667             s = screaminstr(sv, check,
 668                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 669         else
 670             goto fail_finish;
 671         /* we may be pointing at the wrong string */
 672         if (s && RXp_MATCH_COPIED(prog))
 673             s = strbeg + (s - SvPVX_const(sv));
 674         if (data)
 675             *data->scream_olds = s;
 676     }
 677     else {
 678         U8* start_point;
 679         U8* end_point;
 680         if (prog->extflags & RXf_CANY_SEEN) {
 681             start_point= (U8*)(s + srch_start_shift);
 682             end_point= (U8*)(strend - srch_end_shift);
 683         } else {
 684             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 685             end_point= HOP3(strend, -srch_end_shift, strbeg);
 686         }
 687         DEBUG_OPTIMISE_MORE_r({
 688             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 689                 (int)(end_point - start_point),
 690                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 691                 start_point);
 692         });
 693
 694         s = fbm_instr( start_point, end_point,
 695                       check, multiline ? FBMrf_MULTILINE : 0);
 696     }
 697     }
 698     /* Update the count-of-usability, remove useless subpatterns,
 699         unshift s.  */
 700
 701     DEBUG_EXECUTE_r({
 702         RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 703             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 704         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 705                           (s ? "Found" : "Did not find"),
 706             (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
 707                 ? "anchored" : "floating"),
 708             quoted,
 709             RE_SV_TAIL(check),
 710             (s ? " at offset " : "...\n") );
 711     });
 712
 713     if (!s)
 714         goto fail_finish;
 715     /* Finish the diagnostic message */
 716     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 717
 718     /* XXX dmq: first branch is for positive lookbehind...
 719        Our check string is offset from the beginning of the pattern.
 720        So we need to do any stclass tests offset forward from that
 721        point. I think. :-(
 722      */
 723
 724
 725
 726     check_at=s;
 727
 728
 729     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 730        Start with the other substr.
 731        XXXX no SCREAM optimization yet - and a very coarse implementation
 732        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 733                 *always* match.  Probably should be marked during compile...
 734        Probably it is right to do no SCREAM here...
 735      */
 736
 737     if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
 738                 : (prog->float_substr && prog->anchored_substr))
 739     {
 740         /* Take into account the "other" substring. */
 741         /* XXXX May be hopelessly wrong for UTF... */
 742         if (!other_last)
 743             other_last = strpos;
 744         if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
 745           do_other_anchored:
 746             {
 747                 char * const last = HOP3c(s, -start_shift, strbeg);
 748                 char *last1, *last2;
 749                 char * const saved_s = s;
 750                 SV* must;
 751
 752                 t = s - prog->check_offset_max;
 753                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 754                     && (!do_utf8
 755                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 756                             && t > strpos)))
 757                     NOOP;
 758                 else
 759                     t = strpos;
 760                 t = HOP3c(t, prog->anchored_offset, strend);
 761                 if (t < other_last)     /* These positions already checked */
 762                     t = other_last;
 763                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 764                 if (last < last1)
 765                     last1 = last;
 766                 /* XXXX It is not documented what units *_offsets are in.
 767                    We assume bytes, but this is clearly wrong.
 768                    Meaning this code needs to be carefully reviewed for errors.
 769                    dmq.
 770                   */
 771
 772                 /* On end-of-str: see comment below. */
 773                 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
 774                 if (must == &PL_sv_undef) {
 775                     s = (char*)NULL;
 776                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 777                 }
 778                 else
 779                     s = fbm_instr(
 780                         (unsigned char*)t,
 781                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 782                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 783                         must,
 784                         multiline ? FBMrf_MULTILINE : 0
 785                     );
 786                 DEBUG_EXECUTE_r({
 787                     RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 788                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 789                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 790                         (s ? "Found" : "Contradicts"),
 791                         quoted, RE_SV_TAIL(must));
 792                 });
 793
 794
 795                 if (!s) {
 796                     if (last1 >= last2) {
 797                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 798                                                 ", giving up...\n"));
 799                         goto fail_finish;
 800                     }
 801                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 802                         ", trying floating at offset %ld...\n",
 803                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 804                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 805                     s = HOP3c(last, 1, strend);
 806                     goto restart;
 807                 }
 808                 else {
 809                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 810                           (long)(s - i_strpos)));
 811                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 812                     other_last = HOP3c(s, 1, strend);
 813                     s = saved_s;
 814                     if (t == strpos)
 815                         goto try_at_start;
 816                     goto try_at_offset;
 817                 }
 818             }
 819         }
 820         else {          /* Take into account the floating substring. */
 821             char *last, *last1;
 822             char * const saved_s = s;
 823             SV* must;
 824
 825             t = HOP3c(s, -start_shift, strbeg);
 826             last1 = last =
 827                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 828             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 829                 last = HOP3c(t, prog->float_max_offset, strend);
 830             s = HOP3c(t, prog->float_min_offset, strend);
 831             if (s < other_last)
 832                 s = other_last;
 833  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 834             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
 835             /* fbm_instr() takes into account exact value of end-of-str
 836                if the check is SvTAIL(ed).  Since false positives are OK,
 837                and end-of-str is not later than strend we are OK. */
 838             if (must == &PL_sv_undef) {
 839                 s = (char*)NULL;
 840                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 841             }
 842             else
 843                 s = fbm_instr((unsigned char*)s,
 844                               (unsigned char*)last + SvCUR(must)
 845                                   - (SvTAIL(must)!=0),
 846                               must, multiline ? FBMrf_MULTILINE : 0);
 847             DEBUG_EXECUTE_r({
 848                 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 849                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 850                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 851                     (s ? "Found" : "Contradicts"),
 852                     quoted, RE_SV_TAIL(must));
 853             });
 854             if (!s) {
 855                 if (last1 == last) {
 856                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 857                                             ", giving up...\n"));
 858                     goto fail_finish;
 859                 }
 860                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 861                     ", trying anchored starting at offset %ld...\n",
 862                     (long)(saved_s + 1 - i_strpos)));
 863                 other_last = last;
 864                 s = HOP3c(t, 1, strend);
 865                 goto restart;
 866             }
 867             else {
 868                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 869                       (long)(s - i_strpos)));
 870                 other_last = s; /* Fix this later. --Hugo */
 871                 s = saved_s;
 872                 if (t == strpos)
 873                     goto try_at_start;
 874                 goto try_at_offset;
 875             }
 876         }
 877     }
 878
 879
 880     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 881
 882     DEBUG_OPTIMISE_MORE_r(
 883         PerlIO_printf(Perl_debug_log,
 884             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 885             (IV)prog->check_offset_min,
 886             (IV)prog->check_offset_max,
 887             (IV)(s-strpos),
 888             (IV)(t-strpos),
 889             (IV)(t-s),
 890             (IV)(strend-strpos)
 891         )
 892     );
 893
 894     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 895         && (!do_utf8
 896             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 897                  && t > strpos)))
 898     {
 899         /* Fixed substring is found far enough so that the match
 900            cannot start at strpos. */
 901       try_at_offset:
 902         if (ml_anch && t[-1] != '\n') {
 903             /* Eventually fbm_*() should handle this, but often
 904                anchored_offset is not 0, so this check will not be wasted. */
 905             /* XXXX In the code below we prefer to look for "^" even in
 906                presence of anchored substrings.  And we search even
 907                beyond the found float position.  These pessimizations
 908                are historical artefacts only.  */
 909           find_anchor:
 910             while (t < strend - prog->minlen) {
 911                 if (*t == '\n') {
 912                     if (t < check_at - prog->check_offset_min) {
 913                         if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
 914                             /* Since we moved from the found position,
 915                                we definitely contradict the found anchored
 916                                substr.  Due to the above check we do not
 917                                contradict "check" substr.
 918                                Thus we can arrive here only if check substr
 919                                is float.  Redo checking for "other"=="fixed".
 920                              */
 921                             strpos = t + 1;
 922                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 923                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 924                             goto do_other_anchored;
 925                         }
 926                         /* We don't contradict the found floating substring. */
 927                         /* XXXX Why not check for STCLASS? */
 928                         s = t + 1;
 929                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 930                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 931                         goto set_useful;
 932                     }
 933                     /* Position contradicts check-string */
 934                     /* XXXX probably better to look for check-string
 935                        than for "\n", so one should lower the limit for t? */
 936                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 937                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 938                     other_last = strpos = s = t + 1;
 939                     goto restart;
 940                 }
 941                 t++;
 942             }
 943             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 944                         PL_colors[0], PL_colors[1]));
 945             goto fail_finish;
 946         }
 947         else {
 948             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 949                         PL_colors[0], PL_colors[1]));
 950         }
 951         s = t;
 952       set_useful:
 953         ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr);    /* hooray/5 */
 954     }
 955     else {
 956         /* The found string does not prohibit matching at strpos,
 957            - no optimization of calling REx engine can be performed,
 958            unless it was an MBOL and we are not after MBOL,
 959            or a future STCLASS check will fail this. */
 960       try_at_start:
 961         /* Even in this situation we may use MBOL flag if strpos is offset
 962            wrt the start of the string. */
 963         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 964             && (strpos != strbeg) && strpos[-1] != '\n'
 965             /* May be due to an implicit anchor of m{.*foo}  */
 966             && !(prog->intflags & PREGf_IMPLICIT))
 967         {
 968             t = strpos;
 969             goto find_anchor;
 970         }
 971         DEBUG_EXECUTE_r( if (ml_anch)
 972             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
 973                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
 974         );
 975       success_at_start:
 976         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
 977             && (do_utf8 ? (
 978                 prog->check_utf8                /* Could be deleted already */
 979                 && --BmUSEFUL(prog->check_utf8) < 0
 980                 && (prog->check_utf8 == prog->float_utf8)
 981             ) : (
 982                 prog->check_substr              /* Could be deleted already */
 983                 && --BmUSEFUL(prog->check_substr) < 0
 984                 && (prog->check_substr == prog->float_substr)
 985             )))
 986         {
 987             /* If flags & SOMETHING - do not do it many times on the same match */
 988             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 989             /* XXX Does the destruction order has to change with do_utf8? */
 990             SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
 991             SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
 992             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
 993             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
 994             check = NULL;                       /* abort */
 995             s = strpos;
 996             /* XXXX This is a remnant of the old implementation.  It
 997                     looks wasteful, since now INTUIT can use many
 998                     other heuristics. */
 999             prog->extflags &= ~RXf_USE_INTUIT;
1000         }
1001         else
1002             s = strpos;
1003     }
1004
1005     /* Last resort... */
1006     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1007     /* trie stclasses are too expensive to use here, we are better off to
1008        leave it to regmatch itself */
1009     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1010         /* minlen == 0 is possible if regstclass is \b or \B,
1011            and the fixed substr is ''$.
1012            Since minlen is already taken into account, s+1 is before strend;
1013            accidentally, minlen >= 1 guaranties no false positives at s + 1
1014            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1015            regstclass does not come from lookahead...  */
1016         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1017            This leaves EXACTF only, which is dealt with in find_byclass().  */
1018         const U8* const str = (U8*)STRING(progi->regstclass);
1019         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1020                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1021                     : 1);
1022         char * endpos;
1023         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1024             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1025         else if (prog->float_substr || prog->float_utf8)
1026             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1027         else
1028             endpos= strend;
1029
1030         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1031                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1032
1033         t = s;
1034         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1035         if (!s) {
1036 #ifdef DEBUGGING
1037             const char *what = NULL;
1038 #endif
1039             if (endpos == strend) {
1040                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1041                                 "Could not match STCLASS...\n") );
1042                 goto fail;
1043             }
1044             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1045                                    "This position contradicts STCLASS...\n") );
1046             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1047                 goto fail;
1048             /* Contradict one of substrings */
1049             if (prog->anchored_substr || prog->anchored_utf8) {
1050                 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1051                     DEBUG_EXECUTE_r( what = "anchored" );
1052                   hop_and_restart:
1053                     s = HOP3c(t, 1, strend);
1054                     if (s + start_shift + end_shift > strend) {
1055                         /* XXXX Should be taken into account earlier? */
1056                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1057                                                "Could not match STCLASS...\n") );
1058                         goto fail;
1059                     }
1060                     if (!check)
1061                         goto giveup;
1062                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1063                                 "Looking for %s substr starting at offset %ld...\n",
1064                                  what, (long)(s + start_shift - i_strpos)) );
1065                     goto restart;
1066                 }
1067                 /* Have both, check_string is floating */
1068                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1069                     goto retry_floating_check;
1070                 /* Recheck anchored substring, but not floating... */
1071                 s = check_at;
1072                 if (!check)
1073                     goto giveup;
1074                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1075                           "Looking for anchored substr starting at offset %ld...\n",
1076                           (long)(other_last - i_strpos)) );
1077                 goto do_other_anchored;
1078             }
1079             /* Another way we could have checked stclass at the
1080                current position only: */
1081             if (ml_anch) {
1082                 s = t = t + 1;
1083                 if (!check)
1084                     goto giveup;
1085                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1086                           "Looking for /%s^%s/m starting at offset %ld...\n",
1087                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1088                 goto try_at_offset;
1089             }
1090             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))     /* Could have been deleted */
1091                 goto fail;
1092             /* Check is floating subtring. */
1093           retry_floating_check:
1094             t = check_at - start_shift;
1095             DEBUG_EXECUTE_r( what = "floating" );
1096             goto hop_and_restart;
1097         }
1098         if (t != s) {
1099             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1100                         "By STCLASS: moving %ld --> %ld\n",
1101                                   (long)(t - i_strpos), (long)(s - i_strpos))
1102                    );
1103         }
1104         else {
1105             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1106                                   "Does not contradict STCLASS...\n");
1107                    );
1108         }
1109     }
1110   giveup:
1111     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1112                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1113                           PL_colors[5], (long)(s - i_strpos)) );
1114     return s;
1115
1116   fail_finish:                          /* Substring not found */
1117     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1118         BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1119   fail:
1120     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1121                           PL_colors[4], PL_colors[5]));
1122     return NULL;
1123 }
1124
1125 #define DECL_TRIE_TYPE(scan) \
1126     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1127                     trie_type = (scan->flags != EXACT) \
1128                               ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1129                               : (do_utf8 ? trie_utf8 : trie_plain)
1130
1131 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1132 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1133     switch (trie_type) {                                                    \
1134     case trie_utf8_fold:                                                    \
1135         if ( foldlen>0 ) {                                                  \
1136             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1137             foldlen -= len;                                                 \
1138             uscan += len;                                                   \
1139             len=0;                                                          \
1140         } else {                                                            \
1141             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1142             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1143             foldlen -= UNISKIP( uvc );                                      \
1144             uscan = foldbuf + UNISKIP( uvc );                               \
1145         }                                                                   \
1146         break;                                                              \
1147     case trie_latin_utf8_fold:                                              \
1148         if ( foldlen>0 ) {                                                  \
1149             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1150             foldlen -= len;                                                 \
1151             uscan += len;                                                   \
1152             len=0;                                                          \
1153         } else {                                                            \
1154             len = 1;                                                        \
1155             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1156             foldlen -= UNISKIP( uvc );                                      \
1157             uscan = foldbuf + UNISKIP( uvc );                               \
1158         }                                                                   \
1159         break;                                                              \
1160     case trie_utf8:                                                         \
1161         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1162         break;                                                              \
1163     case trie_plain:                                                        \
1164         uvc = (UV)*uc;                                                      \
1165         len = 1;                                                            \
1166     }                                                                       \
1167     if (uvc < 256) {                                                        \
1168         charid = trie->charmap[ uvc ];                                      \
1169     }                                                                       \
1170     else {                                                                  \
1171         charid = 0;                                                         \
1172         if (widecharmap) {                                                  \
1173             SV** const svpp = hv_fetch(widecharmap,                         \
1174                         (char*)&uvc, sizeof(UV), 0);                        \
1175             if (svpp)                                                       \
1176                 charid = (U16)SvIV(*svpp);                                  \
1177         }                                                                   \
1178     }                                                                       \
1179 } STMT_END
1180
1181 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1182 {                                                      \
1183     char *my_strend= (char *)strend;                   \
1184     if ( (CoNd)                                        \
1185          && (ln == len ||                              \
1186              !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
1187                         m, NULL, ln, cBOOL(UTF)))      \
1188          && (!reginfo || regtry(reginfo, &s)) )        \
1189         goto got_it;                                   \
1190     else {                                             \
1191          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1192          uvchr_to_utf8(tmpbuf, c);                     \
1193          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1194          if ( f != c                                   \
1195               && (f == c1 || f == c2)                  \
1196               && (ln == len ||                         \
1197                 !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
1198                               m, NULL, ln, cBOOL(UTF)))\
1199               && (!reginfo || regtry(reginfo, &s)) )   \
1200               goto got_it;                             \
1201     }                                                  \
1202 }                                                      \
1203 s += len
1204
1205 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1206 STMT_START {                                              \
1207     while (s <= e) {                                      \
1208         if ( (CoNd)                                       \
1209              && (ln == 1 || !(OP(c) == EXACTF             \
1210                               ? ibcmp(s, m, ln)           \
1211                               : ibcmp_locale(s, m, ln)))  \
1212              && (!reginfo || regtry(reginfo, &s)) )        \
1213             goto got_it;                                  \
1214         s++;                                              \
1215     }                                                     \
1216 } STMT_END
1217
1218 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1219 STMT_START {                                          \
1220     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1221         CoDe                                          \
1222         s += uskip;                                   \
1223     }                                                 \
1224 } STMT_END
1225
1226 #define REXEC_FBC_SCAN(CoDe)                          \
1227 STMT_START {                                          \
1228     while (s < strend) {                              \
1229         CoDe                                          \
1230         s++;                                          \
1231     }                                                 \
1232 } STMT_END
1233
1234 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1235 REXEC_FBC_UTF8_SCAN(                                  \
1236     if (CoNd) {                                       \
1237         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1238             goto got_it;                              \
1239         else                                          \
1240             tmp = doevery;                            \
1241     }                                                 \
1242     else                                              \
1243         tmp = 1;                                      \
1244 )
1245
1246 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1247 REXEC_FBC_SCAN(                                       \
1248     if (CoNd) {                                       \
1249         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1250             goto got_it;                              \
1251         else                                          \
1252             tmp = doevery;                            \
1253     }                                                 \
1254     else                                              \
1255         tmp = 1;                                      \
1256 )
1257
1258 #define REXEC_FBC_TRYIT               \
1259 if ((!reginfo || regtry(reginfo, &s))) \
1260     goto got_it
1261
1262 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1263     if (do_utf8) {                                             \
1264         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1265     }                                                          \
1266     else {                                                     \
1267         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1268     }                                                          \
1269     break
1270
1271 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1272     if (do_utf8) {                                             \
1273         UtFpReLoAd;                                            \
1274         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1275     }                                                          \
1276     else {                                                     \
1277         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1278     }                                                          \
1279     break
1280
1281 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1282     PL_reg_flags |= RF_tainted;                                \
1283     if (do_utf8) {                                             \
1284         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1285     }                                                          \
1286     else {                                                     \
1287         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1288     }                                                          \
1289     break
1290
1291 #define DUMP_EXEC_POS(li,s,doutf8) \
1292     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1293
1294 /* We know what class REx starts with.  Try to find this position... */
1295 /* if reginfo is NULL, its a dryrun */
1296 /* annoyingly all the vars in this routine have different names from their counterparts
1297    in regmatch. /grrr */
1298
1299 STATIC char *
1300 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1301     const char *strend, regmatch_info *reginfo)
1302 {
1303         dVAR;
1304         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1305         char *m;
1306         STRLEN ln;
1307         STRLEN lnc;
1308         register STRLEN uskip;
1309         unsigned int c1;
1310         unsigned int c2;
1311         char *e;
1312         register I32 tmp = 1;   /* Scratch variable? */
1313         register const bool do_utf8 = PL_reg_match_utf8;
1314         RXi_GET_DECL(prog,progi);
1315
1316         PERL_ARGS_ASSERT_FIND_BYCLASS;
1317
1318         /* We know what class it must start with. */
1319         switch (OP(c)) {
1320         case ANYOF:
1321             if (do_utf8) {
1322                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1323                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1324                           reginclass(prog, c, (U8*)s, 0, do_utf8) :
1325                           REGINCLASS(prog, c, (U8*)s));
1326             }
1327             else {
1328                  while (s < strend) {
1329                       STRLEN skip = 1;
1330
1331                       if (REGINCLASS(prog, c, (U8*)s) ||
1332                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1333                            /* The assignment of 2 is intentional:
1334                             * for the folded sharp s, the skip is 2. */
1335                            (skip = SHARP_S_SKIP))) {
1336                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1337                                 goto got_it;
1338                            else
1339                                 tmp = doevery;
1340                       }
1341                       else
1342                            tmp = 1;
1343                       s += skip;
1344                  }
1345             }
1346             break;
1347         case CANY:
1348             REXEC_FBC_SCAN(
1349                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1350                     goto got_it;
1351                 else
1352                     tmp = doevery;
1353             );
1354             break;
1355         case EXACTF:
1356             m   = STRING(c);
1357             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1358             lnc = (I32) ln;     /* length to match in characters */
1359             if (UTF) {
1360                 STRLEN ulen1, ulen2;
1361                 U8 *sm = (U8 *) m;
1362                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1363                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1364                 /* used by commented-out code below */
1365                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1366
1367                 /* XXX: Since the node will be case folded at compile
1368                    time this logic is a little odd, although im not
1369                    sure that its actually wrong. --dmq */
1370
1371                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1372                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1373
1374                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1375                    codepoint of the first character in the converted
1376                    form, yet originally we did the extra step.
1377                    No tests fail by commenting this code out however
1378                    so Ive left it out. -- dmq.
1379
1380                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1381                                     0, uniflags);
1382                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1383                                     0, uniflags);
1384                 */
1385
1386                 lnc = 0;
1387                 while (sm < ((U8 *) m + ln)) {
1388                     lnc++;
1389                     sm += UTF8SKIP(sm);
1390                 }
1391             }
1392             else {
1393                 c1 = *(U8*)m;
1394                 c2 = PL_fold[c1];
1395             }
1396             goto do_exactf;
1397         case EXACTFL:
1398             m   = STRING(c);
1399             ln  = STR_LEN(c);
1400             lnc = (I32) ln;
1401             c1 = *(U8*)m;
1402             c2 = PL_fold_locale[c1];
1403           do_exactf:
1404             e = HOP3c(strend, -((I32)lnc), s);
1405
1406             if (!reginfo && e < s)
1407                 e = s;                  /* Due to minlen logic of intuit() */
1408
1409             /* The idea in the EXACTF* cases is to first find the
1410              * first character of the EXACTF* node and then, if
1411              * necessary, case-insensitively compare the full
1412              * text of the node.  The c1 and c2 are the first
1413              * characters (though in Unicode it gets a bit
1414              * more complicated because there are more cases
1415              * than just upper and lower: one needs to use
1416              * the so-called folding case for case-insensitive
1417              * matching (called "loose matching" in Unicode).
1418              * ibcmp_utf8() will do just that. */
1419
1420             if (do_utf8 || UTF) {
1421                 UV c, f;
1422                 U8 tmpbuf [UTF8_MAXBYTES+1];
1423                 STRLEN len = 1;
1424                 STRLEN foldlen;
1425                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1426                 if (c1 == c2) {
1427                     /* Upper and lower of 1st char are equal -
1428                      * probably not a "letter". */
1429                     while (s <= e) {
1430                         if (do_utf8) {
1431                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1432                                            uniflags);
1433                         } else {
1434                             c = *((U8*)s);
1435                         }
1436                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1437                     }
1438                 }
1439                 else {
1440                     while (s <= e) {
1441                         if (do_utf8) {
1442                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1443                                            uniflags);
1444                         } else {
1445                             c = *((U8*)s);
1446                         }
1447
1448                         /* Handle some of the three Greek sigmas cases.
1449                          * Note that not all the possible combinations
1450                          * are handled here: some of them are handled
1451                          * by the standard folding rules, and some of
1452                          * them (the character class or ANYOF cases)
1453                          * are handled during compiletime in
1454                          * regexec.c:S_regclass(). */
1455                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1456                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1457                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1458
1459                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1460                     }
1461                 }
1462             }
1463             else {
1464                 /* Neither pattern nor string are UTF8 */
1465                 if (c1 == c2)
1466                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1467                 else
1468                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1469             }
1470             break;
1471         case BOUNDL:
1472             PL_reg_flags |= RF_tainted;
1473             /* FALL THROUGH */
1474         case BOUND:
1475             if (do_utf8) {
1476                 if (s == PL_bostr)
1477                     tmp = '\n';
1478                 else {
1479                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1480                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1481                 }
1482                 tmp = ((OP(c) == BOUND ?
1483                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1484                 LOAD_UTF8_CHARCLASS_ALNUM();
1485                 REXEC_FBC_UTF8_SCAN(
1486                     if (tmp == !(OP(c) == BOUND ?
1487                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) :
1488                                  isALNUM_LC_utf8((U8*)s)))
1489                     {
1490                         tmp = !tmp;
1491                         REXEC_FBC_TRYIT;
1492                 }
1493                 );
1494             }
1495             else {
1496                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1497                 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1498                 REXEC_FBC_SCAN(
1499                     if (tmp ==
1500                         !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1501                         tmp = !tmp;
1502                         REXEC_FBC_TRYIT;
1503                 }
1504                 );
1505             }
1506             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1507                 goto got_it;
1508             break;
1509         case NBOUNDL:
1510             PL_reg_flags |= RF_tainted;
1511             /* FALL THROUGH */
1512         case NBOUND:
1513             if (do_utf8) {
1514                 if (s == PL_bostr)
1515                     tmp = '\n';
1516                 else {
1517                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1518                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1519                 }
1520                 tmp = ((OP(c) == NBOUND ?
1521                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1522                 LOAD_UTF8_CHARCLASS_ALNUM();
1523                 REXEC_FBC_UTF8_SCAN(
1524                     if (tmp == !(OP(c) == NBOUND ?
1525                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) :
1526                                  isALNUM_LC_utf8((U8*)s)))
1527                         tmp = !tmp;
1528                     else REXEC_FBC_TRYIT;
1529                 );
1530             }
1531             else {
1532                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1533                 tmp = ((OP(c) == NBOUND ?
1534                         isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1535                 REXEC_FBC_SCAN(
1536                     if (tmp ==
1537                         !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1538                         tmp = !tmp;
1539                     else REXEC_FBC_TRYIT;
1540                 );
1541             }
1542             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1543                 goto got_it;
1544             break;
1545         case ALNUM:
1546             REXEC_FBC_CSCAN_PRELOAD(
1547                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1548                 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1549                 isALNUM(*s)
1550             );
1551         case ALNUML:
1552             REXEC_FBC_CSCAN_TAINT(
1553                 isALNUM_LC_utf8((U8*)s),
1554                 isALNUM_LC(*s)
1555             );
1556         case NALNUM:
1557             REXEC_FBC_CSCAN_PRELOAD(
1558                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1559                 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1560                 !isALNUM(*s)
1561             );
1562         case NALNUML:
1563             REXEC_FBC_CSCAN_TAINT(
1564                 !isALNUM_LC_utf8((U8*)s),
1565                 !isALNUM_LC(*s)
1566             );
1567         case SPACE:
1568             REXEC_FBC_CSCAN_PRELOAD(
1569                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1570                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1571                 isSPACE(*s)
1572             );
1573         case SPACEL:
1574             REXEC_FBC_CSCAN_TAINT(
1575                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1576                 isSPACE_LC(*s)
1577             );
1578         case NSPACE:
1579             REXEC_FBC_CSCAN_PRELOAD(
1580                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1581                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1582                 !isSPACE(*s)
1583             );
1584         case NSPACEL:
1585             REXEC_FBC_CSCAN_TAINT(
1586                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1587                 !isSPACE_LC(*s)
1588             );
1589         case DIGIT:
1590             REXEC_FBC_CSCAN_PRELOAD(
1591                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1592                 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1593                 isDIGIT(*s)
1594             );
1595         case DIGITL:
1596             REXEC_FBC_CSCAN_TAINT(
1597                 isDIGIT_LC_utf8((U8*)s),
1598                 isDIGIT_LC(*s)
1599             );
1600         case NDIGIT:
1601             REXEC_FBC_CSCAN_PRELOAD(
1602                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1603                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1604                 !isDIGIT(*s)
1605             );
1606         case NDIGITL:
1607             REXEC_FBC_CSCAN_TAINT(
1608                 !isDIGIT_LC_utf8((U8*)s),
1609                 !isDIGIT_LC(*s)
1610             );
1611         case LNBREAK:
1612             REXEC_FBC_CSCAN(
1613                 is_LNBREAK_utf8(s),
1614                 is_LNBREAK_latin1(s)
1615             );
1616         case VERTWS:
1617             REXEC_FBC_CSCAN(
1618                 is_VERTWS_utf8(s),
1619                 is_VERTWS_latin1(s)
1620             );
1621         case NVERTWS:
1622             REXEC_FBC_CSCAN(
1623                 !is_VERTWS_utf8(s),
1624                 !is_VERTWS_latin1(s)
1625             );
1626         case HORIZWS:
1627             REXEC_FBC_CSCAN(
1628                 is_HORIZWS_utf8(s),
1629                 is_HORIZWS_latin1(s)
1630             );
1631         case NHORIZWS:
1632             REXEC_FBC_CSCAN(
1633                 !is_HORIZWS_utf8(s),
1634                 !is_HORIZWS_latin1(s)
1635             );
1636         case AHOCORASICKC:
1637         case AHOCORASICK:
1638             {
1639                 DECL_TRIE_TYPE(c);
1640                 /* what trie are we using right now */
1641                 reg_ac_data *aho
1642                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1643                 reg_trie_data *trie
1644                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1645                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1646
1647                 const char *last_start = strend - trie->minlen;
1648 #ifdef DEBUGGING
1649                 const char *real_start = s;
1650 #endif
1651                 STRLEN maxlen = trie->maxlen;
1652                 SV *sv_points;
1653                 U8 **points; /* map of where we were in the input string
1654                                 when reading a given char. For ASCII this
1655                                 is unnecessary overhead as the relationship
1656                                 is always 1:1, but for Unicode, especially
1657                                 case folded Unicode this is not true. */
1658                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1659                 U8 *bitmap=NULL;
1660
1661
1662                 GET_RE_DEBUG_FLAGS_DECL;
1663
1664                 /* We can't just allocate points here. We need to wrap it in
1665                  * an SV so it gets freed properly if there is a croak while
1666                  * running the match */
1667                 ENTER;
1668                 SAVETMPS;
1669                 sv_points=newSV(maxlen * sizeof(U8 *));
1670                 SvCUR_set(sv_points,
1671                     maxlen * sizeof(U8 *));
1672                 SvPOK_on(sv_points);
1673                 sv_2mortal(sv_points);
1674                 points=(U8**)SvPV_nolen(sv_points );
1675                 if ( trie_type != trie_utf8_fold
1676                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1677                 {
1678                     if (trie->bitmap)
1679                         bitmap=(U8*)trie->bitmap;
1680                     else
1681                         bitmap=(U8*)ANYOF_BITMAP(c);
1682                 }
1683                 /* this is the Aho-Corasick algorithm modified a touch
1684                    to include special handling for long "unknown char"
1685                    sequences. The basic idea being that we use AC as long
1686                    as we are dealing with a possible matching char, when
1687                    we encounter an unknown char (and we have not encountered
1688                    an accepting state) we scan forward until we find a legal
1689                    starting char.
1690                    AC matching is basically that of trie matching, except
1691                    that when we encounter a failing transition, we fall back
1692                    to the current states "fail state", and try the current char
1693                    again, a process we repeat until we reach the root state,
1694                    state 1, or a legal transition. If we fail on the root state
1695                    then we can either terminate if we have reached an accepting
1696                    state previously, or restart the entire process from the beginning
1697                    if we have not.
1698
1699                  */
1700                 while (s <= last_start) {
1701                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1702                     U8 *uc = (U8*)s;
1703                     U16 charid = 0;
1704                     U32 base = 1;
1705                     U32 state = 1;
1706                     UV uvc = 0;
1707                     STRLEN len = 0;
1708                     STRLEN foldlen = 0;
1709                     U8 *uscan = (U8*)NULL;
1710                     U8 *leftmost = NULL;
1711 #ifdef DEBUGGING
1712                     U32 accepted_word= 0;
1713 #endif
1714                     U32 pointpos = 0;
1715
1716                     while ( state && uc <= (U8*)strend ) {
1717                         int failed=0;
1718                         U32 word = aho->states[ state ].wordnum;
1719
1720                         if( state==1 ) {
1721                             if ( bitmap ) {
1722                                 DEBUG_TRIE_EXECUTE_r(
1723                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1724                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1725                                             (char *)uc, do_utf8 );
1726                                         PerlIO_printf( Perl_debug_log,
1727                                             " Scanning for legal start char...\n");
1728                                     }
1729                                 );
1730                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1731                                     uc++;
1732                                 }
1733                                 s= (char *)uc;
1734                             }
1735                             if (uc >(U8*)last_start) break;
1736                         }
1737
1738                         if ( word ) {
1739                             U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1740                             if (!leftmost || lpos < leftmost) {
1741                                 DEBUG_r(accepted_word=word);
1742                                 leftmost= lpos;
1743                             }
1744                             if (base==0) break;
1745
1746                         }
1747                         points[pointpos++ % maxlen]= uc;
1748                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1749                                              uscan, len, uvc, charid, foldlen,
1750                                              foldbuf, uniflags);
1751                         DEBUG_TRIE_EXECUTE_r({
1752                             dump_exec_pos( (char *)uc, c, strend, real_start,
1753                                 s,   do_utf8 );
1754                             PerlIO_printf(Perl_debug_log,
1755                                 " Charid:%3u CP:%4"UVxf" ",
1756                                  charid, uvc);
1757                         });
1758
1759                         do {
1760 #ifdef DEBUGGING
1761                             word = aho->states[ state ].wordnum;
1762 #endif
1763                             base = aho->states[ state ].trans.base;
1764
1765                             DEBUG_TRIE_EXECUTE_r({
1766                                 if (failed)
1767                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1768                                         s,   do_utf8 );
1769                                 PerlIO_printf( Perl_debug_log,
1770                                     "%sState: %4"UVxf", word=%"UVxf,
1771                                     failed ? " Fail transition to " : "",
1772                                     (UV)state, (UV)word);
1773                             });
1774                             if ( base ) {
1775                                 U32 tmp;
1776                                 if (charid &&
1777                                      (base + charid > trie->uniquecharcount )
1778                                      && (base + charid - 1 - trie->uniquecharcount
1779                                             < trie->lasttrans)
1780                                      && trie->trans[base + charid - 1 -
1781                                             trie->uniquecharcount].check == state
1782                                      && (tmp=trie->trans[base + charid - 1 -
1783                                         trie->uniquecharcount ].next))
1784                                 {
1785                                     DEBUG_TRIE_EXECUTE_r(
1786                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1787                                     state = tmp;
1788                                     break;
1789                                 }
1790                                 else {
1791                                     DEBUG_TRIE_EXECUTE_r(
1792                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1793                                     failed = 1;
1794                                     state = aho->fail[state];
1795                                 }
1796                             }
1797                             else {
1798                                 /* we must be accepting here */
1799                                 DEBUG_TRIE_EXECUTE_r(
1800                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1801                                 failed = 1;
1802                                 break;
1803                             }
1804                         } while(state);
1805                         uc += len;
1806                         if (failed) {
1807                             if (leftmost)
1808                                 break;
1809                             if (!state) state = 1;
1810                         }
1811                     }
1812                     if ( aho->states[ state ].wordnum ) {
1813                         U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1814                         if (!leftmost || lpos < leftmost) {
1815                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1816                             leftmost = lpos;
1817                         }
1818                     }
1819                     if (leftmost) {
1820                         s = (char*)leftmost;
1821                         DEBUG_TRIE_EXECUTE_r({
1822                             PerlIO_printf(
1823                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1824                                 (UV)accepted_word, (IV)(s - real_start)
1825                             );
1826                         });
1827                         if (!reginfo || regtry(reginfo, &s)) {
1828                             FREETMPS;
1829                             LEAVE;
1830                             goto got_it;
1831                         }
1832                         s = HOPc(s,1);
1833                         DEBUG_TRIE_EXECUTE_r({
1834                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1835                         });
1836                     } else {
1837                         DEBUG_TRIE_EXECUTE_r(
1838                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1839                         break;
1840                     }
1841                 }
1842                 FREETMPS;
1843                 LEAVE;
1844             }
1845             break;
1846         default:
1847             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1848             break;
1849         }
1850         return 0;
1851       got_it:
1852         return s;
1853 }
1854
1855
1856 /*
1857  - regexec_flags - match a regexp against a string
1858  */
1859 I32
1860 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1861               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1862 /* strend: pointer to null at end of string */
1863 /* strbeg: real beginning of string */
1864 /* minend: end of match must be >=minend after stringarg. */
1865 /* data: May be used for some additional optimizations.
1866          Currently its only used, with a U32 cast, for transmitting
1867          the ganch offset when doing a /g match. This will change */
1868 /* nosave: For optimizations. */
1869 {
1870     dVAR;
1871     struct regexp *const prog = (struct regexp *)SvANY(rx);
1872     /*register*/ char *s;
1873     register regnode *c;
1874     /*register*/ char *startpos = stringarg;
1875     I32 minlen;         /* must match at least this many chars */
1876     I32 dontbother = 0; /* how many characters not to try at end */
1877     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1878     I32 scream_pos = -1;                /* Internal iterator of scream. */
1879     char *scream_olds = NULL;
1880     const bool do_utf8 = cBOOL(DO_UTF8(sv));
1881     I32 multiline;
1882     RXi_GET_DECL(prog,progi);
1883     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1884     regexp_paren_pair *swap = NULL;
1885     GET_RE_DEBUG_FLAGS_DECL;
1886
1887     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1888     PERL_UNUSED_ARG(data);
1889
1890     /* Be paranoid... */
1891     if (prog == NULL || startpos == NULL) {
1892         Perl_croak(aTHX_ "NULL regexp parameter");
1893         return 0;
1894     }
1895
1896     multiline = prog->extflags & RXf_PMf_MULTILINE;
1897     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1898
1899     RX_MATCH_UTF8_set(rx, do_utf8);
1900     DEBUG_EXECUTE_r(
1901         debug_start_match(rx, do_utf8, startpos, strend,
1902         "Matching");
1903     );
1904
1905     minlen = prog->minlen;
1906
1907     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1908         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1909                               "String too short [regexec_flags]...\n"));
1910         goto phooey;
1911     }
1912
1913
1914     /* Check validity of program. */
1915     if (UCHARAT(progi->program) != REG_MAGIC) {
1916         Perl_croak(aTHX_ "corrupted regexp program");
1917     }
1918
1919     PL_reg_flags = 0;
1920     PL_reg_eval_set = 0;
1921     PL_reg_maxiter = 0;
1922
1923     if (RX_UTF8(rx))
1924         PL_reg_flags |= RF_utf8;
1925
1926     /* Mark beginning of line for ^ and lookbehind. */
1927     reginfo.bol = startpos; /* XXX not used ??? */
1928     PL_bostr  = strbeg;
1929     reginfo.sv = sv;
1930
1931     /* Mark end of line for $ (and such) */
1932     PL_regeol = strend;
1933
1934     /* see how far we have to get to not match where we matched before */
1935     reginfo.till = startpos+minend;
1936
1937     /* If there is a "must appear" string, look for it. */
1938     s = startpos;
1939
1940     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1941         MAGIC *mg;
1942         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
1943             reginfo.ganch = startpos + prog->gofs;
1944             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1945               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1946         } else if (sv && SvTYPE(sv) >= SVt_PVMG
1947                   && SvMAGIC(sv)
1948                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1949                   && mg->mg_len >= 0) {
1950             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
1951             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1952                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1953
1954             if (prog->extflags & RXf_ANCH_GPOS) {
1955                 if (s > reginfo.ganch)
1956                     goto phooey;
1957                 s = reginfo.ganch - prog->gofs;
1958                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1959                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1960                 if (s < strbeg)
1961                     goto phooey;
1962             }
1963         }
1964         else if (data) {
1965             reginfo.ganch = strbeg + PTR2UV(data);
1966             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1967                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1968
1969         } else {                                /* pos() not defined */
1970             reginfo.ganch = strbeg;
1971             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1972                  "GPOS: reginfo.ganch = strbeg\n"));
1973         }
1974     }
1975     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1976         /* We have to be careful. If the previous successful match
1977            was from this regex we don't want a subsequent partially
1978            successful match to clobber the old results.
1979            So when we detect this possibility we add a swap buffer
1980            to the re, and switch the buffer each match. If we fail
1981            we switch it back, otherwise we leave it swapped.
1982         */
1983         swap = prog->offs;
1984         /* do we need a save destructor here for eval dies? */
1985         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1986     }
1987     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1988         re_scream_pos_data d;
1989
1990         d.scream_olds = &scream_olds;
1991         d.scream_pos = &scream_pos;
1992         s = re_intuit_start(rx, sv, s, strend, flags, &d);
1993         if (!s) {
1994             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1995             goto phooey;        /* not present */
1996         }
1997     }
1998
1999
2000
2001     /* Simplest case:  anchored match need be tried only once. */
2002     /*  [unless only anchor is BOL and multiline is set] */
2003     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2004         if (s == startpos && regtry(&reginfo, &startpos))
2005             goto got_it;
2006         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2007                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2008         {
2009             char *end;
2010
2011             if (minlen)
2012                 dontbother = minlen - 1;
2013             end = HOP3c(strend, -dontbother, strbeg) - 1;
2014             /* for multiline we only have to try after newlines */
2015             if (prog->check_substr || prog->check_utf8) {
2016                 if (s == startpos)
2017                     goto after_try;
2018                 while (1) {
2019                     if (regtry(&reginfo, &s))
2020                         goto got_it;
2021                   after_try:
2022                     if (s > end)
2023                         goto phooey;
2024                     if (prog->extflags & RXf_USE_INTUIT) {
2025                         s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2026                         if (!s)
2027                             goto phooey;
2028                     }
2029                     else
2030                         s++;
2031                 }
2032             } else {
2033                 if (s > startpos)
2034                     s--;
2035                 while (s < end) {
2036                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2037                         if (regtry(&reginfo, &s))
2038                             goto got_it;
2039                     }
2040                 }
2041             }
2042         }
2043         goto phooey;
2044     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2045     {
2046         /* the warning about reginfo.ganch being used without intialization
2047            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2048            and we only enter this block when the same bit is set. */
2049         char *tmp_s = reginfo.ganch - prog->gofs;
2050
2051         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2052             goto got_it;
2053         goto phooey;
2054     }
2055
2056     /* Messy cases:  unanchored match. */
2057     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2058         /* we have /x+whatever/ */
2059         /* it must be a one character string (XXXX Except UTF?) */
2060         char ch;
2061 #ifdef DEBUGGING
2062         int did_match = 0;
2063 #endif
2064         if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2065             do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2066         ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2067
2068         if (do_utf8) {
2069             REXEC_FBC_SCAN(
2070                 if (*s == ch) {
2071                     DEBUG_EXECUTE_r( did_match = 1 );
2072                     if (regtry(&reginfo, &s)) goto got_it;
2073                     s += UTF8SKIP(s);
2074                     while (s < strend && *s == ch)
2075                         s += UTF8SKIP(s);
2076                 }
2077             );
2078         }
2079         else {
2080             REXEC_FBC_SCAN(
2081                 if (*s == ch) {
2082                     DEBUG_EXECUTE_r( did_match = 1 );
2083                     if (regtry(&reginfo, &s)) goto got_it;
2084                     s++;
2085                     while (s < strend && *s == ch)
2086                         s++;
2087                 }
2088             );
2089         }
2090         DEBUG_EXECUTE_r(if (!did_match)
2091                 PerlIO_printf(Perl_debug_log,
2092                                   "Did not find anchored character...\n")
2093                );
2094     }
2095     else if (prog->anchored_substr != NULL
2096               || prog->anchored_utf8 != NULL
2097               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2098                   && prog->float_max_offset < strend - s)) {
2099         SV *must;
2100         I32 back_max;
2101         I32 back_min;
2102         char *last;
2103         char *last1;            /* Last position checked before */
2104 #ifdef DEBUGGING
2105         int did_match = 0;
2106 #endif
2107         if (prog->anchored_substr || prog->anchored_utf8) {
2108             if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2109                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2110             must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2111             back_max = back_min = prog->anchored_offset;
2112         } else {
2113             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2114                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2115             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2116             back_max = prog->float_max_offset;
2117             back_min = prog->float_min_offset;
2118         }
2119
2120
2121         if (must == &PL_sv_undef)
2122             /* could not downgrade utf8 check substring, so must fail */
2123             goto phooey;
2124
2125         if (back_min<0) {
2126             last = strend;
2127         } else {
2128             last = HOP3c(strend,        /* Cannot start after this */
2129                   -(I32)(CHR_SVLEN(must)
2130                          - (SvTAIL(must) != 0) + back_min), strbeg);
2131         }
2132         if (s > PL_bostr)
2133             last1 = HOPc(s, -1);
2134         else
2135             last1 = s - 1;      /* bogus */
2136
2137         /* XXXX check_substr already used to find "s", can optimize if
2138            check_substr==must. */
2139         scream_pos = -1;
2140         dontbother = end_shift;
2141         strend = HOPc(strend, -dontbother);
2142         while ( (s <= last) &&
2143                 ((flags & REXEC_SCREAM)
2144                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2145                                     end_shift, &scream_pos, 0))
2146                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2147                                   (unsigned char*)strend, must,
2148                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2149             /* we may be pointing at the wrong string */
2150             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2151                 s = strbeg + (s - SvPVX_const(sv));
2152             DEBUG_EXECUTE_r( did_match = 1 );
2153             if (HOPc(s, -back_max) > last1) {
2154                 last1 = HOPc(s, -back_min);
2155                 s = HOPc(s, -back_max);
2156             }
2157             else {
2158                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2159
2160                 last1 = HOPc(s, -back_min);
2161                 s = t;
2162             }
2163             if (do_utf8) {
2164                 while (s <= last1) {
2165                     if (regtry(&reginfo, &s))
2166                         goto got_it;
2167                     s += UTF8SKIP(s);
2168                 }
2169             }
2170             else {
2171                 while (s <= last1) {
2172                     if (regtry(&reginfo, &s))
2173                         goto got_it;
2174                     s++;
2175                 }
2176             }
2177         }
2178         DEBUG_EXECUTE_r(if (!did_match) {
2179             RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2180                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2181             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2182                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2183                                ? "anchored" : "floating"),
2184                 quoted, RE_SV_TAIL(must));
2185         });
2186         goto phooey;
2187     }
2188     else if ( (c = progi->regstclass) ) {
2189         if (minlen) {
2190             const OPCODE op = OP(progi->regstclass);
2191             /* don't bother with what can't match */
2192             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2193                 strend = HOPc(strend, -(minlen - 1));
2194         }
2195         DEBUG_EXECUTE_r({
2196             SV * const prop = sv_newmortal();
2197             regprop(prog, prop, c);
2198             {
2199                 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2200                     s,strend-s,60);
2201                 PerlIO_printf(Perl_debug_log,
2202                     "Matching stclass %.*s against %s (%d chars)\n",
2203                     (int)SvCUR(prop), SvPVX_const(prop),
2204                      quoted, (int)(strend - s));
2205             }
2206         });
2207         if (find_byclass(prog, c, s, strend, &reginfo))
2208             goto got_it;
2209         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2210     }
2211     else {
2212         dontbother = 0;
2213         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2214             /* Trim the end. */
2215             char *last;
2216             SV* float_real;
2217
2218             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2219                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2220             float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2221
2222             if (flags & REXEC_SCREAM) {
2223                 last = screaminstr(sv, float_real, s - strbeg,
2224                                    end_shift, &scream_pos, 1); /* last one */
2225                 if (!last)
2226                     last = scream_olds; /* Only one occurrence. */
2227                 /* we may be pointing at the wrong string */
2228                 else if (RXp_MATCH_COPIED(prog))
2229                     s = strbeg + (s - SvPVX_const(sv));
2230             }
2231             else {
2232                 STRLEN len;
2233                 const char * const little = SvPV_const(float_real, len);
2234
2235                 if (SvTAIL(float_real)) {
2236                     if (memEQ(strend - len + 1, little, len - 1))
2237                         last = strend - len + 1;
2238                     else if (!multiline)
2239                         last = memEQ(strend - len, little, len)
2240                             ? strend - len : NULL;
2241                     else
2242                         goto find_last;
2243                 } else {
2244                   find_last:
2245                     if (len)
2246                         last = rninstr(s, strend, little, little + len);
2247                     else
2248                         last = strend;  /* matching "$" */
2249                 }
2250             }
2251             if (last == NULL) {
2252                 DEBUG_EXECUTE_r(
2253                     PerlIO_printf(Perl_debug_log,
2254                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2255                         PL_colors[4], PL_colors[5]));
2256                 goto phooey; /* Should not happen! */
2257             }
2258             dontbother = strend - last + prog->float_min_offset;
2259         }
2260         if (minlen && (dontbother < minlen))
2261             dontbother = minlen - 1;
2262         strend -= dontbother;              /* this one's always in bytes! */
2263         /* We don't know much -- general case. */
2264         if (do_utf8) {
2265             for (;;) {
2266                 if (regtry(&reginfo, &s))
2267                     goto got_it;
2268                 if (s >= strend)
2269                     break;
2270                 s += UTF8SKIP(s);
2271             };
2272         }
2273         else {
2274             do {
2275                 if (regtry(&reginfo, &s))
2276                     goto got_it;
2277             } while (s++ < strend);
2278         }
2279     }
2280
2281     /* Failure. */
2282     goto phooey;
2283
2284 got_it:
2285     Safefree(swap);
2286     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2287
2288     if (PL_reg_eval_set)
2289         restore_pos(aTHX_ prog);
2290     if (RXp_PAREN_NAMES(prog))
2291         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2292
2293     /* make sure $`, $&, $', and $digit will work later */
2294     if ( !(flags & REXEC_NOT_FIRST) ) {
2295         RX_MATCH_COPY_FREE(rx);
2296         if (flags & REXEC_COPY_STR) {
2297             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2298 #ifdef PERL_OLD_COPY_ON_WRITE
2299             if ((SvIsCOW(sv)
2300                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2301                 if (DEBUG_C_TEST) {
2302                     PerlIO_printf(Perl_debug_log,
2303                                   "Copy on write: regexp capture, type %d\n",
2304                                   (int) SvTYPE(sv));
2305                 }
2306                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2307                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2308                 assert (SvPOKp(prog->saved_copy));
2309             } else
2310 #endif
2311             {
2312                 RX_MATCH_COPIED_on(rx);
2313                 s = savepvn(strbeg, i);
2314                 prog->subbeg = s;
2315             }
2316             prog->sublen = i;
2317         }
2318         else {
2319             prog->subbeg = strbeg;
2320             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2321         }
2322     }
2323
2324     return 1;
2325
2326 phooey:
2327     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2328                           PL_colors[4], PL_colors[5]));
2329     if (PL_reg_eval_set)
2330         restore_pos(aTHX_ prog);
2331     if (swap) {
2332         /* we failed :-( roll it back */
2333         Safefree(prog->offs);
2334         prog->offs = swap;
2335     }
2336
2337     return 0;
2338 }
2339
2340
2341 /*
2342  - regtry - try match at specific point
2343  */
2344 STATIC I32                      /* 0 failure, 1 success */
2345 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2346 {
2347     dVAR;
2348     CHECKPOINT lastcp;
2349     REGEXP *const rx = reginfo->prog;
2350     regexp *const prog = (struct regexp *)SvANY(rx);
2351     RXi_GET_DECL(prog,progi);
2352     GET_RE_DEBUG_FLAGS_DECL;
2353
2354     PERL_ARGS_ASSERT_REGTRY;
2355
2356     reginfo->cutpoint=NULL;
2357
2358     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2359         MAGIC *mg;
2360
2361         PL_reg_eval_set = RS_init;
2362         DEBUG_EXECUTE_r(DEBUG_s(
2363             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2364                           (IV)(PL_stack_sp - PL_stack_base));
2365             ));
2366         SAVESTACK_CXPOS();
2367         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2368         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2369         SAVETMPS;
2370         /* Apparently this is not needed, judging by wantarray. */
2371         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2372            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2373
2374         if (reginfo->sv) {
2375             /* Make $_ available to executed code. */
2376             if (reginfo->sv != DEFSV) {
2377                 SAVE_DEFSV;
2378                 DEFSV_set(reginfo->sv);
2379             }
2380
2381             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2382                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2383                 /* prepare for quick setting of pos */
2384 #ifdef PERL_OLD_COPY_ON_WRITE
2385                 if (SvIsCOW(reginfo->sv))
2386                     sv_force_normal_flags(reginfo->sv, 0);
2387 #endif
2388                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2389                                  &PL_vtbl_mglob, NULL, 0);
2390                 mg->mg_len = -1;
2391             }
2392             PL_reg_magic    = mg;
2393             PL_reg_oldpos   = mg->mg_len;
2394             SAVEDESTRUCTOR_X(restore_pos, prog);
2395         }
2396         if (!PL_reg_curpm) {
2397             Newxz(PL_reg_curpm, 1, PMOP);
2398 #ifdef USE_ITHREADS
2399             {
2400                 SV* const repointer = &PL_sv_undef;
2401                 /* this regexp is also owned by the new PL_reg_curpm, which
2402                    will try to free it.  */
2403                 av_push(PL_regex_padav, repointer);
2404                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2405                 PL_regex_pad = AvARRAY(PL_regex_padav);
2406             }
2407 #endif
2408         }
2409 #ifdef USE_ITHREADS
2410         /* It seems that non-ithreads works both with and without this code.
2411            So for efficiency reasons it seems best not to have the code
2412            compiled when it is not needed.  */
2413         /* This is safe against NULLs: */
2414         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2415         /* PM_reg_curpm owns a reference to this regexp.  */
2416         ReREFCNT_inc(rx);
2417 #endif
2418         PM_SETRE(PL_reg_curpm, rx);
2419         PL_reg_oldcurpm = PL_curpm;
2420         PL_curpm = PL_reg_curpm;
2421         if (RXp_MATCH_COPIED(prog)) {
2422             /*  Here is a serious problem: we cannot rewrite subbeg,
2423                 since it may be needed if this match fails.  Thus
2424                 $` inside (?{}) could fail... */
2425             PL_reg_oldsaved = prog->subbeg;
2426             PL_reg_oldsavedlen = prog->sublen;
2427 #ifdef PERL_OLD_COPY_ON_WRITE
2428             PL_nrs = prog->saved_copy;
2429 #endif
2430             RXp_MATCH_COPIED_off(prog);
2431         }
2432         else
2433             PL_reg_oldsaved = NULL;
2434         prog->subbeg = PL_bostr;
2435         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2436     }
2437     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2438     prog->offs[0].start = *startpos - PL_bostr;
2439     PL_reginput = *startpos;
2440     PL_reglastparen = &prog->lastparen;
2441     PL_reglastcloseparen = &prog->lastcloseparen;
2442     prog->lastparen = 0;
2443     prog->lastcloseparen = 0;
2444     PL_regsize = 0;
2445     PL_regoffs = prog->offs;
2446     if (PL_reg_start_tmpl <= prog->nparens) {
2447         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2448         if(PL_reg_start_tmp)
2449             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2450         else
2451             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2452     }
2453
2454     /* XXXX What this code is doing here?!!!  There should be no need
2455        to do this again and again, PL_reglastparen should take care of
2456        this!  --ilya*/
2457
2458     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2459      * Actually, the code in regcppop() (which Ilya may be meaning by
2460      * PL_reglastparen), is not needed at all by the test suite
2461      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2462      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2463      * Meanwhile, this code *is* needed for the
2464      * above-mentioned test suite tests to succeed.  The common theme
2465      * on those tests seems to be returning null fields from matches.
2466      * --jhi updated by dapm */
2467 #if 1
2468     if (prog->nparens) {
2469         regexp_paren_pair *pp = PL_regoffs;
2470         register I32 i;
2471         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2472             ++pp;
2473             pp->start = -1;
2474             pp->end = -1;
2475         }
2476     }
2477 #endif
2478     REGCP_SET(lastcp);
2479     if (regmatch(reginfo, progi->program + 1)) {
2480         PL_regoffs[0].end = PL_reginput - PL_bostr;
2481         return 1;
2482     }
2483     if (reginfo->cutpoint)
2484         *startpos= reginfo->cutpoint;
2485     REGCP_UNWIND(lastcp);
2486     return 0;
2487 }
2488
2489
2490 #define sayYES goto yes
2491 #define sayNO goto no
2492 #define sayNO_SILENT goto no_silent
2493
2494 /* we dont use STMT_START/END here because it leads to
2495    "unreachable code" warnings, which are bogus, but distracting. */
2496 #define CACHEsayNO \
2497     if (ST.cache_mask) \
2498        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2499     sayNO
2500
2501 /* this is used to determine how far from the left messages like
2502    'failed...' are printed. It should be set such that messages
2503    are inline with the regop output that created them.
2504 */
2505 #define REPORT_CODE_OFF 32
2506
2507
2508 /* Make sure there is a test for this +1 options in re_tests */
2509 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2510
2511 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2512 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2513
2514 #define SLAB_FIRST(s) (&(s)->states[0])
2515 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2516
2517 /* grab a new slab and return the first slot in it */
2518
2519 STATIC regmatch_state *
2520 S_push_slab(pTHX)
2521 {
2522 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2523     dMY_CXT;
2524 #endif
2525     regmatch_slab *s = PL_regmatch_slab->next;
2526     if (!s) {
2527         Newx(s, 1, regmatch_slab);
2528         s->prev = PL_regmatch_slab;
2529         s->next = NULL;
2530         PL_regmatch_slab->next = s;
2531     }
2532     PL_regmatch_slab = s;
2533     return SLAB_FIRST(s);
2534 }
2535
2536
2537 /* push a new state then goto it */
2538
2539 #define PUSH_STATE_GOTO(state, node) \
2540     scan = node; \
2541     st->resume_state = state; \
2542     goto push_state;
2543
2544 /* push a new state with success backtracking, then goto it */
2545
2546 #define PUSH_YES_STATE_GOTO(state, node) \
2547     scan = node; \
2548     st->resume_state = state; \
2549     goto push_yes_state;
2550
2551
2552
2553 /*
2554
2555 regmatch() - main matching routine
2556
2557 This is basically one big switch statement in a loop. We execute an op,
2558 set 'next' to point the next op, and continue. If we come to a point which
2559 we may need to backtrack to on failure such as (A|B|C), we push a
2560 backtrack state onto the backtrack stack. On failure, we pop the top
2561 state, and re-enter the loop at the state indicated. If there are no more
2562 states to pop, we return failure.
2563
2564 Sometimes we also need to backtrack on success; for example /A+/, where
2565 after successfully matching one A, we need to go back and try to
2566 match another one; similarly for lookahead assertions: if the assertion
2567 completes successfully, we backtrack to the state just before the assertion
2568 and then carry on.  In these cases, the pushed state is marked as
2569 'backtrack on success too'. This marking is in fact done by a chain of
2570 pointers, each pointing to the previous 'yes' state. On success, we pop to
2571 the nearest yes state, discarding any intermediate failure-only states.
2572 Sometimes a yes state is pushed just to force some cleanup code to be
2573 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2574 it to free the inner regex.
2575
2576 Note that failure backtracking rewinds the cursor position, while
2577 success backtracking leaves it alone.
2578
2579 A pattern is complete when the END op is executed, while a subpattern
2580 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2581 ops trigger the "pop to last yes state if any, otherwise return true"
2582 behaviour.
2583
2584 A common convention in this function is to use A and B to refer to the two
2585 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2586 the subpattern to be matched possibly multiple times, while B is the entire
2587 rest of the pattern. Variable and state names reflect this convention.
2588
2589 The states in the main switch are the union of ops and failure/success of
2590 substates associated with with that op.  For example, IFMATCH is the op
2591 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2592 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2593 successfully matched A and IFMATCH_A_fail is a state saying that we have
2594 just failed to match A. Resume states always come in pairs. The backtrack
2595 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2596 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2597 on success or failure.
2598
2599 The struct that holds a backtracking state is actually a big union, with
2600 one variant for each major type of op. The variable st points to the
2601 top-most backtrack struct. To make the code clearer, within each
2602 block of code we #define ST to alias the relevant union.
2603
2604 Here's a concrete example of a (vastly oversimplified) IFMATCH
2605 implementation:
2606
2607     switch (state) {
2608     ....
2609
2610 #define ST st->u.ifmatch
2611
2612     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2613         ST.foo = ...; // some state we wish to save
2614         ...
2615         // push a yes backtrack state with a resume value of
2616         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2617         // first node of A:
2618         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2619         // NOTREACHED
2620
2621     case IFMATCH_A: // we have successfully executed A; now continue with B
2622         next = B;
2623         bar = ST.foo; // do something with the preserved value
2624         break;
2625
2626     case IFMATCH_A_fail: // A failed, so the assertion failed
2627         ...;   // do some housekeeping, then ...
2628         sayNO; // propagate the failure
2629
2630 #undef ST
2631
2632     ...
2633     }
2634
2635 For any old-timers reading this who are familiar with the old recursive
2636 approach, the code above is equivalent to:
2637
2638     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2639     {
2640         int foo = ...
2641         ...
2642         if (regmatch(A)) {
2643             next = B;
2644             bar = foo;
2645             break;
2646         }
2647         ...;   // do some housekeeping, then ...
2648         sayNO; // propagate the failure
2649     }
2650
2651 The topmost backtrack state, pointed to by st, is usually free. If you
2652 want to claim it, populate any ST.foo fields in it with values you wish to
2653 save, then do one of
2654
2655         PUSH_STATE_GOTO(resume_state, node);
2656         PUSH_YES_STATE_GOTO(resume_state, node);
2657
2658 which sets that backtrack state's resume value to 'resume_state', pushes a
2659 new free entry to the top of the backtrack stack, then goes to 'node'.
2660 On backtracking, the free slot is popped, and the saved state becomes the
2661 new free state. An ST.foo field in this new top state can be temporarily
2662 accessed to retrieve values, but once the main loop is re-entered, it
2663 becomes available for reuse.
2664
2665 Note that the depth of the backtrack stack constantly increases during the
2666 left-to-right execution of the pattern, rather than going up and down with
2667 the pattern nesting. For example the stack is at its maximum at Z at the
2668 end of the pattern, rather than at X in the following:
2669
2670     /(((X)+)+)+....(Y)+....Z/
2671
2672 The only exceptions to this are lookahead/behind assertions and the cut,
2673 (?>A), which pop all the backtrack states associated with A before
2674 continuing.
2675
2676 Bascktrack state structs are allocated in slabs of about 4K in size.
2677 PL_regmatch_state and st always point to the currently active state,
2678 and PL_regmatch_slab points to the slab currently containing
2679 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2680 allocated, and is never freed until interpreter destruction. When the slab
2681 is full, a new one is allocated and chained to the end. At exit from
2682 regmatch(), slabs allocated since entry are freed.
2683
2684 */
2685
2686
2687 #define DEBUG_STATE_pp(pp)                                  \
2688     DEBUG_STATE_r({                                         \
2689         DUMP_EXEC_POS(locinput, scan, do_utf8);             \
2690         PerlIO_printf(Perl_debug_log,                       \
2691             "    %*s"pp" %s%s%s%s%s\n",                     \
2692             depth*2, "",                                    \
2693             PL_reg_name[st->resume_state],                     \
2694             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2695             ((st==yes_state) ? "Y" : ""),                   \
2696             ((st==mark_state) ? "M" : ""),                  \
2697             ((st==yes_state||st==mark_state) ? "]" : "")    \
2698         );                                                  \
2699     });
2700
2701
2702 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2703
2704 #ifdef DEBUGGING
2705
2706 STATIC void
2707 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2708     const char *start, const char *end, const char *blurb)
2709 {
2710     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2711
2712     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2713
2714     if (!PL_colorset)
2715             reginitcolors();
2716     {
2717         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2718             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2719
2720         RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2721             start, end - start, 60);
2722
2723         PerlIO_printf(Perl_debug_log,
2724             "%s%s REx%s %s against %s\n",
2725                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2726
2727         if (do_utf8||utf8_pat)
2728             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2729                 utf8_pat ? "pattern" : "",
2730                 utf8_pat && do_utf8 ? " and " : "",
2731                 do_utf8 ? "string" : ""
2732             );
2733     }
2734 }
2735
2736 STATIC void
2737 S_dump_exec_pos(pTHX_ const char *locinput,
2738                       const regnode *scan,
2739                       const char *loc_regeol,
2740                       const char *loc_bostr,
2741                       const char *loc_reg_starttry,
2742                       const bool do_utf8)
2743 {
2744     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2745     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2746     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2747     /* The part of the string before starttry has one color
2748        (pref0_len chars), between starttry and current
2749        position another one (pref_len - pref0_len chars),
2750        after the current position the third one.
2751        We assume that pref0_len <= pref_len, otherwise we
2752        decrease pref0_len.  */
2753     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2754         ? (5 + taill) - l : locinput - loc_bostr;
2755     int pref0_len;
2756
2757     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2758
2759     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2760         pref_len++;
2761     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2762     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2763         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2764               ? (5 + taill) - pref_len : loc_regeol - locinput);
2765     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2766         l--;
2767     if (pref0_len < 0)
2768         pref0_len = 0;
2769     if (pref0_len > pref_len)
2770         pref0_len = pref_len;
2771     {
2772         const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2773
2774         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2775             (locinput - pref_len),pref0_len, 60, 4, 5);
2776
2777         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2778                     (locinput - pref_len + pref0_len),
2779                     pref_len - pref0_len, 60, 2, 3);
2780
2781         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2782                     locinput, loc_regeol - locinput, 10, 0, 1);
2783
2784         const STRLEN tlen=len0+len1+len2;
2785         PerlIO_printf(Perl_debug_log,
2786                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2787                     (IV)(locinput - loc_bostr),
2788                     len0, s0,
2789                     len1, s1,
2790                     (docolor ? "" : "> <"),
2791                     len2, s2,
2792                     (int)(tlen > 19 ? 0 :  19 - tlen),
2793                     "");
2794     }
2795 }
2796
2797 #endif
2798
2799 /* reg_check_named_buff_matched()
2800  * Checks to see if a named buffer has matched. The data array of
2801  * buffer numbers corresponding to the buffer is expected to reside
2802  * in the regexp->data->data array in the slot stored in the ARG() of
2803  * node involved. Note that this routine doesn't actually care about the
2804  * name, that information is not preserved from compilation to execution.
2805  * Returns the index of the leftmost defined buffer with the given name
2806  * or 0 if non of the buffers matched.
2807  */
2808 STATIC I32
2809 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2810 {
2811     I32 n;
2812     RXi_GET_DECL(rex,rexi);
2813     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2814     I32 *nums=(I32*)SvPVX(sv_dat);
2815
2816     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2817
2818     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2819         if ((I32)*PL_reglastparen >= nums[n] &&
2820             PL_regoffs[nums[n]].end != -1)
2821         {
2822             return nums[n];
2823         }
2824     }
2825     return 0;
2826 }
2827
2828
2829 /* free all slabs above current one  - called during LEAVE_SCOPE */
2830
2831 STATIC void
2832 S_clear_backtrack_stack(pTHX_ void *p)
2833 {
2834     regmatch_slab *s = PL_regmatch_slab->next;
2835     PERL_UNUSED_ARG(p);
2836
2837     if (!s)
2838         return;
2839     PL_regmatch_slab->next = NULL;
2840     while (s) {
2841         regmatch_slab * const osl = s;
2842         s = s->next;
2843         Safefree(osl);
2844     }
2845 }
2846
2847
2848 #define SETREX(Re1,Re2) \
2849     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2850     Re1 = (Re2)
2851
2852 STATIC I32                      /* 0 failure, 1 success */
2853 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2854 {
2855 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2856     dMY_CXT;
2857 #endif
2858     dVAR;
2859     register const bool do_utf8 = PL_reg_match_utf8;
2860     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2861     REGEXP *rex_sv = reginfo->prog;
2862     regexp *rex = (struct regexp *)SvANY(rex_sv);
2863     RXi_GET_DECL(rex,rexi);
2864     I32 oldsave;
2865     /* the current state. This is a cached copy of PL_regmatch_state */
2866     register regmatch_state *st;
2867     /* cache heavy used fields of st in registers */
2868     register regnode *scan;
2869     register regnode *next;
2870     register U32 n = 0; /* general value; init to avoid compiler warning */
2871     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2872     register char *locinput = PL_reginput;
2873     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2874
2875     bool result = 0;        /* return value of S_regmatch */
2876     int depth = 0;          /* depth of backtrack stack */
2877     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2878     const U32 max_nochange_depth =
2879         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2880         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2881     regmatch_state *yes_state = NULL; /* state to pop to on success of
2882                                                             subpattern */
2883     /* mark_state piggy backs on the yes_state logic so that when we unwind
2884        the stack on success we can update the mark_state as we go */
2885     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2886     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2887     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2888     U32 state_num;
2889     bool no_final = 0;      /* prevent failure from backtracking? */
2890     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2891     char *startpoint = PL_reginput;
2892     SV *popmark = NULL;     /* are we looking for a mark? */
2893     SV *sv_commit = NULL;   /* last mark name seen in failure */
2894     SV *sv_yes_mark = NULL; /* last mark name we have seen
2895                                during a successfull match */
2896     U32 lastopen = 0;       /* last open we saw */
2897     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2898     SV* const oreplsv = GvSV(PL_replgv);
2899     /* these three flags are set by various ops to signal information to
2900      * the very next op. They have a useful lifetime of exactly one loop
2901      * iteration, and are not preserved or restored by state pushes/pops
2902      */
2903     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2904     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2905     int logical = 0;        /* the following EVAL is:
2906                                 0: (?{...})
2907                                 1: (?(?{...})X|Y)
2908                                 2: (??{...})
2909                                or the following IFMATCH/UNLESSM is:
2910                                 false: plain (?=foo)
2911                                 true:  used as a condition: (?(?=foo))
2912                             */
2913 #ifdef DEBUGGING
2914     GET_RE_DEBUG_FLAGS_DECL;
2915 #endif
2916
2917     PERL_ARGS_ASSERT_REGMATCH;
2918
2919     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2920             PerlIO_printf(Perl_debug_log,"regmatch start\n");
2921     }));
2922     /* on first ever call to regmatch, allocate first slab */
2923     if (!PL_regmatch_slab) {
2924         Newx(PL_regmatch_slab, 1, regmatch_slab);
2925         PL_regmatch_slab->prev = NULL;
2926         PL_regmatch_slab->next = NULL;
2927         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2928     }
2929
2930     oldsave = PL_savestack_ix;
2931     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2932     SAVEVPTR(PL_regmatch_slab);
2933     SAVEVPTR(PL_regmatch_state);
2934
2935     /* grab next free state slot */
2936     st = ++PL_regmatch_state;
2937     if (st >  SLAB_LAST(PL_regmatch_slab))
2938         st = PL_regmatch_state = S_push_slab(aTHX);
2939
2940     /* Note that nextchr is a byte even in UTF */
2941     nextchr = UCHARAT(locinput);
2942     scan = prog;
2943     while (scan != NULL) {
2944
2945         DEBUG_EXECUTE_r( {
2946             SV * const prop = sv_newmortal();
2947             regnode *rnext=regnext(scan);
2948             DUMP_EXEC_POS( locinput, scan, do_utf8 );
2949             regprop(rex, prop, scan);
2950
2951             PerlIO_printf(Perl_debug_log,
2952                     "%3"IVdf":%*s%s(%"IVdf")\n",
2953                     (IV)(scan - rexi->program), depth*2, "",
2954                     SvPVX_const(prop),
2955                     (PL_regkind[OP(scan)] == END || !rnext) ?
2956                         0 : (IV)(rnext - rexi->program));
2957         });
2958
2959         next = scan + NEXT_OFF(scan);
2960         if (next == scan)
2961             next = NULL;
2962         state_num = OP(scan);
2963
2964       reenter_switch:
2965
2966         assert(PL_reglastparen == &rex->lastparen);
2967         assert(PL_reglastcloseparen == &rex->lastcloseparen);
2968         assert(PL_regoffs == rex->offs);
2969
2970         switch (state_num) {
2971         case BOL:
2972             if (locinput == PL_bostr)
2973             {
2974                 /* reginfo->till = reginfo->bol; */
2975                 break;
2976             }
2977             sayNO;
2978         case MBOL:
2979             if (locinput == PL_bostr ||
2980                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2981             {
2982                 break;
2983             }
2984             sayNO;
2985         case SBOL:
2986             if (locinput == PL_bostr)
2987                 break;
2988             sayNO;
2989         case GPOS:
2990             if (locinput == reginfo->ganch)
2991                 break;
2992             sayNO;
2993
2994         case KEEPS:
2995             /* update the startpoint */
2996             st->u.keeper.val = PL_regoffs[0].start;
2997             PL_reginput = locinput;
2998             PL_regoffs[0].start = locinput - PL_bostr;
2999             PUSH_STATE_GOTO(KEEPS_next, next);
3000             /*NOT-REACHED*/
3001         case KEEPS_next_fail:
3002             /* rollback the start point change */
3003             PL_regoffs[0].start = st->u.keeper.val;
3004             sayNO_SILENT;
3005             /*NOT-REACHED*/
3006         case EOL:
3007                 goto seol;
3008         case MEOL:
3009             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3010                 sayNO;
3011             break;
3012         case SEOL:
3013           seol:
3014             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3015                 sayNO;
3016             if (PL_regeol - locinput > 1)
3017                 sayNO;
3018             break;
3019         case EOS:
3020             if (PL_regeol != locinput)
3021                 sayNO;
3022             break;
3023         case SANY:
3024             if (!nextchr && locinput >= PL_regeol)
3025                 sayNO;
3026             if (do_utf8) {
3027                 locinput += PL_utf8skip[nextchr];
3028                 if (locinput > PL_regeol)
3029                     sayNO;
3030                 nextchr = UCHARAT(locinput);
3031             }
3032             else
3033                 nextchr = UCHARAT(++locinput);
3034             break;
3035         case CANY:
3036             if (!nextchr && locinput >= PL_regeol)
3037                 sayNO;
3038             nextchr = UCHARAT(++locinput);
3039             break;
3040         case REG_ANY:
3041             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3042                 sayNO;
3043             if (do_utf8) {
3044                 locinput += PL_utf8skip[nextchr];
3045                 if (locinput > PL_regeol)
3046                     sayNO;
3047                 nextchr = UCHARAT(locinput);
3048             }
3049             else
3050                 nextchr = UCHARAT(++locinput);
3051             break;
3052
3053 #undef  ST
3054 #define ST st->u.trie
3055         case TRIEC:
3056             /* In this case the charclass data is available inline so
3057                we can fail fast without a lot of extra overhead.
3058              */
3059             if (scan->flags == EXACT || !do_utf8) {
3060                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3061                     DEBUG_EXECUTE_r(
3062                         PerlIO_printf(Perl_debug_log,
3063                                   "%*s  %sfailed to match trie start class...%s\n",
3064                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3065                     );
3066                     sayNO_SILENT;
3067                     /* NOTREACHED */
3068                 }
3069             }
3070             /* FALL THROUGH */
3071         case TRIE:
3072             {
3073                 /* what type of TRIE am I? (utf8 makes this contextual) */
3074                 DECL_TRIE_TYPE(scan);
3075
3076                 /* what trie are we using right now */
3077                 reg_trie_data * const trie
3078                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3079                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3080                 U32 state = trie->startstate;
3081
3082                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3083                     !TRIE_BITMAP_TEST(trie,*locinput)
3084                 ) {
3085                     if (trie->states[ state ].wordnum) {
3086                          DEBUG_EXECUTE_r(
3087                             PerlIO_printf(Perl_debug_log,
3088                                           "%*s  %smatched empty string...%s\n",
3089                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3090                         );
3091                         break;
3092                     } else {
3093                         DEBUG_EXECUTE_r(
3094                             PerlIO_printf(Perl_debug_log,
3095                                           "%*s  %sfailed to match trie start class...%s\n",
3096                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3097                         );
3098                         sayNO_SILENT;
3099                    }
3100                 }
3101
3102             {
3103                 U8 *uc = ( U8* )locinput;
3104
3105                 STRLEN len = 0;
3106                 STRLEN foldlen = 0;
3107                 U8 *uscan = (U8*)NULL;
3108                 STRLEN bufflen=0;
3109                 SV *sv_accept_buff = NULL;
3110                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3111
3112                 ST.accepted = 0; /* how many accepting states we have seen */
3113                 ST.B = next;
3114                 ST.jump = trie->jump;
3115                 ST.me = scan;
3116                 /*
3117                    traverse the TRIE keeping track of all accepting states
3118                    we transition through until we get to a failing node.
3119                 */
3120
3121                 while ( state && uc <= (U8*)PL_regeol ) {
3122                     U32 base = trie->states[ state ].trans.base;
3123                     UV uvc = 0;
3124                     U16 charid;
3125                     /* We use charid to hold the wordnum as we don't use it
3126                        for charid until after we have done the wordnum logic.
3127                        We define an alias just so that the wordnum logic reads
3128                        more naturally. */
3129
3130 #define got_wordnum charid
3131                     got_wordnum = trie->states[ state ].wordnum;
3132
3133                     if ( got_wordnum ) {
3134                         if ( ! ST.accepted ) {
3135                             ENTER;
3136                             SAVETMPS; /* XXX is this necessary? dmq */
3137                             bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3138                             sv_accept_buff=newSV(bufflen *
3139                                             sizeof(reg_trie_accepted) - 1);
3140                             SvCUR_set(sv_accept_buff, 0);
3141                             SvPOK_on(sv_accept_buff);
3142                             sv_2mortal(sv_accept_buff);
3143                             SAVETMPS;
3144                             ST.accept_buff =
3145                                 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3146                         }
3147                         do {
3148                             if (ST.accepted >= bufflen) {
3149                                 bufflen *= 2;
3150                                 ST.accept_buff =(reg_trie_accepted*)
3151                                     SvGROW(sv_accept_buff,
3152                                         bufflen * sizeof(reg_trie_accepted));
3153                             }
3154                             SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3155                                 + sizeof(reg_trie_accepted));
3156
3157
3158                             ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3159                             ST.accept_buff[ST.accepted].endpos = uc;
3160                             ++ST.accepted;
3161                         } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3162                     }
3163 #undef got_wordnum
3164
3165                     DEBUG_TRIE_EXECUTE_r({
3166                                 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3167                                 PerlIO_printf( Perl_debug_log,
3168                                     "%*s  %sState: %4"UVxf" Accepted: %4"UVxf" ",
3169                                     2+depth * 2, "", PL_colors[4],
3170                                     (UV)state, (UV)ST.accepted );
3171                     });
3172
3173                     if ( base ) {
3174                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3175                                              uscan, len, uvc, charid, foldlen,
3176                                              foldbuf, uniflags);
3177
3178                         if (charid &&
3179                              (base + charid > trie->uniquecharcount )
3180                              && (base + charid - 1 - trie->uniquecharcount
3181                                     < trie->lasttrans)
3182                              && trie->trans[base + charid - 1 -
3183                                     trie->uniquecharcount].check == state)
3184                         {
3185                             state = trie->trans[base + charid - 1 -
3186                                 trie->uniquecharcount ].next;
3187                         }
3188                         else {
3189                             state = 0;
3190                         }
3191                         uc += len;
3192
3193                     }
3194                     else {
3195                         state = 0;
3196                     }
3197                     DEBUG_TRIE_EXECUTE_r(
3198                         PerlIO_printf( Perl_debug_log,
3199                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3200                             charid, uvc, (UV)state, PL_colors[5] );
3201                     );
3202                 }
3203                 if (!ST.accepted )
3204                    sayNO;
3205
3206                 DEBUG_EXECUTE_r(
3207                     PerlIO_printf( Perl_debug_log,
3208                         "%*s  %sgot %"IVdf" possible matches%s\n",
3209                         REPORT_CODE_OFF + depth * 2, "",
3210                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3211                 );
3212             }}
3213             goto trie_first_try; /* jump into the fail handler */
3214             /* NOTREACHED */
3215         case TRIE_next_fail: /* we failed - try next alterative */
3216             if ( ST.jump) {
3217                 REGCP_UNWIND(ST.cp);
3218                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3219                     PL_regoffs[n].end = -1;
3220                 *PL_reglastparen = n;
3221             }
3222           trie_first_try:
3223             if (do_cutgroup) {
3224                 do_cutgroup = 0;
3225                 no_final = 0;
3226             }
3227
3228             if ( ST.jump) {
3229                 ST.lastparen = *PL_reglastparen;
3230                 REGCP_SET(ST.cp);
3231             }
3232             if ( ST.accepted == 1 ) {
3233                 /* only one choice left - just continue */
3234                 DEBUG_EXECUTE_r({
3235                     AV *const trie_words
3236                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3237                     SV ** const tmp = av_fetch( trie_words,
3238                         ST.accept_buff[ 0 ].wordnum-1, 0 );
3239                     SV *sv= tmp ? sv_newmortal() : NULL;
3240
3241                     PerlIO_printf( Perl_debug_log,
3242                         "%*s  %sonly one match left: #%d <%s>%s\n",
3243                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3244                         ST.accept_buff[ 0 ].wordnum,
3245                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3246                                 PL_colors[0], PL_colors[1],
3247                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3248                             )
3249                         : "not compiled under -Dr",
3250                         PL_colors[5] );
3251                 });
3252                 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3253                 /* in this case we free tmps/leave before we call regmatch
3254                    as we wont be using accept_buff again. */
3255
3256                 locinput = PL_reginput;
3257                 nextchr = UCHARAT(locinput);
3258                 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3259                     scan = ST.B;
3260                 else
3261                     scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3262                 if (!has_cutgroup) {
3263                     FREETMPS;
3264                     LEAVE;
3265                 } else {
3266                     ST.accepted--;
3267                     PUSH_YES_STATE_GOTO(TRIE_next, scan);
3268                 }
3269
3270                 continue; /* execute rest of RE */
3271             }
3272
3273             if ( !ST.accepted-- ) {
3274                 DEBUG_EXECUTE_r({
3275                     PerlIO_printf( Perl_debug_log,
3276                         "%*s  %sTRIE failed...%s\n",
3277                         REPORT_CODE_OFF+depth*2, "",
3278                         PL_colors[4],
3279                         PL_colors[5] );
3280                 });
3281                 FREETMPS;
3282                 LEAVE;
3283                 sayNO_SILENT;
3284                 /*NOTREACHED*/
3285             }
3286
3287             /*
3288                There are at least two accepting states left.  Presumably
3289                the number of accepting states is going to be low,
3290                typically two. So we simply scan through to find the one
3291                with lowest wordnum.  Once we find it, we swap the last
3292                state into its place and decrement the size. We then try to
3293                match the rest of the pattern at the point where the word
3294                ends. If we succeed, control just continues along the
3295                regex; if we fail we return here to try the next accepting
3296                state
3297              */
3298
3299             {
3300                 U32 best = 0;
3301                 U32 cur;
3302                 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3303                     DEBUG_TRIE_EXECUTE_r(
3304                         PerlIO_printf( Perl_debug_log,
3305                             "%*s  %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3306                             REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3307                             (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3308                             ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3309                     );
3310
3311                     if (ST.accept_buff[cur].wordnum <
3312                             ST.accept_buff[best].wordnum)
3313                         best = cur;
3314                 }
3315
3316                 DEBUG_EXECUTE_r({
3317                     AV *const trie_words
3318                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3319                     SV ** const tmp = av_fetch( trie_words,
3320                         ST.accept_buff[ best ].wordnum - 1, 0 );
3321                     regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3322                                     ST.B :
3323                                     ST.me + ST.jump[ST.accept_buff[best].wordnum];
3324                     SV *sv= tmp ? sv_newmortal() : NULL;
3325
3326                     PerlIO_printf( Perl_debug_log,
3327                         "%*s  %strying alternation #%d <%s> at node #%d %s\n",
3328                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3329                         ST.accept_buff[best].wordnum,
3330                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3331                                 PL_colors[0], PL_colors[1],
3332                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3333                             ) : "not compiled under -Dr",
3334                             REG_NODE_NUM(nextop),
3335                         PL_colors[5] );
3336                 });
3337
3338                 if ( best<ST.accepted ) {
3339                     reg_trie_accepted tmp = ST.accept_buff[ best ];
3340                     ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3341                     ST.accept_buff[ ST.accepted ] = tmp;
3342                     best = ST.accepted;
3343                 }
3344                 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3345                 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3346                     scan = ST.B;
3347                 } else {
3348                     scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3349                 }
3350                 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3351                 /* NOTREACHED */
3352             }
3353             /* NOTREACHED */
3354         case TRIE_next:
3355             /* we dont want to throw this away, see bug 57042*/
3356             if (oreplsv != GvSV(PL_replgv))
3357                 sv_setsv(oreplsv, GvSV(PL_replgv));
3358             FREETMPS;
3359             LEAVE;
3360             sayYES;
3361 #undef  ST
3362
3363         case EXACT: {
3364             char *s = STRING(scan);
3365             ln = STR_LEN(scan);
3366             if (do_utf8 != UTF) {
3367                 /* The target and the pattern have differing utf8ness. */
3368                 char *l = locinput;
3369                 const char * const e = s + ln;
3370
3371                 if (do_utf8) {
3372                     /* The target is utf8, the pattern is not utf8. */
3373                     while (s < e) {
3374                         STRLEN ulen;
3375                         if (l >= PL_regeol)
3376                              sayNO;
3377                         if (NATIVE_TO_UNI(*(U8*)s) !=
3378                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3379                                             uniflags))
3380                              sayNO;
3381                         l += ulen;
3382                         s ++;
3383                     }
3384                 }
3385                 else {
3386                     /* The target is not utf8, the pattern is utf8. */
3387                     while (s < e) {
3388                         STRLEN ulen;
3389                         if (l >= PL_regeol)
3390                             sayNO;
3391                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3392                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3393                                            uniflags))
3394                             sayNO;
3395                         s += ulen;
3396                         l ++;
3397                     }
3398                 }
3399                 locinput = l;
3400                 nextchr = UCHARAT(locinput);
3401                 break;
3402             }
3403             /* The target and the pattern have the same utf8ness. */
3404             /* Inline the first character, for speed. */
3405             if (UCHARAT(s) != nextchr)
3406                 sayNO;
3407             if (PL_regeol - locinput < ln)
3408                 sayNO;
3409             if (ln > 1 && memNE(s, locinput, ln))
3410                 sayNO;
3411             locinput += ln;
3412             nextchr = UCHARAT(locinput);
3413             break;
3414             }
3415         case EXACTFL:
3416             PL_reg_flags |= RF_tainted;
3417             /* FALL THROUGH */
3418         case EXACTF: {
3419             char * const s = STRING(scan);
3420             ln = STR_LEN(scan);
3421
3422             if (do_utf8 || UTF) {
3423               /* Either target or the pattern are utf8. */
3424                 const char * const l = locinput;
3425                 char *e = PL_regeol;
3426
3427                 if (ibcmp_utf8(s, 0,  ln, cBOOL(UTF),
3428                                l, &e, 0,  do_utf8)) {
3429                      /* One more case for the sharp s:
3430                       * pack("U0U*", 0xDF) =~ /ss/i,
3431                       * the 0xC3 0x9F are the UTF-8
3432                       * byte sequence for the U+00DF. */
3433
3434                      if (!(do_utf8 &&
3435                            toLOWER(s[0]) == 's' &&
3436                            ln >= 2 &&
3437                            toLOWER(s[1]) == 's' &&
3438                            (U8)l[0] == 0xC3 &&
3439                            e - l >= 2 &&
3440                            (U8)l[1] == 0x9F))
3441                           sayNO;
3442                 }
3443                 locinput = e;
3444                 nextchr = UCHARAT(locinput);
3445                 break;
3446             }
3447
3448             /* Neither the target and the pattern are utf8. */
3449
3450             /* Inline the first character, for speed. */
3451             if (UCHARAT(s) != nextchr &&
3452                 UCHARAT(s) != ((OP(scan) == EXACTF)
3453                                ? PL_fold : PL_fold_locale)[nextchr])
3454                 sayNO;
3455             if (PL_regeol - locinput < ln)
3456                 sayNO;
3457             if (ln > 1 && (OP(scan) == EXACTF
3458                            ? ibcmp(s, locinput, ln)
3459                            : ibcmp_locale(s, locinput, ln)))
3460                 sayNO;
3461             locinput += ln;
3462             nextchr = UCHARAT(locinput);
3463             break;
3464             }
3465         case BOUNDL:
3466         case NBOUNDL:
3467             PL_reg_flags |= RF_tainted;
3468             /* FALL THROUGH */
3469         case BOUND:
3470         case NBOUND:
3471             /* was last char in word? */
3472             if (do_utf8) {
3473                 if (locinput == PL_bostr)
3474                     ln = '\n';
3475                 else {
3476                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3477
3478                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3479                 }
3480                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3481                     ln = isALNUM_uni(ln);
3482                     LOAD_UTF8_CHARCLASS_ALNUM();
3483                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3484                 }
3485                 else {
3486                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3487                     n = isALNUM_LC_utf8((U8*)locinput);
3488                 }
3489             }
3490             else {
3491                 ln = (locinput != PL_bostr) ?
3492                     UCHARAT(locinput - 1) : '\n';
3493                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3494                     ln = isALNUM(ln);
3495                     n = isALNUM(nextchr);
3496                 }
3497                 else {
3498                     ln = isALNUM_LC(ln);
3499                     n = isALNUM_LC(nextchr);
3500                 }
3501             }
3502             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3503                                     OP(scan) == BOUNDL))
3504                     sayNO;
3505             break;
3506         case ANYOF:
3507             if (do_utf8) {
3508                 STRLEN inclasslen = PL_regeol - locinput;
3509
3510                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3511                     goto anyof_fail;
3512                 if (locinput >= PL_regeol)
3513                     sayNO;
3514                 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3515                 nextchr = UCHARAT(locinput);
3516                 break;
3517             }
3518             else {
3519                 if (nextchr < 0)
3520                     nextchr = UCHARAT(locinput);
3521                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3522                     goto anyof_fail;
3523                 if (!nextchr && locinput >= PL_regeol)
3524                     sayNO;
3525                 nextchr = UCHARAT(++locinput);
3526                 break;
3527             }
3528         anyof_fail:
3529             /* If we might have the case of the German sharp s
3530              * in a casefolding Unicode character class. */
3531
3532             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3533                  locinput += SHARP_S_SKIP;
3534                  nextchr = UCHARAT(locinput);
3535             }
3536             else
3537                  sayNO;
3538             break;
3539         /* Special char classes - The defines start on line 129 or so */
3540         CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3541         CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3542
3543         CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3544         CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3545
3546         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3547         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3548
3549         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3550                        a Unicode extended Grapheme Cluster */
3551             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3552               extended Grapheme Cluster is:
3553
3554                CR LF
3555                | Prepend* Begin Extend*
3556                | .
3557
3558                Begin is (Hangul-syllable | ! Control)
3559                Extend is (Grapheme_Extend | Spacing_Mark)
3560                Control is [ GCB_Control CR LF ]
3561
3562                The discussion below shows how the code for CLUMP is derived
3563                from this regex.  Note that most of these concepts are from
3564                property values of the Grapheme Cluster Boundary (GCB) property.
3565                No code point can have multiple property values for a given
3566                property.  Thus a code point in Prepend can't be in Control, but
3567                it must be in !Control.  This is why Control above includes
3568                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3569                property separately, and so can't be in GCB_Control, even though
3570                they logically are controls.  Control is not the same as gc=cc,
3571                but includes format and other characters as well.
3572
3573                The Unicode definition of Hangul-syllable is:
3574                    L+
3575                    | (L* ( ( V | LV ) V* | LVT ) T*)
3576                    | T+
3577                   )
3578                Each of these is a value for the GCB property, and hence must be
3579                disjoint, so the order they are tested is immaterial, so the
3580                above can safely be changed to
3581                    T+
3582                    | L+
3583                    | (L* ( LVT | ( V | LV ) V*) T*)
3584
3585                The last two terms can be combined like this:
3586                    L* ( L
3587                         | (( LVT | ( V | LV ) V*) T*))
3588
3589                And refactored into this:
3590                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3591
3592                That means that if we have seen any L's at all we can quit
3593                there, but if the next character is a LVT, a V or and LV we
3594                should keep going.
3595
3596                There is a subtlety with Prepend* which showed up in testing.
3597                Note that the Begin, and only the Begin is required in:
3598                 | Prepend* Begin Extend*
3599                Also, Begin contains '! Control'.  A Prepend must be a '!
3600                Control', which means it must be a Begin.  What it comes down to
3601                is that if we match Prepend* and then find no suitable Begin
3602                afterwards, that if we backtrack the last Prepend, that one will
3603                be a suitable Begin.
3604             */
3605
3606             if (locinput >= PL_regeol)
3607                 sayNO;
3608             if  (! do_utf8) {
3609
3610                 /* Match either CR LF  or '.', as all the other possibilities
3611                  * require utf8 */
3612                 locinput++;         /* Match the . or CR */
3613                 if (nextchr == '\r'
3614                     && locinput < PL_regeol
3615                     && UCHARAT(locinput) == '\n') locinput++;
3616             }
3617             else {
3618
3619                 /* Utf8: See if is ( CR LF ); already know that locinput <
3620                  * PL_regeol, so locinput+1 is in bounds */
3621                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3622                     locinput += 2;
3623                 }
3624                 else {
3625                     /* In case have to backtrack to beginning, then match '.' */
3626                     char *starting = locinput;
3627
3628                     /* In case have to backtrack the last prepend */
3629                     char *previous_prepend = 0;
3630
3631                     LOAD_UTF8_CHARCLASS_GCB();
3632
3633                     /* Match (prepend)* */
3634                     while (locinput < PL_regeol
3635                            && swash_fetch(PL_utf8_X_prepend,
3636                                           (U8*)locinput, do_utf8))
3637                     {
3638                         previous_prepend = locinput;
3639                         locinput += UTF8SKIP(locinput);
3640                     }
3641
3642                     /* As noted above, if we matched a prepend character, but
3643                      * the next thing won't match, back off the last prepend we
3644                      * matched, as it is guaranteed to match the begin */
3645                     if (previous_prepend
3646                         && (locinput >=  PL_regeol
3647                             || ! swash_fetch(PL_utf8_X_begin,
3648                                              (U8*)locinput, do_utf8)))
3649                     {
3650                         locinput = previous_prepend;
3651                     }
3652
3653                     /* Note that here we know PL_regeol > locinput, as we
3654                      * tested that upon input to this switch case, and if we
3655                      * moved locinput forward, we tested the result just above
3656                      * and it either passed, or we backed off so that it will
3657                      * now pass */
3658                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3659
3660                         /* Here did not match the required 'Begin' in the
3661                          * second term.  So just match the very first
3662                          * character, the '.' of the final term of the regex */
3663                         locinput = starting + UTF8SKIP(starting);
3664                     } else {
3665
3666                         /* Here is the beginning of a character that can have
3667                          * an extender.  It is either a hangul syllable, or a
3668                          * non-control */
3669                         if (swash_fetch(PL_utf8_X_non_hangul,
3670                                         (U8*)locinput, do_utf8))
3671                         {
3672
3673                             /* Here not a Hangul syllable, must be a
3674                              * ('!  * Control') */
3675                             locinput += UTF8SKIP(locinput);
3676                         } else {
3677
3678                             /* Here is a Hangul syllable.  It can be composed
3679                              * of several individual characters.  One
3680                              * possibility is T+ */
3681                             if (swash_fetch(PL_utf8_X_T,
3682                                             (U8*)locinput, do_utf8))
3683                             {
3684                                 while (locinput < PL_regeol
3685                                         && swash_fetch(PL_utf8_X_T,
3686                                                         (U8*)locinput, do_utf8))
3687                                 {
3688                                     locinput += UTF8SKIP(locinput);
3689                                 }
3690                             } else {
3691
3692                                 /* Here, not T+, but is a Hangul.  That means
3693                                  * it is one of the others: L, LV, LVT or V,
3694                                  * and matches:
3695                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3696
3697                                 /* Match L*           */
3698                                 while (locinput < PL_regeol
3699                                         && swash_fetch(PL_utf8_X_L,
3700                                                         (U8*)locinput, do_utf8))
3701                                 {
3702                                     locinput += UTF8SKIP(locinput);
3703                                 }
3704
3705                                 /* Here, have exhausted L*.  If the next
3706                                  * character is not an LV, LVT nor V, it means
3707                                  * we had to have at least one L, so matches L+
3708                                  * in the original equation, we have a complete
3709                                  * hangul syllable.  Are done. */
3710
3711                                 if (locinput < PL_regeol
3712                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3713                                                     (U8*)locinput, do_utf8))
3714                                 {
3715
3716                                     /* Otherwise keep going.  Must be LV, LVT
3717                                      * or V.  See if LVT */
3718                                     if (swash_fetch(PL_utf8_X_LVT,
3719                                                     (U8*)locinput, do_utf8))
3720                                     {
3721                                         locinput += UTF8SKIP(locinput);
3722                                     } else {
3723
3724                                         /* Must be  V or LV.  Take it, then
3725                                          * match V*     */
3726                                         locinput += UTF8SKIP(locinput);
3727                                         while (locinput < PL_regeol
3728                                                 && swash_fetch(PL_utf8_X_V,
3729                                                          (U8*)locinput, do_utf8))
3730                                         {
3731                                             locinput += UTF8SKIP(locinput);
3732                                         }
3733                                     }
3734
3735                                     /* And any of LV, LVT, or V can be followed
3736                                      * by T*            */
3737                                     while (locinput < PL_regeol
3738                                            && swash_fetch(PL_utf8_X_T,
3739                                                            (U8*)locinput,
3740                                                            do_utf8))
3741                                     {
3742                                         locinput += UTF8SKIP(locinput);
3743                                     }
3744                                 }
3745                             }
3746                         }
3747
3748                         /* Match any extender */
3749                         while (locinput < PL_regeol
3750                                 && swash_fetch(PL_utf8_X_extend,
3751                                                 (U8*)locinput, do_utf8))
3752                         {
3753                             locinput += UTF8SKIP(locinput);
3754                         }
3755                     }
3756                 }
3757                 if (locinput > PL_regeol) sayNO;
3758             }
3759             nextchr = UCHARAT(locinput);
3760             break;
3761
3762         case NREFFL:
3763         {
3764             char *s;
3765             char type;
3766             PL_reg_flags |= RF_tainted;
3767             /* FALL THROUGH */
3768         case NREF:
3769         case NREFF:
3770             type = OP(scan);
3771             n = reg_check_named_buff_matched(rex,scan);
3772
3773             if ( n ) {
3774                 type = REF + ( type - NREF );
3775                 goto do_ref;
3776             } else {
3777                 sayNO;
3778             }
3779             /* unreached */
3780         case REFFL:
3781             PL_reg_flags |= RF_tainted;
3782             /* FALL THROUGH */
3783         case REF:
3784         case REFF:
3785             n = ARG(scan);  /* which paren pair */
3786             type = OP(scan);
3787           do_ref:
3788             ln = PL_regoffs[n].start;
3789             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3790             if (*PL_reglastparen < n || ln == -1)
3791                 sayNO;                  /* Do not match unless seen CLOSEn. */
3792             if (ln == PL_regoffs[n].end)
3793                 break;
3794
3795             s = PL_bostr + ln;
3796             if (do_utf8 && type != REF) {       /* REF can do byte comparison */
3797                 char *l = locinput;
3798                 const char *e = PL_bostr + PL_regoffs[n].end;
3799                 /*
3800                  * Note that we can't do the "other character" lookup trick as
3801                  * in the 8-bit case (no pun intended) because in Unicode we
3802                  * have to map both upper and title case to lower case.
3803                  */
3804                 if (type == REFF) {
3805                     while (s < e) {
3806                         STRLEN ulen1, ulen2;
3807                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3808                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3809
3810                         if (l >= PL_regeol)
3811                             sayNO;
3812                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3813                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3814                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3815                             sayNO;
3816                         s += ulen1;
3817                         l += ulen2;
3818                     }
3819                 }
3820                 locinput = l;
3821                 nextchr = UCHARAT(locinput);
3822                 break;
3823             }
3824
3825             /* Inline the first character, for speed. */
3826             if (UCHARAT(s) != nextchr &&
3827                 (type == REF ||
3828                  (UCHARAT(s) != (type == REFF
3829                                   ? PL_fold : PL_fold_locale)[nextchr])))
3830                 sayNO;
3831             ln = PL_regoffs[n].end - ln;
3832             if (locinput + ln > PL_regeol)
3833                 sayNO;
3834             if (ln > 1 && (type == REF
3835                            ? memNE(s, locinput, ln)
3836                            : (type == REFF
3837                               ? ibcmp(s, locinput, ln)
3838                               : ibcmp_locale(s, locinput, ln))))
3839                 sayNO;
3840             locinput += ln;
3841             nextchr = UCHARAT(locinput);
3842             break;
3843         }
3844         case NOTHING:
3845         case TAIL:
3846             break;
3847         case BACK:
3848             break;
3849
3850 #undef  ST
3851 #define ST st->u.eval
3852         {
3853             SV *ret;
3854             REGEXP *re_sv;
3855             regexp *re;
3856             regexp_internal *rei;
3857             regnode *startpoint;
3858
3859         case GOSTART:
3860         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3861             if (cur_eval && cur_eval->locinput==locinput) {
3862                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3863                     Perl_croak(aTHX_ "Infinite recursion in regex");
3864                 if ( ++nochange_depth > max_nochange_depth )
3865                     Perl_croak(aTHX_
3866                         "Pattern subroutine nesting without pos change"
3867                         " exceeded limit in regex");
3868             } else {
3869                 nochange_depth = 0;
3870             }
3871             re_sv = rex_sv;
3872             re = rex;
3873             rei = rexi;
3874             (void)ReREFCNT_inc(rex_sv);
3875             if (OP(scan)==GOSUB) {
3876                 startpoint = scan + ARG2L(scan);
3877                 ST.close_paren = ARG(scan);
3878             } else {
3879                 startpoint = rei->program+1;
3880                 ST.close_paren = 0;
3881             }
3882             goto eval_recurse_doit;
3883             /* NOTREACHED */
3884         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
3885             if (cur_eval && cur_eval->locinput==locinput) {
3886                 if ( ++nochange_depth > max_nochange_depth )
3887                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3888             } else {
3889                 nochange_depth = 0;
3890             }
3891             {
3892                 /* execute the code in the {...} */
3893                 dSP;
3894                 SV ** const before = SP;
3895                 OP_4tree * const oop = PL_op;
3896                 COP * const ocurcop = PL_curcop;
3897                 PAD *old_comppad;
3898                 char *saved_regeol = PL_regeol;
3899
3900                 n = ARG(scan);
3901                 PL_op = (OP_4tree*)rexi->data->data[n];
3902                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3903                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3904                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3905                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3906
3907                 if (sv_yes_mark) {
3908                     SV *sv_mrk = get_sv("REGMARK", 1);
3909                     sv_setsv(sv_mrk, sv_yes_mark);
3910                 }
3911
3912                 CALLRUNOPS(aTHX);                       /* Scalar context. */
3913                 SPAGAIN;
3914                 if (SP == before)
3915                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
3916                 else {
3917                     ret = POPs;
3918                     PUTBACK;
3919                 }
3920
3921                 PL_op = oop;
3922                 PAD_RESTORE_LOCAL(old_comppad);
3923                 PL_curcop = ocurcop;
3924                 PL_regeol = saved_regeol;
3925                 if (!logical) {
3926                     /* /(?{...})/ */
3927                     sv_setsv(save_scalar(PL_replgv), ret);
3928                     break;
3929                 }
3930             }
3931             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3932                 logical = 0;
3933                 {
3934                     /* extract RE object from returned value; compiling if
3935                      * necessary */
3936                     MAGIC *mg = NULL;
3937                     REGEXP *rx = NULL;
3938
3939                     if (SvROK(ret)) {
3940                         SV *const sv = SvRV(ret);
3941
3942                         if (SvTYPE(sv) == SVt_REGEXP) {
3943                             rx = (REGEXP*) sv;
3944                         } else if (SvSMAGICAL(sv)) {
3945                             mg = mg_find(sv, PERL_MAGIC_qr);
3946                             assert(mg);
3947                         }
3948                     } else if (SvTYPE(ret) == SVt_REGEXP) {
3949                         rx = (REGEXP*) ret;
3950                     } else if (SvSMAGICAL(ret)) {
3951                         if (SvGMAGICAL(ret)) {
3952                             /* I don't believe that there is ever qr magic
3953                                here.  */
3954                             assert(!mg_find(ret, PERL_MAGIC_qr));
3955                             sv_unmagic(ret, PERL_MAGIC_qr);
3956                         }
3957                         else {
3958                             mg = mg_find(ret, PERL_MAGIC_qr);
3959                             /* testing suggests mg only ends up non-NULL for
3960                                scalars who were upgraded and compiled in the
3961                                else block below. In turn, this is only
3962                                triggered in the "postponed utf8 string" tests
3963                                in t/op/pat.t  */
3964                         }
3965                     }
3966
3967                     if (mg) {
3968                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3969                         assert(rx);
3970                     }
3971                     if (rx) {
3972                         rx = reg_temp_copy(NULL, rx);
3973                     }
3974                     else {
3975                         U32 pm_flags = 0;
3976                         const I32 osize = PL_regsize;
3977
3978                         if (DO_UTF8(ret)) {
3979                             assert (SvUTF8(ret));
3980                         } else if (SvUTF8(ret)) {
3981                             /* Not doing UTF-8, despite what the SV says. Is
3982                                this only if we're trapped in use 'bytes'?  */
3983                             /* Make a copy of the octet sequence, but without
3984                                the flag on, as the compiler now honours the
3985                                SvUTF8 flag on ret.  */
3986                             STRLEN len;
3987                             const char *const p = SvPV(ret, len);
3988                             ret = newSVpvn_flags(p, len, SVs_TEMP);
3989                         }
3990                         rx = CALLREGCOMP(ret, pm_flags);
3991                         if (!(SvFLAGS(ret)
3992                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3993                                  | SVs_GMG))) {
3994                             /* This isn't a first class regexp. Instead, it's
3995                                caching a regexp onto an existing, Perl visible
3996                                scalar.  */
3997                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3998                         }
3999                         PL_regsize = osize;
4000                     }
4001                     re_sv = rx;
4002                     re = (struct regexp *)SvANY(rx);
4003                 }
4004                 RXp_MATCH_COPIED_off(re);
4005                 re->subbeg = rex->subbeg;
4006                 re->sublen = rex->sublen;
4007                 rei = RXi_GET(re);
4008                 DEBUG_EXECUTE_r(
4009                     debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
4010                         "Matching embedded");
4011                 );
4012                 startpoint = rei->program + 1;
4013                 ST.close_paren = 0; /* only used for GOSUB */
4014                 /* borrowed from regtry */
4015                 if (PL_reg_start_tmpl <= re->nparens) {
4016                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4017                     if(PL_reg_start_tmp)
4018                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4019                     else
4020                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4021                 }
4022
4023         eval_recurse_doit: /* Share code with GOSUB below this line */
4024                 /* run the pattern returned from (??{...}) */
4025                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4026                 REGCP_SET(ST.lastcp);
4027
4028                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4029
4030                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4031                 PL_reglastparen = &re->lastparen;
4032                 PL_reglastcloseparen = &re->lastcloseparen;
4033                 re->lastparen = 0;
4034                 re->lastcloseparen = 0;
4035
4036                 PL_reginput = locinput;
4037                 PL_regsize = 0;
4038
4039                 /* XXXX This is too dramatic a measure... */
4040                 PL_reg_maxiter = 0;
4041
4042                 ST.toggle_reg_flags = PL_reg_flags;
4043                 if (RX_UTF8(re_sv))
4044                     PL_reg_flags |= RF_utf8;
4045                 else
4046                     PL_reg_flags &= ~RF_utf8;
4047                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4048
4049                 ST.prev_rex = rex_sv;
4050                 ST.prev_curlyx = cur_curlyx;
4051                 SETREX(rex_sv,re_sv);
4052                 rex = re;
4053                 rexi = rei;
4054                 cur_curlyx = NULL;
4055                 ST.B = next;
4056                 ST.prev_eval = cur_eval;
4057                 cur_eval = st;
4058                 /* now continue from first node in postoned RE */
4059                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4060                 /* NOTREACHED */
4061             }
4062             /* logical is 1,   /(?(?{...})X|Y)/ */
4063             sw = cBOOL(SvTRUE(ret));
4064             logical = 0;
4065             break;
4066         }
4067
4068         case EVAL_AB: /* cleanup after a successful (??{A})B */
4069             /* note: this is called twice; first after popping B, then A */
4070             PL_reg_flags ^= ST.toggle_reg_flags;
4071             ReREFCNT_dec(rex_sv);
4072             SETREX(rex_sv,ST.prev_rex);
4073             rex = (struct regexp *)SvANY(rex_sv);
4074             rexi = RXi_GET(rex);
4075             regcpblow(ST.cp);
4076             cur_eval = ST.prev_eval;
4077             cur_curlyx = ST.prev_curlyx;
4078
4079             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4080             PL_reglastparen = &rex->lastparen;
4081             PL_reglastcloseparen = &rex->lastcloseparen;
4082             /* also update PL_regoffs */
4083             PL_regoffs = rex->offs;
4084
4085             /* XXXX This is too dramatic a measure... */
4086             PL_reg_maxiter = 0;
4087             if ( nochange_depth )
4088                 nochange_depth--;
4089             sayYES;
4090
4091
4092         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4093             /* note: this is called twice; first after popping B, then A */
4094             PL_reg_flags ^= ST.toggle_reg_flags;
4095             ReREFCNT_dec(rex_sv);
4096             SETREX(rex_sv,ST.prev_rex);
4097             rex = (struct regexp *)SvANY(rex_sv);
4098             rexi = RXi_GET(rex);
4099             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4100             PL_reglastparen = &rex->lastparen;
4101             PL_reglastcloseparen = &rex->lastcloseparen;
4102
4103             PL_reginput = locinput;
4104             REGCP_UNWIND(ST.lastcp);
4105             regcppop(rex);
4106             cur_eval = ST.prev_eval;
4107             cur_curlyx = ST.prev_curlyx;
4108             /* XXXX This is too dramatic a measure... */
4109             PL_reg_maxiter = 0;
4110             if ( nochange_depth )
4111                 nochange_depth--;
4112             sayNO_SILENT;
4113 #undef ST
4114
4115         case OPEN:
4116             n = ARG(scan);  /* which paren pair */
4117             PL_reg_start_tmp[n] = locinput;
4118             if (n > PL_regsize)
4119                 PL_regsize = n;
4120             lastopen = n;
4121             break;
4122         case CLOSE:
4123             n = ARG(scan);  /* which paren pair */
4124             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4125             PL_regoffs[n].end = locinput - PL_bostr;
4126             /*if (n > PL_regsize)
4127                 PL_regsize = n;*/
4128             if (n > *PL_reglastparen)
4129                 *PL_reglastparen = n;
4130             *PL_reglastcloseparen = n;
4131             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4132                 goto fake_end;
4133             }
4134             break;
4135         case ACCEPT:
4136             if (ARG(scan)){
4137                 regnode *cursor;
4138                 for (cursor=scan;
4139                      cursor && OP(cursor)!=END;
4140                      cursor=regnext(cursor))
4141                 {
4142                     if ( OP(cursor)==CLOSE ){
4143                         n = ARG(cursor);
4144                         if ( n <= lastopen ) {
4145                             PL_regoffs[n].start
4146                                 = PL_reg_start_tmp[n] - PL_bostr;
4147                             PL_regoffs[n].end = locinput - PL_bostr;
4148                             /*if (n > PL_regsize)
4149                             PL_regsize = n;*/
4150                             if (n > *PL_reglastparen)
4151                                 *PL_reglastparen = n;
4152                             *PL_reglastcloseparen = n;
4153                             if ( n == ARG(scan) || (cur_eval &&
4154                                 cur_eval->u.eval.close_paren == n))
4155                                 break;
4156                         }
4157                     }
4158                 }
4159             }
4160             goto fake_end;
4161             /*NOTREACHED*/
4162         case GROUPP:
4163             n = ARG(scan);  /* which paren pair */
4164             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4165             break;
4166         case NGROUPP:
4167             /* reg_check_named_buff_matched returns 0 for no match */
4168             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4169             break;
4170         case INSUBP:
4171             n = ARG(scan);
4172             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4173             break;
4174         case DEFINEP:
4175             sw = 0;
4176             break;
4177         case IFTHEN:
4178             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4179             if (sw)
4180                 next = NEXTOPER(NEXTOPER(scan));
4181             else {
4182                 next = scan + ARG(scan);
4183                 if (OP(next) == IFTHEN) /* Fake one. */
4184                     next = NEXTOPER(NEXTOPER(next));
4185             }
4186             break;
4187         case LOGICAL:
4188             logical = scan->flags;
4189             break;
4190
4191 /*******************************************************************
4192
4193 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4194 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4195 STAR/PLUS/CURLY/CURLYN are used instead.)
4196
4197 A*B is compiled as <CURLYX><A><WHILEM><B>
4198
4199 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4200 state, which contains the current count, initialised to -1. It also sets
4201 cur_curlyx to point to this state, with any previous value saved in the
4202 state block.
4203
4204 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4205 since the pattern may possibly match zero times (i.e. it's a while {} loop
4206 rather than a do {} while loop).
4207
4208 Each entry to WHILEM represents a successful match of A. The count in the
4209 CURLYX block is incremented, another WHILEM state is pushed, and execution
4210 passes to A or B depending on greediness and the current count.
4211
4212 For example, if matching against the string a1a2a3b (where the aN are
4213 substrings that match /A/), then the match progresses as follows: (the
4214 pushed states are interspersed with the bits of strings matched so far):
4215
4216     <CURLYX cnt=-1>
4217     <CURLYX cnt=0><WHILEM>
4218     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4219     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4220     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4221     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4222
4223 (Contrast this with something like CURLYM, which maintains only a single
4224 backtrack state:
4225
4226     <CURLYM cnt=0> a1
4227     a1 <CURLYM cnt=1> a2
4228     a1 a2 <CURLYM cnt=2> a3
4229     a1 a2 a3 <CURLYM cnt=3> b
4230 )
4231
4232 Each WHILEM state block marks a point to backtrack to upon partial failure
4233 of A or B, and also contains some minor state data related to that
4234 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4235 overall state, such as the count, and pointers to the A and B ops.
4236
4237 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4238 must always point to the *current* CURLYX block, the rules are:
4239
4240 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4241 and set cur_curlyx to point the new block.
4242
4243 When popping the CURLYX block after a successful or unsuccessful match,
4244 restore the previous cur_curlyx.
4245
4246 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4247 to the outer one saved in the CURLYX block.
4248
4249 When popping the WHILEM block after a successful or unsuccessful B match,
4250 restore the previous cur_curlyx.
4251
4252 Here's an example for the pattern (AI* BI)*BO
4253 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4254
4255 cur_
4256 curlyx backtrack stack
4257 ------ ---------------
4258 NULL
4259 CO     <CO prev=NULL> <WO>
4260 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4261 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4262 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4263
4264 At this point the pattern succeeds, and we work back down the stack to
4265 clean up, restoring as we go:
4266
4267 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4268 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4269 CO     <CO prev=NULL> <WO>
4270 NULL
4271
4272 *******************************************************************/
4273
4274 #define ST st->u.curlyx
4275
4276         case CURLYX:    /* start of /A*B/  (for complex A) */
4277         {
4278             /* No need to save/restore up to this paren */
4279             I32 parenfloor = scan->flags;
4280
4281             assert(next); /* keep Coverity happy */
4282             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4283                 next += ARG(next);
4284
4285             /* XXXX Probably it is better to teach regpush to support
4286                parenfloor > PL_regsize... */
4287             if (parenfloor > (I32)*PL_reglastparen)
4288                 parenfloor = *PL_reglastparen; /* Pessimization... */
4289
4290             ST.prev_curlyx= cur_curlyx;
4291             cur_curlyx = st;
4292             ST.cp = PL_savestack_ix;
4293
4294             /* these fields contain the state of the current curly.
4295              * they are accessed by subsequent WHILEMs */
4296             ST.parenfloor = parenfloor;
4297             ST.min = ARG1(scan);
4298             ST.max = ARG2(scan);
4299             ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4300             ST.B = next;
4301             ST.minmod = minmod;
4302             minmod = 0;
4303             ST.count = -1;      /* this will be updated by WHILEM */
4304             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4305
4306             PL_reginput = locinput;
4307             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4308             /* NOTREACHED */
4309         }
4310
4311         case CURLYX_end: /* just finished matching all of A*B */
4312             cur_curlyx = ST.prev_curlyx;
4313             sayYES;
4314             /* NOTREACHED */
4315
4316         case CURLYX_end_fail: /* just failed to match all of A*B */
4317             regcpblow(ST.cp);
4318             cur_curlyx = ST.prev_curlyx;
4319             sayNO;
4320             /* NOTREACHED */
4321
4322
4323 #undef ST
4324 #define ST st->u.whilem
4325
4326         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4327         {
4328             /* see the discussion above about CURLYX/WHILEM */
4329             I32 n;
4330             assert(cur_curlyx); /* keep Coverity happy */
4331             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4332             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4333             ST.cache_offset = 0;
4334             ST.cache_mask = 0;
4335
4336             PL_reginput = locinput;
4337
4338             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4339                   "%*s  whilem: matched %ld out of %ld..%ld\n",
4340                   REPORT_CODE_OFF+depth*2, "", (long)n,
4341                   (long)cur_curlyx->u.curlyx.min,
4342                   (long)cur_curlyx->u.curlyx.max)
4343             );
4344
4345             /* First just match a string of min A's. */
4346
4347             if (n < cur_curlyx->u.curlyx.min) {
4348                 cur_curlyx->u.curlyx.lastloc = locinput;
4349                 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4350                 /* NOTREACHED */
4351             }
4352
4353             /* If degenerate A matches "", assume A done. */
4354
4355             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4356                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4357                    "%*s  whilem: empty match detected, trying continuation...\n",
4358                    REPORT_CODE_OFF+depth*2, "")
4359                 );
4360                 goto do_whilem_B_max;
4361             }
4362
4363             /* super-linear cache processing */
4364
4365             if (scan->flags) {
4366
4367                 if (!PL_reg_maxiter) {
4368                     /* start the countdown: Postpone detection until we
4369                      * know the match is not *that* much linear. */
4370                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4371                     /* possible overflow for long strings and many CURLYX's */
4372                     if (PL_reg_maxiter < 0)
4373                         PL_reg_maxiter = I32_MAX;
4374                     PL_reg_leftiter = PL_reg_maxiter;
4375                 }
4376
4377                 if (PL_reg_leftiter-- == 0) {
4378                     /* initialise cache */
4379                     const I32 size = (PL_reg_maxiter + 7)/8;
4380                     if (PL_reg_poscache) {
4381                         if ((I32)PL_reg_poscache_size < size) {
4382                             Renew(PL_reg_poscache, size, char);
4383                             PL_reg_poscache_size = size;
4384                         }
4385                         Zero(PL_reg_poscache, size, char);
4386                     }
4387                     else {
4388                         PL_reg_poscache_size = size;
4389                         Newxz(PL_reg_poscache, size, char);
4390                     }
4391                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4392       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4393                               PL_colors[4], PL_colors[5])
4394                     );
4395                 }
4396
4397                 if (PL_reg_leftiter < 0) {
4398                     /* have we already failed at this position? */
4399                     I32 offset, mask;
4400                     offset  = (scan->flags & 0xf) - 1
4401                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4402                     mask    = 1 << (offset % 8);
4403                     offset /= 8;
4404                     if (PL_reg_poscache[offset] & mask) {
4405                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4406                             "%*s  whilem: (cache) already tried at this position...\n",
4407                             REPORT_CODE_OFF+depth*2, "")
4408                         );
4409                         sayNO; /* cache records failure */
4410                     }
4411                     ST.cache_offset = offset;
4412                     ST.cache_mask   = mask;
4413                 }
4414             }
4415
4416             /* Prefer B over A for minimal matching. */
4417
4418             if (cur_curlyx->u.curlyx.minmod) {
4419                 ST.save_curlyx = cur_curlyx;
4420                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4421                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4422                 REGCP_SET(ST.lastcp);
4423                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4424                 /* NOTREACHED */
4425             }
4426
4427             /* Prefer A over B for maximal matching. */
4428
4429             if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4430                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4431                 cur_curlyx->u.curlyx.lastloc = locinput;
4432                 REGCP_SET(ST.lastcp);
4433                 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4434                 /* NOTREACHED */
4435             }
4436             goto do_whilem_B_max;
4437         }
4438         /* NOTREACHED */
4439
4440         case WHILEM_B_min: /* just matched B in a minimal match */
4441         case WHILEM_B_max: /* just matched B in a maximal match */
4442             cur_curlyx = ST.save_curlyx;
4443             sayYES;
4444             /* NOTREACHED */
4445
4446         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4447             cur_curlyx = ST.save_curlyx;
4448             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4449             cur_curlyx->u.curlyx.count--;
4450             CACHEsayNO;
4451             /* NOTREACHED */
4452
4453         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4454             REGCP_UNWIND(ST.lastcp);
4455             regcppop(rex);
4456             /* FALL THROUGH */
4457         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4458             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4459             cur_curlyx->u.curlyx.count--;
4460             CACHEsayNO;
4461             /* NOTREACHED */
4462
4463         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4464             REGCP_UNWIND(ST.lastcp);
4465             regcppop(rex);      /* Restore some previous $<digit>s? */
4466             PL_reginput = locinput;
4467             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4468                 "%*s  whilem: failed, trying continuation...\n",
4469                 REPORT_CODE_OFF+depth*2, "")
4470             );
4471           do_whilem_B_max:
4472             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4473                 && ckWARN(WARN_REGEXP)
4474                 && !(PL_reg_flags & RF_warned))
4475             {
4476                 PL_reg_flags |= RF_warned;
4477                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4478                      "Complex regular subexpression recursion",
4479                      REG_INFTY - 1);
4480             }
4481
4482             /* now try B */
4483             ST.save_curlyx = cur_curlyx;
4484             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4485             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4486             /* NOTREACHED */
4487
4488         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4489             cur_curlyx = ST.save_curlyx;
4490             REGCP_UNWIND(ST.lastcp);
4491             regcppop(rex);
4492
4493             if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4494                 /* Maximum greed exceeded */
4495                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4496                     && ckWARN(WARN_REGEXP)
4497                     && !(PL_reg_flags & RF_warned))
4498                 {
4499                     PL_reg_flags |= RF_warned;
4500                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4501                         "%s limit (%d) exceeded",
4502                         "Complex regular subexpression recursion",
4503                         REG_INFTY - 1);
4504                 }
4505                 cur_curlyx->u.curlyx.count--;
4506                 CACHEsayNO;
4507             }
4508
4509             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4510                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4511             );
4512             /* Try grabbing another A and see if it helps. */
4513             PL_reginput = locinput;
4514             cur_curlyx->u.curlyx.lastloc = locinput;
4515             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4516             REGCP_SET(ST.lastcp);
4517             PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4518             /* NOTREACHED */
4519
4520 #undef  ST
4521 #define ST st->u.branch
4522
4523         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4524             next = scan + ARG(scan);
4525             if (next == scan)
4526                 next = NULL;
4527             scan = NEXTOPER(scan);
4528             /* FALL THROUGH */
4529
4530         case BRANCH:        /*  /(...|A|...)/ */
4531             scan = NEXTOPER(scan); /* scan now points to inner node */
4532             ST.lastparen = *PL_reglastparen;
4533             ST.next_branch = next;
4534             REGCP_SET(ST.cp);
4535             PL_reginput = locinput;
4536
4537             /* Now go into the branch */
4538             if (has_cutgroup) {
4539                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4540             } else {
4541                 PUSH_STATE_GOTO(BRANCH_next, scan);
4542             }
4543             /* NOTREACHED */
4544         case CUTGROUP:
4545             PL_reginput = locinput;
4546             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4547                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4548             PUSH_STATE_GOTO(CUTGROUP_next,next);
4549             /* NOTREACHED */
4550         case CUTGROUP_next_fail:
4551             do_cutgroup = 1;
4552             no_final = 1;
4553             if (st->u.mark.mark_name)
4554                 sv_commit = st->u.mark.mark_name;
4555             sayNO;
4556             /* NOTREACHED */
4557         case BRANCH_next:
4558             sayYES;
4559             /* NOTREACHED */
4560         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4561             if (do_cutgroup) {
4562                 do_cutgroup = 0;
4563                 no_final = 0;
4564             }
4565             REGCP_UNWIND(ST.cp);
4566             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4567                 PL_regoffs[n].end = -1;
4568             *PL_reglastparen = n;
4569             /*dmq: *PL_reglastcloseparen = n; */
4570             scan = ST.next_branch;
4571             /* no more branches? */
4572             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4573                 DEBUG_EXECUTE_r({
4574                     PerlIO_printf( Perl_debug_log,
4575                         "%*s  %sBRANCH failed...%s\n",
4576                         REPORT_CODE_OFF+depth*2, "",
4577                         PL_colors[4],
4578                         PL_colors[5] );
4579                 });
4580                 sayNO_SILENT;
4581             }
4582             continue; /* execute next BRANCH[J] op */
4583             /* NOTREACHED */
4584
4585         case MINMOD:
4586             minmod = 1;
4587             break;
4588
4589 #undef  ST
4590 #define ST st->u.curlym
4591
4592         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4593
4594             /* This is an optimisation of CURLYX that enables us to push
4595              * only a single backtracking state, no matter how many matches
4596              * there are in {m,n}. It relies on the pattern being constant
4597              * length, with no parens to influence future backrefs
4598              */
4599
4600             ST.me = scan;
4601             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4602
4603             /* if paren positive, emulate an OPEN/CLOSE around A */
4604             if (ST.me->flags) {
4605                 U32 paren = ST.me->flags;
4606                 if (paren > PL_regsize)
4607                     PL_regsize = paren;
4608                 if (paren > *PL_reglastparen)
4609                     *PL_reglastparen = paren;
4610                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4611             }
4612             ST.A = scan;
4613             ST.B = next;
4614             ST.alen = 0;
4615             ST.count = 0;
4616             ST.minmod = minmod;
4617             minmod = 0;
4618             ST.c1 = CHRTEST_UNINIT;
4619             REGCP_SET(ST.cp);
4620
4621             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4622                 goto curlym_do_B;
4623
4624           curlym_do_A: /* execute the A in /A{m,n}B/  */
4625             PL_reginput = locinput;
4626             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4627             /* NOTREACHED */
4628
4629         case CURLYM_A: /* we've just matched an A */
4630             locinput = st->locinput;
4631             nextchr = UCHARAT(locinput);
4632
4633             ST.count++;
4634             /* after first match, determine A's length: u.curlym.alen */
4635             if (ST.count == 1) {
4636                 if (PL_reg_match_utf8) {
4637                     char *s = locinput;
4638                     while (s < PL_reginput) {
4639                         ST.alen++;
4640                         s += UTF8SKIP(s);
4641                     }
4642                 }
4643                 else {
4644                     ST.alen = PL_reginput - locinput;
4645                 }
4646                 if (ST.alen == 0)
4647                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4648             }
4649             DEBUG_EXECUTE_r(
4650                 PerlIO_printf(Perl_debug_log,
4651                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4652                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4653                           (IV) ST.count, (IV)ST.alen)
4654             );
4655
4656             locinput = PL_reginput;
4657
4658             if (cur_eval && cur_eval->u.eval.close_paren &&
4659                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4660                 goto fake_end;
4661
4662             {
4663                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4664                 if ( max == REG_INFTY || ST.count < max )
4665                     goto curlym_do_A; /* try to match another A */
4666             }
4667             goto curlym_do_B; /* try to match B */
4668
4669         case CURLYM_A_fail: /* just failed to match an A */
4670             REGCP_UNWIND(ST.cp);
4671
4672             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4673                 || (cur_eval && cur_eval->u.eval.close_paren &&
4674                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4675                 sayNO;
4676
4677           curlym_do_B: /* execute the B in /A{m,n}B/  */
4678             PL_reginput = locinput;
4679             if (ST.c1 == CHRTEST_UNINIT) {
4680                 /* calculate c1 and c2 for possible match of 1st char
4681                  * following curly */
4682                 ST.c1 = ST.c2 = CHRTEST_VOID;
4683                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4684                     regnode *text_node = ST.B;
4685                     if (! HAS_TEXT(text_node))
4686                         FIND_NEXT_IMPT(text_node);
4687                     /* this used to be
4688
4689                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4690
4691                         But the former is redundant in light of the latter.
4692
4693                         if this changes back then the macro for
4694                         IS_TEXT and friends need to change.
4695                      */
4696                     if (PL_regkind[OP(text_node)] == EXACT)
4697                     {
4698
4699                         ST.c1 = (U8)*STRING(text_node);
4700                         ST.c2 =
4701                             (IS_TEXTF(text_node))
4702                             ? PL_fold[ST.c1]
4703                             : (IS_TEXTFL(text_node))
4704                                 ? PL_fold_locale[ST.c1]
4705                                 : ST.c1;
4706                     }
4707                 }
4708             }
4709
4710             DEBUG_EXECUTE_r(
4711                 PerlIO_printf(Perl_debug_log,
4712                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4713                     (int)(REPORT_CODE_OFF+(depth*2)),
4714                     "", (IV)ST.count)
4715                 );
4716             if (ST.c1 != CHRTEST_VOID
4717                     && UCHARAT(PL_reginput) != ST.c1
4718                     && UCHARAT(PL_reginput) != ST.c2)
4719             {
4720                 /* simulate B failing */
4721                 DEBUG_OPTIMISE_r(
4722                     PerlIO_printf(Perl_debug_log,
4723                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4724                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4725                         (IV)ST.c1,(IV)ST.c2
4726                 ));
4727                 state_num = CURLYM_B_fail;
4728                 goto reenter_switch;
4729             }
4730
4731             if (ST.me->flags) {
4732                 /* mark current A as captured */
4733                 I32 paren = ST.me->flags;
4734                 if (ST.count) {
4735                     PL_regoffs[paren].start
4736                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4737                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4738                     /*dmq: *PL_reglastcloseparen = paren; */
4739                 }
4740                 else
4741                     PL_regoffs[paren].end = -1;
4742                 if (cur_eval && cur_eval->u.eval.close_paren &&
4743                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4744                 {
4745                     if (ST.count)
4746                         goto fake_end;
4747                     else
4748                         sayNO;
4749                 }
4750             }
4751
4752             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4753             /* NOTREACHED */
4754
4755         case CURLYM_B_fail: /* just failed to match a B */
4756             REGCP_UNWIND(ST.cp);
4757             if (ST.minmod) {
4758                 I32 max = ARG2(ST.me);
4759                 if (max != REG_INFTY && ST.count == max)
4760                     sayNO;
4761                 goto curlym_do_A; /* try to match a further A */
4762             }
4763             /* backtrack one A */
4764             if (ST.count == ARG1(ST.me) /* min */)
4765                 sayNO;
4766             ST.count--;
4767             locinput = HOPc(locinput, -ST.alen);
4768             goto curlym_do_B; /* try to match B */
4769
4770 #undef ST
4771 #define ST st->u.curly
4772
4773 #define CURLY_SETPAREN(paren, success) \
4774     if (paren) { \
4775         if (success) { \
4776             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4777             PL_regoffs[paren].end = locinput - PL_bostr; \
4778             *PL_reglastcloseparen = paren; \
4779         } \
4780         else \
4781             PL_regoffs[paren].end = -1; \
4782     }
4783
4784         case STAR:              /*  /A*B/ where A is width 1 */
4785             ST.paren = 0;
4786             ST.min = 0;
4787             ST.max = REG_INFTY;
4788             scan = NEXTOPER(scan);
4789             goto repeat;
4790         case PLUS:              /*  /A+B/ where A is width 1 */
4791             ST.paren = 0;
4792             ST.min = 1;
4793             ST.max = REG_INFTY;
4794             scan = NEXTOPER(scan);
4795             goto repeat;
4796         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4797             ST.paren = scan->flags;     /* Which paren to set */
4798             if (ST.paren > PL_regsize)
4799                 PL_regsize = ST.paren;
4800             if (ST.paren > *PL_reglastparen)
4801                 *PL_reglastparen = ST.paren;
4802             ST.min = ARG1(scan);  /* min to match */
4803             ST.max = ARG2(scan);  /* max to match */
4804             if (cur_eval && cur_eval->u.eval.close_paren &&
4805                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4806                 ST.min=1;
4807                 ST.max=1;
4808             }
4809             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4810             goto repeat;
4811         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4812             ST.paren = 0;
4813             ST.min = ARG1(scan);  /* min to match */
4814             ST.max = ARG2(scan);  /* max to match */
4815             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4816           repeat:
4817             /*
4818             * Lookahead to avoid useless match attempts
4819             * when we know what character comes next.
4820             *
4821             * Used to only do .*x and .*?x, but now it allows
4822             * for )'s, ('s and (?{ ... })'s to be in the way
4823             * of the quantifier and the EXACT-like node.  -- japhy
4824             */
4825
4826             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4827                 sayNO;
4828             if (HAS_TEXT(next) || JUMPABLE(next)) {
4829                 U8 *s;
4830                 regnode *text_node = next;
4831
4832                 if (! HAS_TEXT(text_node))
4833                     FIND_NEXT_IMPT(text_node);
4834
4835                 if (! HAS_TEXT(text_node))
4836                     ST.c1 = ST.c2 = CHRTEST_VOID;
4837                 else {
4838                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4839                         ST.c1 = ST.c2 = CHRTEST_VOID;
4840                         goto assume_ok_easy;
4841                     }
4842                     else
4843                         s = (U8*)STRING(text_node);
4844
4845                     /*  Currently we only get here when
4846
4847                         PL_rekind[OP(text_node)] == EXACT
4848
4849                         if this changes back then the macro for IS_TEXT and
4850                         friends need to change. */
4851                     if (!UTF) {
4852                         ST.c2 = ST.c1 = *s;
4853                         if (IS_TEXTF(text_node))
4854                             ST.c2 = PL_fold[ST.c1];
4855                         else if (IS_TEXTFL(text_node))
4856                             ST.c2 = PL_fold_locale[ST.c1];
4857                     }
4858                     else { /* UTF */
4859                         if (IS_TEXTF(text_node)) {
4860                              STRLEN ulen1, ulen2;
4861                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4862                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4863
4864                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4865                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4866 #ifdef EBCDIC
4867                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4868                                                     ckWARN(WARN_UTF8) ?
4869                                                     0 : UTF8_ALLOW_ANY);
4870                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4871                                                     ckWARN(WARN_UTF8) ?
4872                                                     0 : UTF8_ALLOW_ANY);
4873 #else
4874                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4875                                                     uniflags);
4876                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4877                                                     uniflags);
4878 #endif
4879                         }
4880                         else {
4881                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4882                                                      uniflags);
4883                         }
4884                     }
4885                 }
4886             }
4887             else
4888                 ST.c1 = ST.c2 = CHRTEST_VOID;
4889         assume_ok_easy:
4890
4891             ST.A = scan;
4892             ST.B = next;
4893             PL_reginput = locinput;
4894             if (minmod) {
4895                 minmod = 0;
4896                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4897                     sayNO;
4898                 ST.count = ST.min;
4899                 locinput = PL_reginput;
4900                 REGCP_SET(ST.cp);
4901                 if (ST.c1 == CHRTEST_VOID)
4902                     goto curly_try_B_min;
4903
4904                 ST.oldloc = locinput;
4905
4906                 /* set ST.maxpos to the furthest point along the
4907                  * string that could possibly match */
4908                 if  (ST.max == REG_INFTY) {
4909                     ST.maxpos = PL_regeol - 1;
4910                     if (do_utf8)
4911                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4912                             ST.maxpos--;
4913                 }
4914                 else if (do_utf8) {
4915                     int m = ST.max - ST.min;
4916                     for (ST.maxpos = locinput;
4917                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4918                         ST.maxpos += UTF8SKIP(ST.maxpos);
4919                 }
4920                 else {
4921                     ST.maxpos = locinput + ST.max - ST.min;
4922                     if (ST.maxpos >= PL_regeol)
4923                         ST.maxpos = PL_regeol - 1;
4924                 }
4925                 goto curly_try_B_min_known;
4926
4927             }
4928             else {
4929                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4930                 locinput = PL_reginput;
4931                 if (ST.count < ST.min)
4932                     sayNO;
4933                 if ((ST.count > ST.min)
4934                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4935                 {
4936                     /* A{m,n} must come at the end of the string, there's
4937                      * no point in backing off ... */
4938                     ST.min = ST.count;
4939                     /* ...except that $ and \Z can match before *and* after
4940                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
4941                        We may back off by one in this case. */
4942                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4943                         ST.min--;
4944                 }
4945                 REGCP_SET(ST.cp);
4946                 goto curly_try_B_max;
4947             }
4948             /* NOTREACHED */
4949
4950
4951         case CURLY_B_min_known_fail:
4952             /* failed to find B in a non-greedy match where c1,c2 valid */
4953             if (ST.paren && ST.count)
4954                 PL_regoffs[ST.paren].end = -1;
4955
4956             PL_reginput = locinput;     /* Could be reset... */
4957             REGCP_UNWIND(ST.cp);
4958             /* Couldn't or didn't -- move forward. */
4959             ST.oldloc = locinput;
4960             if (do_utf8)
4961                 locinput += UTF8SKIP(locinput);
4962             else
4963                 locinput++;
4964             ST.count++;
4965           curly_try_B_min_known:
4966              /* find the next place where 'B' could work, then call B */
4967             {
4968                 int n;
4969                 if (do_utf8) {
4970                     n = (ST.oldloc == locinput) ? 0 : 1;
4971                     if (ST.c1 == ST.c2) {
4972                         STRLEN len;
4973                         /* set n to utf8_distance(oldloc, locinput) */
4974                         while (locinput <= ST.maxpos &&
4975                                utf8n_to_uvchr((U8*)locinput,
4976                                               UTF8_MAXBYTES, &len,
4977                                               uniflags) != (UV)ST.c1) {
4978                             locinput += len;
4979                             n++;
4980                         }
4981                     }
4982                     else {
4983                         /* set n to utf8_distance(oldloc, locinput) */
4984                         while (locinput <= ST.maxpos) {
4985                             STRLEN len;
4986                             const UV c = utf8n_to_uvchr((U8*)locinput,
4987                                                   UTF8_MAXBYTES, &len,
4988                                                   uniflags);
4989                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
4990                                 break;
4991                             locinput += len;
4992                             n++;
4993                         }
4994                     }
4995                 }
4996                 else {
4997                     if (ST.c1 == ST.c2) {
4998                         while (locinput <= ST.maxpos &&
4999                                UCHARAT(locinput) != ST.c1)
5000                             locinput++;
5001                     }
5002                     else {
5003                         while (locinput <= ST.maxpos
5004                                && UCHARAT(locinput) != ST.c1
5005                                && UCHARAT(locinput) != ST.c2)
5006                             locinput++;
5007                     }
5008                     n = locinput - ST.oldloc;
5009                 }
5010                 if (locinput > ST.maxpos)
5011                     sayNO;
5012                 /* PL_reginput == oldloc now */
5013                 if (n) {
5014                     ST.count += n;
5015                     if (regrepeat(rex, ST.A, n, depth) < n)
5016                         sayNO;
5017                 }
5018                 PL_reginput = locinput;
5019                 CURLY_SETPAREN(ST.paren, ST.count);
5020                 if (cur_eval && cur_eval->u.eval.close_paren &&
5021                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5022                     goto fake_end;
5023                 }
5024                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5025             }
5026             /* NOTREACHED */
5027
5028
5029         case CURLY_B_min_fail:
5030             /* failed to find B in a non-greedy match where c1,c2 invalid */
5031             if (ST.paren && ST.count)
5032                 PL_regoffs[ST.paren].end = -1;
5033
5034             REGCP_UNWIND(ST.cp);
5035             /* failed -- move forward one */
5036             PL_reginput = locinput;
5037             if (regrepeat(rex, ST.A, 1, depth)) {
5038                 ST.count++;
5039                 locinput = PL_reginput;
5040                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5041                         ST.count > 0)) /* count overflow ? */
5042                 {
5043                   curly_try_B_min:
5044                     CURLY_SETPAREN(ST.paren, ST.count);
5045                     if (cur_eval && cur_eval->u.eval.close_paren &&
5046                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5047                         goto fake_end;
5048                     }
5049                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5050                 }
5051             }
5052             sayNO;
5053             /* NOTREACHED */
5054
5055
5056         curly_try_B_max:
5057             /* a successful greedy match: now try to match B */
5058             if (cur_eval && cur_eval->u.eval.close_paren &&
5059                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5060                 goto fake_end;
5061             }
5062             {
5063                 UV c = 0;
5064                 if (ST.c1 != CHRTEST_VOID)
5065                     c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5066                                            UTF8_MAXBYTES, 0, uniflags)
5067                                 : (UV) UCHARAT(PL_reginput);
5068                 /* If it could work, try it. */
5069                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5070                     CURLY_SETPAREN(ST.paren, ST.count);
5071                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5072                     /* NOTREACHED */
5073                 }
5074             }
5075             /* FALL THROUGH */
5076         case CURLY_B_max_fail:
5077             /* failed to find B in a greedy match */
5078             if (ST.paren && ST.count)
5079                 PL_regoffs[ST.paren].end = -1;
5080
5081             REGCP_UNWIND(ST.cp);
5082             /*  back up. */
5083             if (--ST.count < ST.min)
5084                 sayNO;
5085             PL_reginput = locinput = HOPc(locinput, -1);
5086             goto curly_try_B_max;
5087
5088 #undef ST
5089
5090         case END:
5091             fake_end:
5092             if (cur_eval) {
5093                 /* we've just finished A in /(??{A})B/; now continue with B */
5094                 I32 tmpix;
5095                 st->u.eval.toggle_reg_flags
5096                             = cur_eval->u.eval.toggle_reg_flags;
5097                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5098
5099                 st->u.eval.prev_rex = rex_sv;           /* inner */
5100                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5101                 rex = (struct regexp *)SvANY(rex_sv);
5102                 rexi = RXi_GET(rex);
5103                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5104                 ReREFCNT_inc(rex_sv);
5105                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5106
5107                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5108                 PL_reglastparen = &rex->lastparen;
5109                 PL_reglastcloseparen = &rex->lastcloseparen;
5110
5111                 REGCP_SET(st->u.eval.lastcp);
5112                 PL_reginput = locinput;
5113
5114                 /* Restore parens of the outer rex without popping the
5115                  * savestack */
5116                 tmpix = PL_savestack_ix;
5117                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5118                 regcppop(rex);
5119                 PL_savestack_ix = tmpix;
5120
5121                 st->u.eval.prev_eval = cur_eval;
5122                 cur_eval = cur_eval->u.eval.prev_eval;
5123                 DEBUG_EXECUTE_r(
5124                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5125                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5126                 if ( nochange_depth )
5127                     nochange_depth--;
5128
5129                 PUSH_YES_STATE_GOTO(EVAL_AB,
5130                         st->u.eval.prev_eval->u.eval.B); /* match B */
5131             }
5132
5133             if (locinput < reginfo->till) {
5134                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5135                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5136                                       PL_colors[4],
5137                                       (long)(locinput - PL_reg_starttry),
5138                                       (long)(reginfo->till - PL_reg_starttry),
5139                                       PL_colors[5]));
5140
5141                 sayNO_SILENT;           /* Cannot match: too short. */
5142             }
5143             PL_reginput = locinput;     /* put where regtry can find it */
5144             sayYES;                     /* Success! */
5145
5146         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5147             DEBUG_EXECUTE_r(
5148             PerlIO_printf(Perl_debug_log,
5149                 "%*s  %ssubpattern success...%s\n",
5150                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5151             PL_reginput = locinput;     /* put where regtry can find it */
5152             sayYES;                     /* Success! */
5153
5154 #undef  ST
5155 #define ST st->u.ifmatch
5156
5157         case SUSPEND:   /* (?>A) */
5158             ST.wanted = 1;
5159             PL_reginput = locinput;
5160             goto do_ifmatch;
5161
5162         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5163             ST.wanted = 0;
5164             goto ifmatch_trivial_fail_test;
5165
5166         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5167             ST.wanted = 1;
5168           ifmatch_trivial_fail_test:
5169             if (scan->flags) {
5170                 char * const s = HOPBACKc(locinput, scan->flags);
5171                 if (!s) {
5172                     /* trivial fail */
5173                     if (logical) {
5174                         logical = 0;
5175                         sw = 1 - cBOOL(ST.wanted);
5176                     }
5177                     else if (ST.wanted)
5178                         sayNO;
5179                     next = scan + ARG(scan);
5180                     if (next == scan)
5181                         next = NULL;
5182                     break;
5183                 }
5184                 PL_reginput = s;
5185             }
5186             else
5187                 PL_reginput = locinput;
5188
5189           do_ifmatch:
5190             ST.me = scan;
5191             ST.logical = logical;
5192             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5193
5194             /* execute body of (?...A) */
5195             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5196             /* NOTREACHED */
5197
5198         case IFMATCH_A_fail: /* body of (?...A) failed */
5199             ST.wanted = !ST.wanted;
5200             /* FALL THROUGH */
5201
5202         case IFMATCH_A: /* body of (?...A) succeeded */
5203             if (ST.logical) {
5204                 sw = cBOOL(ST.wanted);
5205             }
5206             else if (!ST.wanted)
5207                 sayNO;
5208
5209             if (OP(ST.me) == SUSPEND)
5210                 locinput = PL_reginput;
5211             else {
5212                 locinput = PL_reginput = st->locinput;
5213                 nextchr = UCHARAT(locinput);
5214             }
5215             scan = ST.me + ARG(ST.me);
5216             if (scan == ST.me)
5217                 scan = NULL;
5218             continue; /* execute B */
5219
5220 #undef ST
5221
5222         case LONGJMP:
5223             next = scan + ARG(scan);
5224             if (next == scan)
5225                 next = NULL;
5226             break;
5227         case COMMIT:
5228             reginfo->cutpoint = PL_regeol;
5229             /* FALLTHROUGH */
5230         case PRUNE:
5231             PL_reginput = locinput;
5232             if (!scan->flags)
5233                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5234             PUSH_STATE_GOTO(COMMIT_next,next);
5235             /* NOTREACHED */
5236         case COMMIT_next_fail:
5237             no_final = 1;
5238             /* FALLTHROUGH */
5239         case OPFAIL:
5240             sayNO;
5241             /* NOTREACHED */
5242
5243 #define ST st->u.mark
5244         case MARKPOINT:
5245             ST.prev_mark = mark_state;
5246             ST.mark_name = sv_commit = sv_yes_mark
5247                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5248             mark_state = st;
5249             ST.mark_loc = PL_reginput = locinput;
5250             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5251             /* NOTREACHED */
5252         case MARKPOINT_next:
5253             mark_state = ST.prev_mark;
5254             sayYES;
5255             /* NOTREACHED */
5256         case MARKPOINT_next_fail:
5257             if (popmark && sv_eq(ST.mark_name,popmark))
5258             {
5259                 if (ST.mark_loc > startpoint)
5260                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5261                 popmark = NULL; /* we found our mark */
5262                 sv_commit = ST.mark_name;
5263
5264                 DEBUG_EXECUTE_r({
5265                         PerlIO_printf(Perl_debug_log,
5266                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5267                             REPORT_CODE_OFF+depth*2, "",
5268                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5269                 });
5270             }
5271             mark_state = ST.prev_mark;
5272             sv_yes_mark = mark_state ?
5273                 mark_state->u.mark.mark_name : NULL;
5274             sayNO;
5275             /* NOTREACHED */
5276         case SKIP:
5277             PL_reginput = locinput;
5278             if (scan->flags) {
5279                 /* (*SKIP) : if we fail we cut here*/
5280                 ST.mark_name = NULL;
5281                 ST.mark_loc = locinput;
5282                 PUSH_STATE_GOTO(SKIP_next,next);
5283             } else {
5284                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5285                    otherwise do nothing.  Meaning we need to scan
5286                  */
5287                 regmatch_state *cur = mark_state;
5288                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5289
5290                 while (cur) {
5291                     if ( sv_eq( cur->u.mark.mark_name,
5292                                 find ) )
5293                     {
5294                         ST.mark_name = find;
5295                         PUSH_STATE_GOTO( SKIP_next, next );
5296                     }
5297                     cur = cur->u.mark.prev_mark;
5298                 }
5299             }
5300             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5301             break;
5302         case SKIP_next_fail:
5303             if (ST.mark_name) {
5304                 /* (*CUT:NAME) - Set up to search for the name as we
5305                    collapse the stack*/
5306                 popmark = ST.mark_name;
5307             } else {
5308                 /* (*CUT) - No name, we cut here.*/
5309                 if (ST.mark_loc > startpoint)
5310                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5311                 /* but we set sv_commit to latest mark_name if there
5312                    is one so they can test to see how things lead to this
5313                    cut */
5314                 if (mark_state)
5315                     sv_commit=mark_state->u.mark.mark_name;
5316             }
5317             no_final = 1;
5318             sayNO;
5319             /* NOTREACHED */
5320 #undef ST
5321         case FOLDCHAR:
5322             n = ARG(scan);
5323             if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5324                 locinput += ln;
5325             } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5326                 sayNO;
5327             } else  {
5328                 U8 folded[UTF8_MAXBYTES_CASE+1];
5329                 STRLEN foldlen;
5330                 const char * const l = locinput;
5331                 char *e = PL_regeol;
5332                 to_uni_fold(n, folded, &foldlen);
5333
5334                 if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
5335                                l, &e, 0,  do_utf8)) {
5336                         sayNO;
5337                 }
5338                 locinput = e;
5339             }
5340             nextchr = UCHARAT(locinput);
5341             break;
5342         case LNBREAK:
5343             if ((n=is_LNBREAK(locinput,do_utf8))) {
5344                 locinput += n;
5345                 nextchr = UCHARAT(locinput);
5346             } else
5347                 sayNO;
5348             break;
5349
5350 #define CASE_CLASS(nAmE)                              \
5351         case nAmE:                                    \
5352             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5353                 locinput += n;                        \
5354                 nextchr = UCHARAT(locinput);          \
5355             } else                                    \
5356                 sayNO;                                \
5357             break;                                    \
5358         case N##nAmE:                                 \
5359             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5360                 sayNO;                                \
5361             } else {                                  \
5362                 locinput += UTF8SKIP(locinput);       \
5363                 nextchr = UCHARAT(locinput);          \
5364             }                                         \
5365             break
5366
5367         CASE_CLASS(VERTWS);
5368         CASE_CLASS(HORIZWS);
5369 #undef CASE_CLASS
5370
5371         default:
5372             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5373                           PTR2UV(scan), OP(scan));
5374             Perl_croak(aTHX_ "regexp memory corruption");
5375
5376         } /* end switch */
5377
5378         /* switch break jumps here */
5379         scan = next; /* prepare to execute the next op and ... */
5380         continue;    /* ... jump back to the top, reusing st */
5381         /* NOTREACHED */
5382
5383       push_yes_state:
5384         /* push a state that backtracks on success */
5385         st->u.yes.prev_yes_state = yes_state;
5386         yes_state = st;
5387         /* FALL THROUGH */
5388       push_state:
5389         /* push a new regex state, then continue at scan  */
5390         {
5391             regmatch_state *newst;
5392
5393             DEBUG_STACK_r({
5394                 regmatch_state *cur = st;
5395                 regmatch_state *curyes = yes_state;
5396                 int curd = depth;
5397                 regmatch_slab *slab = PL_regmatch_slab;
5398                 for (;curd > -1;cur--,curd--) {
5399                     if (cur < SLAB_FIRST(slab)) {
5400                         slab = slab->prev;
5401                         cur = SLAB_LAST(slab);
5402                     }
5403                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5404                         REPORT_CODE_OFF + 2 + depth * 2,"",
5405                         curd, PL_reg_name[cur->resume_state],
5406                         (curyes == cur) ? "yes" : ""
5407                     );
5408                     if (curyes == cur)
5409                         curyes = cur->u.yes.prev_yes_state;
5410                 }
5411             } else
5412                 DEBUG_STATE_pp("push")
5413             );
5414             depth++;
5415             st->locinput = locinput;
5416             newst = st+1;
5417             if (newst >  SLAB_LAST(PL_regmatch_slab))
5418                 newst = S_push_slab(aTHX);
5419             PL_regmatch_state = newst;
5420
5421             locinput = PL_reginput;
5422             nextchr = UCHARAT(locinput);
5423             st = newst;
5424             continue;
5425             /* NOTREACHED */
5426         }
5427     }
5428
5429     /*
5430     * We get here only if there's trouble -- normally "case END" is
5431     * the terminating point.
5432     */
5433     Perl_croak(aTHX_ "corrupted regexp pointers");
5434     /*NOTREACHED*/
5435     sayNO;
5436
5437 yes:
5438     if (yes_state) {
5439         /* we have successfully completed a subexpression, but we must now
5440          * pop to the state marked by yes_state and continue from there */
5441         assert(st != yes_state);
5442 #ifdef DEBUGGING
5443         while (st != yes_state) {
5444             st--;
5445             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5446                 PL_regmatch_slab = PL_regmatch_slab->prev;
5447                 st = SLAB_LAST(PL_regmatch_slab);
5448             }
5449             DEBUG_STATE_r({
5450                 if (no_final) {
5451                     DEBUG_STATE_pp("pop (no final)");
5452                 } else {
5453                     DEBUG_STATE_pp("pop (yes)");
5454                 }
5455             });
5456             depth--;
5457         }
5458 #else
5459         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5460             || yes_state > SLAB_LAST(PL_regmatch_slab))
5461         {
5462             /* not in this slab, pop slab */
5463             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5464             PL_regmatch_slab = PL_regmatch_slab->prev;
5465             st = SLAB_LAST(PL_regmatch_slab);
5466         }
5467         depth -= (st - yes_state);
5468 #endif
5469         st = yes_state;
5470         yes_state = st->u.yes.prev_yes_state;
5471         PL_regmatch_state = st;
5472
5473         if (no_final) {
5474             locinput= st->locinput;
5475             nextchr = UCHARAT(locinput);
5476         }
5477         state_num = st->resume_state + no_final;
5478         goto reenter_switch;
5479     }
5480
5481     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5482                           PL_colors[4], PL_colors[5]));
5483
5484     if (PL_reg_eval_set) {
5485         /* each successfully executed (?{...}) block does the equivalent of
5486          *   local $^R = do {...}
5487          * When popping the save stack, all these locals would be undone;
5488          * bypass this by setting the outermost saved $^R to the latest
5489          * value */
5490         if (oreplsv != GvSV(PL_replgv))
5491             sv_setsv(oreplsv, GvSV(PL_replgv));
5492     }
5493     result = 1;
5494     goto final_exit;
5495
5496 no:
5497     DEBUG_EXECUTE_r(
5498         PerlIO_printf(Perl_debug_log,
5499             "%*s  %sfailed...%s\n",
5500             REPORT_CODE_OFF+depth*2, "",
5501             PL_colors[4], PL_colors[5])
5502         );
5503
5504 no_silent:
5505     if (no_final) {
5506         if (yes_state) {
5507             goto yes;
5508         } else {
5509             goto final_exit;
5510         }
5511     }
5512     if (depth) {
5513         /* there's a previous state to backtrack to */
5514         st--;
5515         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5516             PL_regmatch_slab = PL_regmatch_slab->prev;
5517             st = SLAB_LAST(PL_regmatch_slab);
5518         }
5519         PL_regmatch_state = st;
5520         locinput= st->locinput;
5521         nextchr = UCHARAT(locinput);
5522
5523         DEBUG_STATE_pp("pop");
5524         depth--;
5525         if (yes_state == st)
5526             yes_state = st->u.yes.prev_yes_state;
5527
5528         state_num = st->resume_state + 1; /* failure = success + 1 */
5529         goto reenter_switch;
5530     }
5531     result = 0;
5532
5533   final_exit:
5534     if (rex->intflags & PREGf_VERBARG_SEEN) {
5535         SV *sv_err = get_sv("REGERROR", 1);
5536         SV *sv_mrk = get_sv("REGMARK", 1);
5537         if (result) {
5538             sv_commit = &PL_sv_no;
5539             if (!sv_yes_mark)
5540                 sv_yes_mark = &PL_sv_yes;
5541         } else {
5542             if (!sv_commit)
5543                 sv_commit = &PL_sv_yes;
5544             sv_yes_mark = &PL_sv_no;
5545         }
5546         sv_setsv(sv_err, sv_commit);
5547         sv_setsv(sv_mrk, sv_yes_mark);
5548     }
5549
5550     /* clean up; in particular, free all slabs above current one */
5551     LEAVE_SCOPE(oldsave);
5552
5553     return result;
5554 }
5555
5556 /*
5557  - regrepeat - repeatedly match something simple, report how many
5558  */
5559 /*
5560  * [This routine now assumes that it will only match on things of length 1.
5561  * That was true before, but now we assume scan - reginput is the count,
5562  * rather than incrementing count on every character.  [Er, except utf8.]]
5563  */
5564 STATIC I32
5565 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5566 {
5567     dVAR;
5568     register char *scan;
5569     register I32 c;
5570     register char *loceol = PL_regeol;
5571     register I32 hardcount = 0;
5572     register bool do_utf8 = PL_reg_match_utf8;
5573 #ifndef DEBUGGING
5574     PERL_UNUSED_ARG(depth);
5575 #endif
5576
5577     PERL_ARGS_ASSERT_REGREPEAT;
5578
5579     scan = PL_reginput;
5580     if (max == REG_INFTY)
5581         max = I32_MAX;
5582     else if (max < loceol - scan)
5583         loceol = scan + max;
5584     switch (OP(p)) {
5585     case REG_ANY:
5586         if (do_utf8) {
5587             loceol = PL_regeol;
5588             while (scan < loceol && hardcount < max && *scan != '\n') {
5589                 scan += UTF8SKIP(scan);
5590                 hardcount++;
5591             }
5592         } else {
5593             while (scan < loceol && *scan != '\n')
5594                 scan++;
5595         }
5596         break;
5597     case SANY:
5598         if (do_utf8) {
5599             loceol = PL_regeol;
5600             while (scan < loceol && hardcount < max) {
5601                 scan += UTF8SKIP(scan);
5602                 hardcount++;
5603             }
5604         }
5605         else
5606             scan = loceol;
5607         break;
5608     case CANY:
5609         scan = loceol;
5610         break;
5611     case EXACT:         /* length of string is 1 */
5612         c = (U8)*STRING(p);
5613         while (scan < loceol && UCHARAT(scan) == c)
5614             scan++;
5615         break;
5616     case EXACTF:        /* length of string is 1 */
5617         c = (U8)*STRING(p);
5618         while (scan < loceol &&
5619                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5620             scan++;
5621         break;
5622     case EXACTFL:       /* length of string is 1 */
5623         PL_reg_flags |= RF_tainted;
5624         c = (U8)*STRING(p);
5625         while (scan < loceol &&
5626                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5627             scan++;
5628         break;
5629     case ANYOF:
5630         if (do_utf8) {
5631             loceol = PL_regeol;
5632             while (hardcount < max && scan < loceol &&
5633                    reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5634                 scan += UTF8SKIP(scan);
5635                 hardcount++;
5636             }
5637         } else {
5638             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5639                 scan++;
5640         }
5641         break;
5642     case ALNUM:
5643         if (do_utf8) {
5644             loceol = PL_regeol;
5645             LOAD_UTF8_CHARCLASS_ALNUM();
5646             while (hardcount < max && scan < loceol &&
5647                    swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5648                 scan += UTF8SKIP(scan);
5649                 hardcount++;
5650             }
5651         } else {
5652             while (scan < loceol && isALNUM(*scan))
5653                 scan++;
5654         }
5655         break;
5656     case ALNUML:
5657         PL_reg_flags |= RF_tainted;
5658         if (do_utf8) {
5659             loceol = PL_regeol;
5660             while (hardcount < max && scan < loceol &&
5661                    isALNUM_LC_utf8((U8*)scan)) {
5662                 scan += UTF8SKIP(scan);
5663                 hardcount++;
5664             }
5665         } else {
5666             while (scan < loceol && isALNUM_LC(*scan))
5667                 scan++;
5668         }
5669         break;
5670     case NALNUM:
5671         if (do_utf8) {
5672             loceol = PL_regeol;
5673             LOAD_UTF8_CHARCLASS_ALNUM();
5674             while (hardcount < max && scan < loceol &&
5675                    !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5676                 scan += UTF8SKIP(scan);
5677                 hardcount++;
5678             }
5679         } else {
5680             while (scan < loceol && !isALNUM(*scan))
5681                 scan++;
5682         }
5683         break;
5684     case NALNUML:
5685         PL_reg_flags |= RF_tainted;
5686         if (do_utf8) {
5687             loceol = PL_regeol;
5688             while (hardcount < max && scan < loceol &&
5689                    !isALNUM_LC_utf8((U8*)scan)) {
5690                 scan += UTF8SKIP(scan);
5691                 hardcount++;
5692             }
5693         } else {
5694             while (scan < loceol && !isALNUM_LC(*scan))
5695                 scan++;
5696         }
5697         break;
5698     case SPACE:
5699         if (do_utf8) {
5700             loceol = PL_regeol;
5701             LOAD_UTF8_CHARCLASS_SPACE();
5702             while (hardcount < max && scan < loceol &&
5703                    (*scan == ' ' ||
5704                     swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5705                 scan += UTF8SKIP(scan);
5706                 hardcount++;
5707             }
5708         } else {
5709             while (scan < loceol && isSPACE(*scan))
5710                 scan++;
5711         }
5712         break;
5713     case SPACEL:
5714         PL_reg_flags |= RF_tainted;
5715         if (do_utf8) {
5716             loceol = PL_regeol;
5717             while (hardcount < max && scan < loceol &&
5718                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5719                 scan += UTF8SKIP(scan);
5720                 hardcount++;
5721             }
5722         } else {
5723             while (scan < loceol && isSPACE_LC(*scan))
5724                 scan++;
5725         }
5726         break;
5727     case NSPACE:
5728         if (do_utf8) {
5729             loceol = PL_regeol;
5730             LOAD_UTF8_CHARCLASS_SPACE();
5731             while (hardcount < max && scan < loceol &&
5732                    !(*scan == ' ' ||
5733                      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5734                 scan += UTF8SKIP(scan);
5735                 hardcount++;
5736             }
5737         } else {
5738             while (scan < loceol && !isSPACE(*scan))
5739                 scan++;
5740         }
5741         break;
5742     case NSPACEL:
5743         PL_reg_flags |= RF_tainted;
5744         if (do_utf8) {
5745             loceol = PL_regeol;
5746             while (hardcount < max && scan < loceol &&
5747                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5748                 scan += UTF8SKIP(scan);
5749                 hardcount++;
5750             }
5751         } else {
5752             while (scan < loceol && !isSPACE_LC(*scan))
5753                 scan++;
5754         }
5755         break;
5756     case DIGIT:
5757         if (do_utf8) {
5758             loceol = PL_regeol;
5759             LOAD_UTF8_CHARCLASS_DIGIT();
5760             while (hardcount < max && scan < loceol &&
5761                    swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5762                 scan += UTF8SKIP(scan);
5763                 hardcount++;
5764             }
5765         } else {
5766             while (scan < loceol && isDIGIT(*scan))
5767                 scan++;
5768         }
5769         break;
5770     case NDIGIT:
5771         if (do_utf8) {
5772             loceol = PL_regeol;
5773             LOAD_UTF8_CHARCLASS_DIGIT();
5774             while (hardcount < max && scan < loceol &&
5775                    !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5776                 scan += UTF8SKIP(scan);
5777                 hardcount++;
5778             }
5779         } else {
5780             while (scan < loceol && !isDIGIT(*scan))
5781                 scan++;
5782         }
5783     case LNBREAK:
5784         if (do_utf8) {
5785             loceol = PL_regeol;
5786             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5787                 scan += c;
5788                 hardcount++;
5789             }
5790         } else {
5791             /*
5792               LNBREAK can match two latin chars, which is ok,
5793               because we have a null terminated string, but we
5794               have to use hardcount in this situation
5795             */
5796             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5797                 scan+=c;
5798                 hardcount++;
5799             }
5800         }
5801         break;
5802     case HORIZWS:
5803         if (do_utf8) {
5804             loceol = PL_regeol;
5805             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5806                 scan += c;
5807                 hardcount++;
5808             }
5809         } else {
5810             while (scan < loceol && is_HORIZWS_latin1(scan))
5811                 scan++;
5812         }
5813         break;
5814     case NHORIZWS:
5815         if (do_utf8) {
5816             loceol = PL_regeol;
5817             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5818                 scan += UTF8SKIP(scan);
5819                 hardcount++;
5820             }
5821         } else {
5822             while (scan < loceol && !is_HORIZWS_latin1(scan))
5823                 scan++;
5824
5825         }
5826         break;
5827     case VERTWS:
5828         if (do_utf8) {
5829             loceol = PL_regeol;
5830             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5831                 scan += c;
5832                 hardcount++;
5833             }
5834         } else {
5835             while (scan < loceol && is_VERTWS_latin1(scan))
5836                 scan++;
5837
5838         }
5839         break;
5840     case NVERTWS:
5841         if (do_utf8) {
5842             loceol = PL_regeol;
5843             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5844                 scan += UTF8SKIP(scan);
5845                 hardcount++;
5846             }
5847         } else {
5848             while (scan < loceol && !is_VERTWS_latin1(scan))
5849                 scan++;
5850
5851         }
5852         break;
5853
5854     default:            /* Called on something of 0 width. */
5855         break;          /* So match right here or not at all. */
5856     }
5857
5858     if (hardcount)
5859         c = hardcount;
5860     else
5861         c = scan - PL_reginput;
5862     PL_reginput = scan;
5863
5864     DEBUG_r({
5865         GET_RE_DEBUG_FLAGS_DECL;
5866         DEBUG_EXECUTE_r({
5867             SV * const prop = sv_newmortal();
5868             regprop(prog, prop, p);
5869             PerlIO_printf(Perl_debug_log,
5870                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
5871                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5872         });
5873     });
5874
5875     return(c);
5876 }
5877
5878
5879 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5880 /*
5881 - regclass_swash - prepare the utf8 swash
5882 */
5883
5884 SV *
5885 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5886 {
5887     dVAR;
5888     SV *sw  = NULL;
5889     SV *si  = NULL;
5890     SV *alt = NULL;
5891     RXi_GET_DECL(prog,progi);
5892     const struct reg_data * const data = prog ? progi->data : NULL;
5893
5894     PERL_ARGS_ASSERT_REGCLASS_SWASH;
5895
5896     if (data && data->count) {
5897         const U32 n = ARG(node);
5898
5899         if (data->what[n] == 's') {
5900             SV * const rv = MUTABLE_SV(data->data[n]);
5901             AV * const av = MUTABLE_AV(SvRV(rv));
5902             SV **const ary = AvARRAY(av);
5903             SV **a, **b;
5904
5905             /* See the end of regcomp.c:S_regclass() for
5906              * documentation of these array elements. */
5907
5908             si = *ary;
5909             a  = SvROK(ary[1]) ? &ary[1] : NULL;
5910             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5911
5912             if (a)
5913                 sw = *a;
5914             else if (si && doinit) {
5915                 sw = swash_init("utf8", "", si, 1, 0);
5916                 (void)av_store(av, 1, sw);
5917             }
5918             if (b)
5919                 alt = *b;
5920         }
5921     }
5922
5923     if (listsvp)
5924         *listsvp = si;
5925     if (altsvp)
5926         *altsvp  = alt;
5927
5928     return sw;
5929 }
5930 #endif
5931
5932 /*
5933  - reginclass - determine if a character falls into a character class
5934
5935   The n is the ANYOF regnode, the p is the target string, lenp
5936   is pointer to the maximum length of how far to go in the p
5937   (if the lenp is zero, UTF8SKIP(p) is used),
5938   do_utf8 tells whether the target string is in UTF-8.
5939
5940  */
5941
5942 STATIC bool
5943 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5944 {
5945     dVAR;
5946     const char flags = ANYOF_FLAGS(n);
5947     bool match = FALSE;
5948     UV c = *p;
5949     STRLEN len = 0;
5950     STRLEN plen;
5951
5952     PERL_ARGS_ASSERT_REGINCLASS;
5953
5954     if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5955         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5956                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5957                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5958                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5959                  * UTF8_ALLOW_FFFF */
5960         if (len == (STRLEN)-1)
5961             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5962     }
5963
5964     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5965     if (do_utf8 || (flags & ANYOF_UNICODE)) {
5966         if (lenp)
5967             *lenp = 0;
5968         if (do_utf8 && !ANYOF_RUNTIME(n)) {
5969             if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5970                 match = TRUE;
5971         }
5972         if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5973             match = TRUE;
5974         if (!match) {
5975             AV *av;
5976             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5977
5978             if (sw) {
5979                 U8 * utf8_p;
5980                 if (do_utf8) {
5981                     utf8_p = (U8 *) p;
5982                 } else {
5983                     STRLEN len = 1;
5984                     utf8_p = bytes_to_utf8(p, &len);
5985                 }
5986                 if (swash_fetch(sw, utf8_p, 1))
5987                     match = TRUE;
5988                 else if (flags & ANYOF_FOLD) {
5989                     if (!match && lenp && av) {
5990                         I32 i;
5991                         for (i = 0; i <= av_len(av); i++) {
5992                             SV* const sv = *av_fetch(av, i, FALSE);
5993                             STRLEN len;
5994                             const char * const s = SvPV_const(sv, len);
5995                             if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
5996                                 *lenp = len;
5997                                 match = TRUE;
5998                                 break;
5999                             }
6000                         }
6001                     }
6002                     if (!match) {
6003                         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6004
6005                         STRLEN tmplen;
6006                         to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6007                         if (swash_fetch(sw, tmpbuf, 1))
6008                             match = TRUE;
6009                     }
6010                 }
6011
6012                 /* If we allocated a string above, free it */
6013                 if (! do_utf8) Safefree(utf8_p);
6014             }
6015         }
6016         if (match && lenp && *lenp == 0)
6017             *lenp = UNISKIP(NATIVE_TO_UNI(c));
6018     }
6019     if (!match && c < 256) {
6020         if (ANYOF_BITMAP_TEST(n, c))
6021             match = TRUE;
6022         else if (flags & ANYOF_FOLD) {
6023             U8 f;
6024
6025             if (flags & ANYOF_LOCALE) {
6026                 PL_reg_flags |= RF_tainted;
6027                 f = PL_fold_locale[c];
6028             }
6029             else
6030                 f = PL_fold[c];
6031             if (f != c && ANYOF_BITMAP_TEST(n, f))
6032                 match = TRUE;
6033         }
6034
6035         if (!match && (flags & ANYOF_CLASS)) {
6036             PL_reg_flags |= RF_tainted;
6037             if (
6038                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6039                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6040                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6041                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6042                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6043                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6044                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6045                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6046                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6047                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6048                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6049                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6050                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6051                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6052                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6053                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6054                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6055                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6056                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6057                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6058                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6059                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6060                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6061                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6062                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6063                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6064                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6065                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6066                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6067                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6068                 ) /* How's that for a conditional? */
6069             {
6070                 match = TRUE;
6071             }
6072         }
6073     }
6074
6075     return (flags & ANYOF_INVERT) ? !match : match;
6076 }
6077
6078 STATIC U8 *
6079 S_reghop3(U8 *s, I32 off, const U8* lim)
6080 {
6081     dVAR;
6082
6083     PERL_ARGS_ASSERT_REGHOP3;
6084
6085     if (off >= 0) {
6086         while (off-- && s < lim) {
6087             /* XXX could check well-formedness here */
6088             s += UTF8SKIP(s);
6089         }
6090     }
6091     else {
6092         while (off++ && s > lim) {
6093             s--;
6094             if (UTF8_IS_CONTINUED(*s)) {
6095                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6096                     s--;
6097             }
6098             /* XXX could check well-formedness here */
6099         }
6100     }
6101     return s;
6102 }
6103
6104 #ifdef XXX_dmq
6105 /* there are a bunch of places where we use two reghop3's that should
6106    be replaced with this routine. but since thats not done yet
6107    we ifdef it out - dmq
6108 */
6109 STATIC U8 *
6110 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6111 {
6112     dVAR;
6113
6114     PERL_ARGS_ASSERT_REGHOP4;
6115
6116     if (off >= 0) {
6117         while (off-- && s < rlim) {
6118             /* XXX could check well-formedness here */
6119             s += UTF8SKIP(s);
6120         }
6121     }
6122     else {
6123         while (off++ && s > llim) {
6124             s--;
6125             if (UTF8_IS_CONTINUED(*s)) {
6126                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6127                     s--;
6128             }
6129             /* XXX could check well-formedness here */
6130         }
6131     }
6132     return s;
6133 }
6134 #endif
6135
6136 STATIC U8 *
6137 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6138 {
6139     dVAR;
6140
6141     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6142
6143     if (off >= 0) {
6144         while (off-- && s < lim) {
6145             /* XXX could check well-formedness here */
6146             s += UTF8SKIP(s);
6147         }
6148         if (off >= 0)
6149             return NULL;
6150     }
6151     else {
6152         while (off++ && s > lim) {
6153             s--;
6154             if (UTF8_IS_CONTINUED(*s)) {
6155                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6156                     s--;
6157             }
6158             /* XXX could check well-formedness here */
6159         }
6160         if (off <= 0)
6161             return NULL;
6162     }
6163     return s;
6164 }
6165
6166 static void
6167 restore_pos(pTHX_ void *arg)
6168 {
6169     dVAR;
6170     regexp * const rex = (regexp *)arg;
6171     if (PL_reg_eval_set) {
6172         if (PL_reg_oldsaved) {
6173             rex->subbeg = PL_reg_oldsaved;
6174             rex->sublen = PL_reg_oldsavedlen;
6175 #ifdef PERL_OLD_COPY_ON_WRITE
6176             rex->saved_copy = PL_nrs;
6177 #endif
6178             RXp_MATCH_COPIED_on(rex);
6179         }
6180         PL_reg_magic->mg_len = PL_reg_oldpos;
6181         PL_reg_eval_set = 0;
6182         PL_curpm = PL_reg_oldcurpm;
6183     }
6184 }
6185
6186 STATIC void
6187 S_to_utf8_substr(pTHX_ register regexp *prog)
6188 {
6189     int i = 1;
6190
6191     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6192
6193     do {
6194         if (prog->substrs->data[i].substr
6195             && !prog->substrs->data[i].utf8_substr) {
6196             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6197             prog->substrs->data[i].utf8_substr = sv;
6198             sv_utf8_upgrade(sv);
6199             if (SvVALID(prog->substrs->data[i].substr)) {
6200                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6201                 if (flags & FBMcf_TAIL) {
6202                     /* Trim the trailing \n that fbm_compile added last
6203                        time.  */
6204                     SvCUR_set(sv, SvCUR(sv) - 1);
6205                     /* Whilst this makes the SV technically "invalid" (as its
6206                        buffer is no longer followed by "\0") when fbm_compile()
6207                        adds the "\n" back, a "\0" is restored.  */
6208                 }
6209                 fbm_compile(sv, flags);
6210             }
6211             if (prog->substrs->data[i].substr == prog->check_substr)
6212                 prog->check_utf8 = sv;
6213         }
6214     } while (i--);
6215 }
6216
6217 STATIC void
6218 S_to_byte_substr(pTHX_ register regexp *prog)
6219 {
6220     dVAR;
6221     int i = 1;
6222
6223     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6224
6225     do {
6226         if (prog->substrs->data[i].utf8_substr
6227             && !prog->substrs->data[i].substr) {
6228             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6229             if (sv_utf8_downgrade(sv, TRUE)) {
6230                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6231                     const U8 flags
6232                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6233                     if (flags & FBMcf_TAIL) {
6234                         /* Trim the trailing \n that fbm_compile added last
6235                            time.  */
6236                         SvCUR_set(sv, SvCUR(sv) - 1);
6237                     }
6238                     fbm_compile(sv, flags);
6239                 }
6240             } else {
6241                 SvREFCNT_dec(sv);
6242                 sv = &PL_sv_undef;
6243             }
6244             prog->substrs->data[i].substr = sv;
6245             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6246                 prog->check_substr = sv;
6247         }
6248     } while (i--);
6249 }
6250
6251 /*
6252  * Local variables:
6253  * c-indentation-style: bsd
6254  * c-basic-offset: 4
6255  * indent-tabs-mode: t
6256  * End:
6257  *
6258  * ex: set ts=8 sts=4 sw=4 noet:
6259  */