regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
   6  */
   7
   8 /* This file contains functions for compiling a regular expression.  See
   9  * also regexec.c which funnily enough, contains functions for executing
  10  * a regular expression.
  11  *
  12  * This file is also copied at build time to ext/re/re_comp.c, where
  13  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  14  * This causes the main functions to be compiled under new names and with
  15  * debugging support added, which makes "use re 'debug'" work.
  16  */
  17
  18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  19  * be confused with the original package (see point 3 below).  Thanks, Henry!
  20  */
  21
  22 /* Additional note: this code is very heavily munged from Henry's version
  23  * in places.  In some spots I've traded clarity for efficiency, so don't
  24  * blame Henry for some of the lack of readability.
  25  */
  26
  27 /* The names of the functions have been changed from regcomp and
  28  * regexec to  pregcomp and pregexec in order to avoid conflicts
  29  * with the POSIX routines of the same names.
  30 */
  31
  32 #ifdef PERL_EXT_RE_BUILD
  33 #include "re_top.h"
  34 #endif
  35
  36 /*
  37  * pregcomp and pregexec -- regsub and regerror are not used in perl
  38  *
  39  *      Copyright (c) 1986 by University of Toronto.
  40  *      Written by Henry Spencer.  Not derived from licensed software.
  41  *
  42  *      Permission is granted to anyone to use this software for any
  43  *      purpose on any computer system, and to redistribute it freely,
  44  *      subject to the following restrictions:
  45  *
  46  *      1. The author is not responsible for the consequences of use of
  47  *              this software, no matter how awful, even if they arise
  48  *              from defects in it.
  49  *
  50  *      2. The origin of this software must not be misrepresented, either
  51  *              by explicit claim or by omission.
  52  *
  53  *      3. Altered versions must be plainly marked as such, and must not
  54  *              be misrepresented as being the original software.
  55  *
  56  *
  57  ****    Alterations to Henry's code are...
  58  ****
  59  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  60  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
  61  ****
  62  ****    You may distribute under the terms of either the GNU General Public
  63  ****    License or the Artistic License, as specified in the README file.
  64
  65  *
  66  * Beware that some of this code is subtly aware of the way operator
  67  * precedence is structured in regular expressions.  Serious changes in
  68  * regular-expression syntax might require a total rethink.
  69  */
  70 #include "EXTERN.h"
  71 #define PERL_IN_REGCOMP_C
  72 #include "perl.h"
  73
  74 #ifndef PERL_IN_XSUB_RE
  75 #  include "INTERN.h"
  76 #endif
  77
  78 #define REG_COMP_C
  79 #ifdef PERL_IN_XSUB_RE
  80 #  include "re_comp.h"
  81 #else
  82 #  include "regcomp.h"
  83 #endif
  84
  85 #ifdef op
  86 #undef op
  87 #endif /* op */
  88
  89 #ifdef MSDOS
  90 #  if defined(BUGGY_MSC6)
  91  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  92 #    pragma optimize("a",off)
  93  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  94 #    pragma optimize("w",on )
  95 #  endif /* BUGGY_MSC6 */
  96 #endif /* MSDOS */
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101 /* State carried through the compilation of one pattern.  The RExC_*
      * accessor macros all dereference a local pointer named pRExC_state to
      * reach these fields.  The #define lines inside the struct body are
      * preprocessor directives, not members; they sit beside the conditionally
      * compiled fields they access. */
 102 typedef struct RExC_state_t {
 103     U32         flags;                  /* are we folding, multilining? */
 104     char        *precomp;               /* uncompiled string. */
 105     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 106     regexp      *rx;                    /* perl core regexp structure */
 107     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 108     char        *start;                 /* Start of input for compile */
 109     char        *end;                   /* End of input for compile */
 110     char        *parse;                 /* Input-scan pointer. */
 111     I32         whilem_seen;            /* number of WHILEM in this expr */
 112     regnode     *emit_start;            /* Start of emitted-code area */
 113     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 114     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 115     I32         naughty;                /* How bad is this pattern? */
 116     I32         sawback;                /* Did we see \1, ...? */
 117     U32         seen;
 118     I32         size;                   /* Code size. */
 119     I32         npar;                   /* Capture buffer count, (OPEN). */
 120     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 121     I32         nestroot;               /* root parens we are in - used by accept */
 122     I32         extralen;
 123     I32         seen_zerolen;
 124     I32         seen_evals;
 125     regnode     **open_parens;          /* pointers to open parens */
 126     regnode     **close_parens;         /* pointers to close parens */
 127     regnode     *opend;                 /* END node in program */
 128     I32         utf8;           /* whether the pattern is utf8 or not */
 129     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 130                                 /* XXX use this for future optimisation of case
 131                                  * where pattern must be upgraded to utf8. */
 132     HV          *charnames;             /* cache of named sequences */
 133     HV          *paren_names;           /* Paren names */
 134
 135     regnode     **recurse;              /* Recurse regops */
 136     I32         recurse_count;          /* Number of recurse regops */
 137 #if ADD_TO_REGEXEC
 138     char        *starttry;              /* -Dr: where regtry was called. */
 139 #define RExC_starttry   (pRExC_state->starttry)
 140 #endif
 141 #ifdef DEBUGGING
 142     const char  *lastparse;
 143     I32         lastnum;
 144     AV          *paren_name_list;       /* idx -> name */
 145 #define RExC_lastparse  (pRExC_state->lastparse)
 146 #define RExC_lastnum    (pRExC_state->lastnum)
 147 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 148 #endif
 149 } RExC_state_t;
 150 /* Field accessors for RExC_state_t.  Each one expands to a member access
      * on pRExC_state, so a variable with that exact name must be in scope
      * wherever these macros (or macros built on them) are used. */
 151 #define RExC_flags      (pRExC_state->flags)
 152 #define RExC_precomp    (pRExC_state->precomp)
 153 #define RExC_rx_sv      (pRExC_state->rx_sv)
 154 #define RExC_rx         (pRExC_state->rx)
 155 #define RExC_rxi        (pRExC_state->rxi)
 156 #define RExC_start      (pRExC_state->start)
 157 #define RExC_end        (pRExC_state->end)
 158 #define RExC_parse      (pRExC_state->parse)
 159 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 160 #ifdef RE_TRACK_PATTERN_OFFSETS
 161 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* unlike the others: reached through rxi, not a direct member */
 162 #endif
 163 #define RExC_emit       (pRExC_state->emit)
 164 #define RExC_emit_start (pRExC_state->emit_start)
 165 #define RExC_emit_bound (pRExC_state->emit_bound)
 166 #define RExC_naughty    (pRExC_state->naughty)
 167 #define RExC_sawback    (pRExC_state->sawback)
 168 #define RExC_seen       (pRExC_state->seen)
 169 #define RExC_size       (pRExC_state->size)
 170 #define RExC_npar       (pRExC_state->npar)
 171 #define RExC_nestroot   (pRExC_state->nestroot)
 172 #define RExC_extralen   (pRExC_state->extralen)
 173 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 174 #define RExC_seen_evals (pRExC_state->seen_evals)
 175 #define RExC_utf8       (pRExC_state->utf8)
 176 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 177 #define RExC_charnames  (pRExC_state->charnames)
 178 #define RExC_open_parens        (pRExC_state->open_parens)
 179 #define RExC_close_parens       (pRExC_state->close_parens)
 180 #define RExC_opend      (pRExC_state->opend)
 181 #define RExC_paren_names        (pRExC_state->paren_names)
 182 #define RExC_recurse    (pRExC_state->recurse)
 183 #define RExC_recurse_count      (pRExC_state->recurse_count)
 184
 185 /* Quantifier tests.  ISMULT1 takes a character value.  ISMULT2 takes a
      * pointer: it dereferences s several times and, when *s is '{', also
      * passes s to regcurly(), so s should be free of side effects. */
 186 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 187 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 188         ((*s) == '{' && regcurly(s)))
 189
 190 #ifdef SPSTART
 191 #undef SPSTART          /* dratted cpp namespace... */
 192 #endif
 193 /*
 194  * Flags to be passed up and down.
      * Apart from WORST (0) each value is a distinct single bit, so they can
      * be OR-ed together.
 195  */
 196 #define WORST           0       /* Worst case. */
 197 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 198 #define SIMPLE          0x02    /* Simple enough to be STAR/PLUS operand. */
 199 #define SPSTART         0x04    /* Starts with * or +. */
 200 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 201 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 202 /* Index of regnode x relative to RExC_emit_start, or -1 when x is NULL. */
 203 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 204
 205 /* whether trie related optimizations are enabled */
 206 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 207 #define TRIE_STUDY_OPT
 208 #define FULL_TRIE_STUDY
 209 #define TRIE_STCLASS
 210 #endif
 211
 212
 213 /* Bit-vector helpers over a byte array: bit number `paren` lives in byte
      * (paren >> 3), at bit position (paren & 7).  PAREN_SET and PAREN_UNSET
      * expand to bare compound-assignment expressions (no outer parentheses),
      * so use them as statements. */
 214 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 215 #define PBITVAL(paren) (1 << ((paren) & 7))
 216 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 217 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 218 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 219
 220
 221 /* About scan_data_t.
 222
 223   During optimisation we recurse through the regexp program performing
 224   various inplace (keyhole style) optimisations. In addition study_chunk
 225   and scan_commit populate this data structure with information about
 226   what strings MUST appear in the pattern. We look for the longest
 227   string that must appear at a fixed location, and we look for the
 228   longest string that may appear at a floating location. So for instance
 229   in the pattern:
 230
 231     /FOO[xX]A.*B[xX]BAR/
 232
 233   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 234   strings (because they follow a .* construct). study_chunk will identify
 235   both FOO and BAR as being the longest fixed and floating strings respectively.
 236
 237   The strings can be composites, for instance
 238
 239      /(f)(o)(o)/
 240
 241   will result in a composite fixed substring 'foo'.
 242
 243   For each string some basic information is maintained:
 244
 245   - offset or min_offset
 246     This is the position the string must appear at, or not before.
 247     It also implicitly (when combined with minlenp) tells us how many
 248     characters must match before the string we are searching for.
 249     Likewise when combined with minlenp and the length of the string
 250     tells us how many characters must appear after the string we have
 251     found.
 252
 253   - max_offset
 254     Only used for floating strings. This is the rightmost point that
 255     the string can appear at. If set to I32_MAX it indicates that the
 256     string can occur infinitely far to the right.
 257
 258   - minlenp
 259     A pointer to the minimum length of the pattern that the string
 260     was found inside. This is important as in the case of positive
 261     lookahead or positive lookbehind we can have multiple patterns
 262     involved. Consider
 263
 264     /(?=FOO).*F/
 265
 266     The minimum length of the pattern overall is 3, the minimum length
 267     of the lookahead part is 3, but the minimum length of the part that
 268     will actually match is 1. So 'FOO's minimum length is 3, but the
 269     minimum length for the F is 1. This is important as the minimum length
 270     is used to determine offsets in front of and behind the string being
 271     looked for.  Since strings can be composites this is the length of the
 272     pattern at the time it was committed with a scan_commit. Note that
 273     the length is calculated by study_chunk, so that the minimum lengths
 274     are not known until the full pattern has been compiled, thus the
 275     pointer to the value.
 276
 277   - lookbehind
 278
 279     In the case of lookbehind the string being searched for can be
 280     offset past the start point of the final matching string.
 281     If this value was just blithely removed from the min_offset it would
 282     invalidate some of the calculations for how many chars must match
 283     before or after (as they are derived from min_offset and minlen and
 284     the length of the string being searched for).
 285     When the final pattern is compiled and the data is moved from the
 286     scan_data_t structure into the regexp structure the information
 287     about lookbehind is factored in, with the information that would
 288     have been lost precalculated in the end_shift field for the
 289     associated string.
 290
 291   The fields pos_min and pos_delta are used to store the minimum offset
 292   and the delta to the maximum offset at the current point in the pattern.
 293
 294 */
 295 /* Working data for the substring/minimum-length scan.  S_scan_commit
      * reads and updates most of these fields. */
 296 typedef struct scan_data_t {
 297     /*I32 len_min;      unused */
 298     /*I32 len_delta;    unused */
 299     I32 pos_min;            /* minimum offset at the current point in the pattern */
 300     I32 pos_delta;          /* delta from pos_min to the maximum offset */
 301     SV *last_found;         /* candidate string; S_scan_commit may copy it to *longest, then empties it */
 302     I32 last_end;           /* min value, <0 unless valid. */
 303     I32 last_start_min;
 304     I32 last_start_max;
 305     SV **longest;           /* Either &longest_fixed, or &longest_float. */
 306     SV *longest_fixed;      /* longest fixed string found in pattern */
 307     I32 offset_fixed;       /* offset where it starts */
 308     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 309     I32 lookbehind_fixed;   /* is the position of the string modified by LB */
 310     SV *longest_float;      /* longest floating string found in pattern */
 311     I32 offset_float_min;   /* earliest point in string it can appear */
 312     I32 offset_float_max;   /* latest point in string it can appear */
 313     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 314     I32 lookbehind_float;   /* is the position of the string modified by LB */
 315     I32 flags;              /* SF_* bits, e.g. SF_BEFORE_EOL; presumably SCF_* bits too - confirm */
 316     I32 whilem_c;
 317     I32 *last_closep;       /* may be NULL (DEBUG_STUDYDATA checks before dereferencing) */
 318     struct regnode_charclass_class *start_class;
 319 } scan_data_t;
 320
 321 /*
 322  * Forward declarations for pregcomp()'s friends.
 323  */
 324 /* All-zero scan_data_t value.  The initialiser is positional, one 0 per
      * member, so it has to be kept in step with the struct's member list. */
 325 static const scan_data_t zero_scan_data =
 326   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 327 /* Flag bits.  The unshifted SF_BEFORE_SEOL/MEOL pair occupies bits 0-1;
      * S_scan_commit copies that pair into the FIX position (shifted by
      * SF_FIX_SHIFT_EOL) or the FL position (shifted by SF_FL_SHIFT_EOL). */
 328 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 329 #define SF_BEFORE_SEOL          0x0001
 330 #define SF_BEFORE_MEOL          0x0002
 331 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 332 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 333 /* Shift counts: 2 and 4 either way; only the spelling depends on NO_UNARY_PLUS. */
 334 #ifdef NO_UNARY_PLUS
 335 #  define SF_FIX_SHIFT_EOL      (0+2)
 336 #  define SF_FL_SHIFT_EOL               (0+4)
 337 #else
 338 #  define SF_FIX_SHIFT_EOL      (+2)
 339 #  define SF_FL_SHIFT_EOL               (+4)
 340 #endif
 341
 342 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) /* 0x04 */
 343 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) /* 0x08 */
 344
 345 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) /* 0x10 */
 346 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 347 #define SF_IS_INF               0x0040
 348 #define SF_HAS_PAR              0x0080
 349 #define SF_IN_PAR               0x0100
 350 #define SF_HAS_EVAL             0x0200
 351 #define SCF_DO_SUBSTR           0x0400
 352 #define SCF_DO_STCLASS_AND      0x0800
 353 #define SCF_DO_STCLASS_OR       0x1000
 354 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 355 #define SCF_WHILEM_VISITED_POS  0x2000
 356
 357 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 358 #define SCF_SEEN_ACCEPT         0x8000
 359 /* UTF, LOC and FOLD test the compile state through RExC_utf8 / RExC_flags,
      * so they too need pRExC_state in scope. */
 360 #define UTF (RExC_utf8 != 0)
 361 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
 362 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
 363
 364 #define OOB_UNICODE             12345678
 365 #define OOB_NAMEDCLASS          -1
 366 /* Length / distance in characters when UTF is true, in bytes otherwise.
      * CHR_DIST does not parenthesise a and b in its non-UTF arm, so pass
      * simple expressions. */
 367 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 368 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 369
 370
 371 /* length of regex to show in messages that don't mark a position within */
 372 #define RegexLengthToShowInErrorMessages 127
 373
 374 /*
 375  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 376  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 377  * op/pragma/warn/regcomp.
 378  */
 379 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 380 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 381 /* Format-string tail shared by the vFAIL and vWARN macros.  It consumes three
      * arguments: an int length and a pointer for "%.*s" (the pattern up to the
      * marked position), then a pointer for "%s" (the rest of the pattern). */
 382 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 383
 384 /*
 385  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 386  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 387  * "...".
      *
      * "If needed" means: only when SIZE_ONLY is false; the destructor
      * registered is clear_re on RExC_rx_sv.
      *
      * `code` is expanded inside the block and may use the two locals set up
      * here: `len` (the pattern length RExC_end - RExC_precomp, reduced to
      * RegexLengthToShowInErrorMessages - 10 when it exceeds the maximum) and
      * `ellipses` ("" or "...").
 388  */
 389 #define _FAIL(code) STMT_START {                                        \
 390     const char *ellipses = "";                                          \
 391     IV len = RExC_end - RExC_precomp;                                   \
 392                                                                         \
 393     if (!SIZE_ONLY)                                                     \
 394         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 395     if (len > RegexLengthToShowInErrorMessages) {                       \
 396         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 397         len = RegexLengthToShowInErrorMessages - 10;                    \
 398         ellipses = "...";                                               \
 399     }                                                                   \
 400     code;                                                               \
 401 } STMT_END
 402 /* FAIL: msg is passed as a "%s" argument, so any string expression works. */
 403 #define FAIL(msg) _FAIL(                            \
 404     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 405             msg, (int)len, RExC_precomp, ellipses))
 406 /* FAIL2: msg is pasted onto the format string, so it must be a string
      * literal containing exactly one conversion, which consumes arg. */
 407 #define FAIL2(msg,arg) _FAIL(                       \
 408     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 409             arg, (int)len, RExC_precomp, ellipses))
 410
 411 /*
 412  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
      *
      * The pattern is split at RExC_parse: the part before it is printed via
      * "%.*s", the part from it onwards via "%s" (see REPORT_LOCATION).  Does
      * not register the clear_re destructor itself; vFAIL does that first.
 413  */
 414 #define Simple_vFAIL(m) STMT_START {                                    \
 415     const IV offset = RExC_parse - RExC_precomp;                        \
 416     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 417             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 418 } STMT_END
 419
 420 /*
 421  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 422  */
 423 #define vFAIL(m) STMT_START {                           \
 424     if (!SIZE_ONLY)                                     \
 425         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 426     Simple_vFAIL(m);                                    \
 427 } STMT_END
 428
 429 /*
 430  * Like Simple_vFAIL(), but accepts two arguments.
      * m and REPORT_LOCATION are handed to S_re_croak2 as two separate strings,
      * followed by a1 and then the three location arguments.
 431  */
 432 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 433     const IV offset = RExC_parse - RExC_precomp;                        \
 434     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 435             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 436 } STMT_END
 437
 438 /*
 439  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 440  */
 441 #define vFAIL2(m,a1) STMT_START {                       \
 442     if (!SIZE_ONLY)                                     \
 443         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 444     Simple_vFAIL2(m, a1);                               \
 445 } STMT_END
 446
 447
 448 /*
 449  * Like Simple_vFAIL(), but accepts three arguments.
 450  */
 451 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 452     const IV offset = RExC_parse - RExC_precomp;                \
 453     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 454             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 455 } STMT_END
 456
 457 /*
 458  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 459  */
 460 #define vFAIL3(m,a1,a2) STMT_START {                    \
 461     if (!SIZE_ONLY)                                     \
 462         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 463     Simple_vFAIL3(m, a1, a2);                           \
 464 } STMT_END
 465
 466 /*
 467  * Like Simple_vFAIL(), but accepts four arguments.
      * No destructor-registering vFAIL4 wrapper is defined alongside it here.
 468  */
 469 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 470     const IV offset = RExC_parse - RExC_precomp;                \
 471     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 472             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 473 } STMT_END
 474 /* Warning counterparts of the vFAIL macros: they call Perl_warner with the
      * WARN_REGEXP category (vWARNdep: WARN_DEPRECATED and WARN_REGEXP) and mark
      * the position `loc`, a pointer into the pattern, instead of RExC_parse.
      * `loc` is used unparenthesised in "loc - RExC_precomp".  vWARN and
      * vWARNdep pass m as a "%s" argument; vWARN2..vWARN5 paste m onto
      * REPORT_LOCATION, so there m must be a string literal whose conversions
      * match a1..a4. */
 475 #define vWARN(loc,m) STMT_START {                                       \
 476     const IV offset = loc - RExC_precomp;                               \
 477     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 478             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 479 } STMT_END
 480
 481 #define vWARNdep(loc,m) STMT_START {                                    \
 482     const IV offset = loc - RExC_precomp;                               \
 483     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
 484             "%s" REPORT_LOCATION,                                       \
 485             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 486 } STMT_END
 487
 488
 489 #define vWARN2(loc, m, a1) STMT_START {                                 \
 490     const IV offset = loc - RExC_precomp;                               \
 491     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 492             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 493 } STMT_END
 494
 495 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 496     const IV offset = loc - RExC_precomp;                               \
 497     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 498             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 499 } STMT_END
 500
 501 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 502     const IV offset = loc - RExC_precomp;                               \
 503     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 504             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 505 } STMT_END
 506
 507 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 508     const IV offset = loc - RExC_precomp;                               \
 509     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 510             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 511 } STMT_END
 512
 513
 514 /* Store c through s unless SIZE_ONLY is true.  Allow for side effects in s:
      * s is evaluated exactly once on either path (the else arm evaluates it and
      * discards the value), whereas c is evaluated only when the store happens. */
 515 #define REGC(c,s) STMT_START {                  \
 516     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 517 } STMT_END
 518
 519 /* Macros for recording node offsets.   20001227 mjd@plover.com
 520  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 521  * element 2*n-1 of the array.  Element #2n holds the byte length of node #n.
 522  * Element 0 holds the number n.
 523  * Position is 1 indexed.
      *
      * Without RE_TRACK_PATTERN_OFFSETS every macro below except ProgLen and
      * SetProgLen expands to nothing, and the program length is kept in
      * ri->u.proglen instead of element 0 of the offsets array.
      *
      * NOTE(review): the *_To_R setters reject only negative node numbers, so
      * node 0 would write element -1 (offset) or element 0 (length) - confirm
      * that callers never pass the first node.
      * NOTE(review): Set_Cur_Node_Length has no empty stub in the non-tracking
      * branch, unlike its siblings.
 524  */
 525 #ifndef RE_TRACK_PATTERN_OFFSETS
 526 #define Set_Node_Offset_To_R(node,byte)
 527 #define Set_Node_Offset(node,byte)
 528 #define Set_Cur_Node_Offset
 529 #define Set_Node_Length_To_R(node,len)
 530 #define Set_Node_Length(node,len)
 531 #define Set_Node_Cur_Length(node)
 532 #define Node_Offset(n)
 533 #define Node_Length(n)
 534 #define Set_Node_Offset_Length(node,offset,len)
 535 #define ProgLen(ri) ri->u.proglen
 536 #define SetProgLen(ri,x) ri->u.proglen = x
 537 #else
 538 #define ProgLen(ri) ri->u.offsets[0]
 539 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 540 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 541     if (! SIZE_ONLY) {                                                  \
 542         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 543                     __LINE__, (int)(node), (int)(byte)));               \
 544         if((node) < 0) {                                                \
 545             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 546         } else {                                                        \
 547             RExC_offsets[2*(node)-1] = (byte);                          \
 548         }                                                               \
 549     }                                                                   \
 550 } STMT_END
 551 /* Pointer forms: node is a regnode pointer, byte a pointer into the input. */
 552 #define Set_Node_Offset(node,byte) \
 553     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 554 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 555
 556 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 557     if (! SIZE_ONLY) {                                                  \
 558         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 559                 __LINE__, (int)(node), (int)(len)));                    \
 560         if((node) < 0) {                                                \
 561             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 562         } else {                                                        \
 563             RExC_offsets[2*(node)] = (len);                             \
 564         }                                                               \
 565     }                                                                   \
 566 } STMT_END
 567
 568 #define Set_Node_Length(node,len) \
 569     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 570 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
      /* Uses a variable named parse_start, which must exist in the caller's scope. */
 571 #define Set_Node_Cur_Length(node) \
 572     Set_Node_Length(node, RExC_parse - parse_start)
 573
 574 /* Get offsets and lengths */
 575 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 576 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 577
 578 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 579     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 580     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 581 } STMT_END
 582 #endif /* RE_TRACK_PATTERN_OFFSETS */
 583
 584 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 585 #define EXPERIMENTAL_INPLACESCAN
 586 #endif /* PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS */
 587 /* Debug dump of a scan_data_t, wrapped in DEBUG_OPTIMISE_MORE_r.  Prints
      * nothing when data is NULL.
      *   str    string literal; it is pasted into the format string
      *   data   scan_data_t pointer
      *   depth  nesting depth; the line is indented by depth*2 columns
      * Also reads a variable named is_inf from the caller's scope (it is not a
      * macro parameter).  When data->last_found is set, longest_fixed and
      * longest_float are handed to SvPVX_const without a NULL check.  The
      * definition ends in ';', so the expansion carries its own semicolon. */
 588 #define DEBUG_STUDYDATA(str,data,depth)                              \
 589 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 590     PerlIO_printf(Perl_debug_log,                                    \
 591         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 592         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 593         (int)(depth)*2, "",                                          \
 594         (IV)((data)->pos_min),                                       \
 595         (IV)((data)->pos_delta),                                     \
 596         (UV)((data)->flags),                                         \
 597         (IV)((data)->whilem_c),                                      \
 598         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 599         is_inf ? "INF " : ""                                         \
 600     );                                                               \
 601     if ((data)->last_found)                                          \
 602         PerlIO_printf(Perl_debug_log,                                \
 603             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 604             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 605             SvPVX_const((data)->last_found),                         \
 606             (IV)((data)->last_end),                                  \
 607             (IV)((data)->last_start_min),                            \
 608             (IV)((data)->last_start_max),                            \
 609             ((data)->longest &&                                      \
 610              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 611             SvPVX_const((data)->longest_fixed),                      \
 612             (IV)((data)->offset_fixed),                              \
 613             ((data)->longest &&                                      \
 614              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 615             SvPVX_const((data)->longest_float),                      \
 616             (IV)((data)->offset_float_min),                          \
 617             (IV)((data)->offset_float_max)                           \
 618         );                                                           \
 619     PerlIO_printf(Perl_debug_log,"\n");                              \
 620 });
 621 /* Forward declaration: clear_re is the callback named in the
      * SAVEDESTRUCTOR_X calls of the FAIL/vFAIL macros. */
 622 static void clear_re(pTHX_ void *r);
 623
 624 /* Mark that we cannot extend a found fixed substring at this point.
 625    Update the longest found anchored substring and the longest found
 626    floating substrings if needed.

        The candidate in data->last_found replaces *data->longest when it is
        strictly longer (lengths measured with CHR_SVLEN), or equally long
        while an SF_BEFORE_EOL bit is set in data->flags.  Whether the fixed or
        the floating slot is updated depends on which SV data->longest
        currently points at.  Afterwards the candidate is always reset:
        last_found is emptied, last_end becomes -1 and the unshifted
        SF_BEFORE_EOL bits are cleared.

        pRExC_state  compile state; used here only through the UTF test in
                     CHR_SVLEN
        data         scan state to read and update
        minlenp      stored as minlen_fixed / minlen_float of the slot updated
        is_inf       when true, the floating maximum offset is set to I32_MAX
                     (also printed by DEBUG_STUDYDATA)
     */
 627
 628 STATIC void
 629 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 630 {
 631     const STRLEN l = CHR_SVLEN(data->last_found);
 632     const STRLEN old_l = CHR_SVLEN(*data->longest);
 633     GET_RE_DEBUG_FLAGS_DECL;
 634
 635     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 636         SvSetMagicSV(*data->longest, data->last_found);
 637         if (*data->longest == data->longest_fixed) {
 638             data->offset_fixed = l ? data->last_start_min : data->pos_min;
             /* move the before-EOL bits into the FIX position, or clear them */
 639             if (data->flags & SF_BEFORE_EOL)
 640                 data->flags
 641                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 642             else
 643                 data->flags &= ~SF_FIX_BEFORE_EOL;
 644             data->minlen_fixed=minlenp;
 645             data->lookbehind_fixed=0;
 646         }
 647         else { /* *data->longest == data->longest_float */
 648             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 649             data->offset_float_max = (l
 650                                       ? data->last_start_max
 651                                       : data->pos_min + data->pos_delta);
             /* the U32 comparison also catches a negative offset_float_max,
                which is then clamped to I32_MAX like the is_inf case */
 652             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 653                 data->offset_float_max = I32_MAX;
 654             if (data->flags & SF_BEFORE_EOL)
 655                 data->flags
 656                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 657             else
 658                 data->flags &= ~SF_FL_BEFORE_EOL;
 659             data->minlen_float=minlenp;
 660             data->lookbehind_float=0;
 661         }
 662     }
         /* reset the candidate string to empty */
 663     SvCUR_set(data->last_found, 0);
 664     {
 665         SV * const sv = data->last_found;
             /* also zero the length recorded in the SV's PERL_MAGIC_utf8
                magic, if it has any */
 666         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 667             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 668             if (mg)
 669                 mg->mg_len = 0;
 670         }
 671     }
 672     data->last_end = -1;
 673     data->flags &= ~SF_BEFORE_EOL;
 674     DEBUG_STUDYDATA("commit: ",data,0);
 675 }
 676
 677 /* Can match anything (initialization) */
 678 STATIC void
 679 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 680 {
 681     ANYOF_CLASS_ZERO(cl);
 682     ANYOF_BITMAP_SETALL(cl);
 683     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 684     if (LOC)
 685         cl->flags |= ANYOF_LOCALE;
 686 }
 687
 688 /* Can match anything (initialization) */
 689 STATIC int
 690 S_cl_is_anything(const struct regnode_charclass_class *cl)
 691 {
 692     int value;
 693
 694     for (value = 0; value <= ANYOF_MAX; value += 2)
 695         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 696             return 1;
 697     if (!(cl->flags & ANYOF_UNICODE_ALL))
 698         return 0;
 699     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 700         return 0;
 701     return 1;
 702 }
 703
 704 /* Can match anything (initialization) */
 705 STATIC void
 706 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 707 {
 708     Zero(cl, 1, struct regnode_charclass_class);
 709     cl->type = ANYOF;
 710     cl_anything(pRExC_state, cl);
 711 }
 712
 713 STATIC void
 714 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 715 {
 716     Zero(cl, 1, struct regnode_charclass_class);
 717     cl->type = ANYOF;
 718     cl_anything(pRExC_state, cl);
 719     if (LOC)
 720         cl->flags |= ANYOF_LOCALE;
 721 }
 722
 723 /* 'And' a given class with another one.  Can create false positives */
 724 /* We assume that cl is not inverted */
 725 STATIC void
 726 S_cl_and(struct regnode_charclass_class *cl,
 727         const struct regnode_charclass_class *and_with)
 728 {
 729
 730     assert(and_with->type == ANYOF);
 731     if (!(and_with->flags & ANYOF_CLASS)
 732         && !(cl->flags & ANYOF_CLASS)
 733         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 734         && !(and_with->flags & ANYOF_FOLD)
 735         && !(cl->flags & ANYOF_FOLD)) {
 736         int i;
 737
 738         if (and_with->flags & ANYOF_INVERT)
 739             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 740                 cl->bitmap[i] &= ~and_with->bitmap[i];
 741         else
 742             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 743                 cl->bitmap[i] &= and_with->bitmap[i];
 744     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 745     if (!(and_with->flags & ANYOF_EOS))
 746         cl->flags &= ~ANYOF_EOS;
 747
 748     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 749         !(and_with->flags & ANYOF_INVERT)) {
 750         cl->flags &= ~ANYOF_UNICODE_ALL;
 751         cl->flags |= ANYOF_UNICODE;
 752         ARG_SET(cl, ARG(and_with));
 753     }
 754     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 755         !(and_with->flags & ANYOF_INVERT))
 756         cl->flags &= ~ANYOF_UNICODE_ALL;
 757     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 758         !(and_with->flags & ANYOF_INVERT))
 759         cl->flags &= ~ANYOF_UNICODE;
 760 }
 761
 762 /* 'OR' a given class with another one.  Can create false positives */
 763 /* We assume that cl is not inverted */
 764 STATIC void
 765 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 766 {
 767     if (or_with->flags & ANYOF_INVERT) {
 768         /* We do not use
 769          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 770          *   <= (B1 | !B2) | (CL1 | !CL2)
 771          * which is wasteful if CL2 is small, but we ignore CL2:
 772          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 773          * XXXX Can we handle case-fold?  Unclear:
 774          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 775          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 776          */
 777         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 778              && !(or_with->flags & ANYOF_FOLD)
 779              && !(cl->flags & ANYOF_FOLD) ) {
 780             int i;
 781
 782             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 783                 cl->bitmap[i] |= ~or_with->bitmap[i];
 784         } /* XXXX: logic is complicated otherwise */
 785         else {
 786             cl_anything(pRExC_state, cl);
 787         }
 788     } else {
 789         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 790         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 791              && (!(or_with->flags & ANYOF_FOLD)
 792                  || (cl->flags & ANYOF_FOLD)) ) {
 793             int i;
 794
 795             /* OR char bitmap and class bitmap separately */
 796             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 797                 cl->bitmap[i] |= or_with->bitmap[i];
 798             if (or_with->flags & ANYOF_CLASS) {
 799                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 800                     cl->classflags[i] |= or_with->classflags[i];
 801                 cl->flags |= ANYOF_CLASS;
 802             }
 803         }
 804         else { /* XXXX: logic is complicated, leave it along for a moment. */
 805             cl_anything(pRExC_state, cl);
 806         }
 807     }
 808     if (or_with->flags & ANYOF_EOS)
 809         cl->flags |= ANYOF_EOS;
 810
 811     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 812         ARG(cl) != ARG(or_with)) {
 813         cl->flags |= ANYOF_UNICODE_ALL;
 814         cl->flags &= ~ANYOF_UNICODE;
 815     }
 816     if (or_with->flags & ANYOF_UNICODE_ALL) {
 817         cl->flags |= ANYOF_UNICODE_ALL;
 818         cl->flags &= ~ANYOF_UNICODE;
 819     }
 820 }
 821
 822 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 823 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 824 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 825 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 826
 827
 828 #ifdef DEBUGGING
 829 /*
 830    dump_trie(trie,widecharmap,revcharmap)
 831    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 832    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 833
 834    These routines dump out a trie in a somewhat readable format.
 835    The _interim_ variants are used for debugging the interim
 836    tables that are used to generate the final compressed
 837    representation which is what dump_trie expects.
 838
   Part of the reason for their existence is to provide a form
   of documentation as to how the different representations function.
 841
 842 */
 843
 844 /*
 845   Dumps the final compressed table form of the trie to Perl_debug_log.
 846   Used for debugging make_trie().
 847 */
 848
 849 STATIC void
 850 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 851             AV *revcharmap, U32 depth)
 852 {
 853     U32 state;
 854     SV *sv=sv_newmortal();
 855     int colwidth= widecharmap ? 6 : 4;
 856     GET_RE_DEBUG_FLAGS_DECL;
 857
 858
 859     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 860         (int)depth * 2 + 2,"",
 861         "Match","Base","Ofs" );
 862
 863     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 864         SV ** const tmp = av_fetch( revcharmap, state, 0);
 865         if ( tmp ) {
 866             PerlIO_printf( Perl_debug_log, "%*s",
 867                 colwidth,
 868                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 869                             PL_colors[0], PL_colors[1],
 870                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 871                             PERL_PV_ESCAPE_FIRSTCHAR
 872                 )
 873             );
 874         }
 875     }
 876     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 877         (int)depth * 2 + 2,"");
 878
 879     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 880         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 881     PerlIO_printf( Perl_debug_log, "\n");
 882
 883     for( state = 1 ; state < trie->statecount ; state++ ) {
 884         const U32 base = trie->states[ state ].trans.base;
 885
 886         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 887
 888         if ( trie->states[ state ].wordnum ) {
 889             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 890         } else {
 891             PerlIO_printf( Perl_debug_log, "%6s", "" );
 892         }
 893
 894         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 895
 896         if ( base ) {
 897             U32 ofs = 0;
 898
 899             while( ( base + ofs  < trie->uniquecharcount ) ||
 900                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 901                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 902                     ofs++;
 903
 904             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 905
 906             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 907                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 908                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 909                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 910                 {
 911                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 912                     colwidth,
 913                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 914                 } else {
 915                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 916                 }
 917             }
 918
 919             PerlIO_printf( Perl_debug_log, "]");
 920
 921         }
 922         PerlIO_printf( Perl_debug_log, "\n" );
 923     }
 924 }
 925 /*
 926   Dumps a fully constructed but uncompressed trie in list form.
 927   List tries normally only are used for construction when the number of
 928   possible chars (trie->uniquecharcount) is very high.
 929   Used for debugging make_trie().
 930 */
 931 STATIC void
 932 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 933                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
 934                          U32 depth)
 935 {
 936     U32 state;
 937     SV *sv=sv_newmortal();
 938     int colwidth= widecharmap ? 6 : 4;
 939     GET_RE_DEBUG_FLAGS_DECL;
 940     /* print out the table precompression.  */
 941     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 942         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 943         "------:-----+-----------------\n" );
 944
 945     for( state=1 ; state < next_alloc ; state ++ ) {
 946         U16 charid;
 947
 948         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
 949             (int)depth * 2 + 2,"", (UV)state  );
 950         if ( ! trie->states[ state ].wordnum ) {
 951             PerlIO_printf( Perl_debug_log, "%5s| ","");
 952         } else {
 953             PerlIO_printf( Perl_debug_log, "W%4x| ",
 954                 trie->states[ state ].wordnum
 955             );
 956         }
 957         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
 958             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
 959             if ( tmp ) {
 960                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
 961                     colwidth,
 962                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 963                             PL_colors[0], PL_colors[1],
 964                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 965                             PERL_PV_ESCAPE_FIRSTCHAR
 966                     ) ,
 967                     TRIE_LIST_ITEM(state,charid).forid,
 968                     (UV)TRIE_LIST_ITEM(state,charid).newstate
 969                 );
 970                 if (!(charid % 10))
 971                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
 972                         (int)((depth * 2) + 14), "");
 973             }
 974         }
 975         PerlIO_printf( Perl_debug_log, "\n");
 976     }
 977 }
 978
 979 /*
 980   Dumps a fully constructed but uncompressed trie in table form.
 981   This is the normal DFA style state transition table, with a few
 982   twists to facilitate compression later.
 983   Used for debugging make_trie().
 984 */
 985 STATIC void
 986 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
 987                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
 988                           U32 depth)
 989 {
 990     U32 state;
 991     U16 charid;
 992     SV *sv=sv_newmortal();
 993     int colwidth= widecharmap ? 6 : 4;
 994     GET_RE_DEBUG_FLAGS_DECL;
 995
 996     /*
 997        print out the table precompression so that we can do a visual check
 998        that they are identical.
 999      */
1000
1001     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1002
1003     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1004         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1005         if ( tmp ) {
1006             PerlIO_printf( Perl_debug_log, "%*s",
1007                 colwidth,
1008                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1009                             PL_colors[0], PL_colors[1],
1010                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1011                             PERL_PV_ESCAPE_FIRSTCHAR
1012                 )
1013             );
1014         }
1015     }
1016
1017     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1018
1019     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1020         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1021     }
1022
1023     PerlIO_printf( Perl_debug_log, "\n" );
1024
1025     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1026
1027         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1028             (int)depth * 2 + 2,"",
1029             (UV)TRIE_NODENUM( state ) );
1030
1031         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1032             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1033             if (v)
1034                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1035             else
1036                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1037         }
1038         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1039             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1040         } else {
1041             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1042             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1043         }
1044     }
1045 }
1046
1047 #endif
1048
1049 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1050   startbranch: the first branch in the whole branch sequence
1051   first      : start branch of sequence of branch-exact nodes.
1052                May be the same as startbranch
1053   last       : Thing following the last branch.
1054                May be the same as tail.
1055   tail       : item following the branch sequence
1056   count      : words in the sequence
1057   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1058   depth      : indent depth
1059
1060 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1061
1062 A trie is an N'ary tree where the branches are determined by digital
1063 decomposition of the key. IE, at the root node you look up the 1st character and
1064 follow that branch repeat until you find the end of the branches. Nodes can be
1065 marked as "accepting" meaning they represent a complete word. Eg:
1066
1067   /he|she|his|hers/
1068
1069 would convert into the following structure. Numbers represent states, letters
1070 following numbers represent valid transitions on the letter from that state, if
1071 the number is in square brackets it represents an accepting state, otherwise it
1072 will be in parenthesis.
1073
1074       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1075       |    |
1076       |   (2)
1077       |    |
1078      (1)   +-i->(6)-+-s->[7]
1079       |
1080       +-s->(3)-+-h->(4)-+-e->[5]
1081
1082       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1083
1084 This shows that when matching against the string 'hers' we will begin at state 1
1085 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1086 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1087 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
single traverse. We store a mapping from each accepting state to the word that
was matched, and then when we have multiple possibilities we try to complete the
rest of the regex in the order in which they occurred in the alternation.
1091
1092 The only prior NFA like behaviour that would be changed by the TRIE support is
1093 the silent ignoring of duplicate alternations which are of the form:
1094
1095  / (DUPE|DUPE) X? (?{ ... }) Y /x
1096
Thus EVAL blocks following a trie may be called a different number of times with
and without the optimisation. With the optimisations dupes will be silently
ignored. This inconsistent behaviour of EVAL type nodes is well established as
the following demonstrates:
1101
1102  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1103
1104 which prints out 'word' three times, but
1105
1106  'words'=~/(word|word|word)(?{ print $1 })S/
1107
which doesn't print it out at all. This is due to other optimisations kicking in.
1109
1110 Example of what happens on a structural level:
1111
The regexp /(ac|ad|ab)+/ will produce the following debug output:
1113
1114    1: CURLYM[1] {1,32767}(18)
1115    5:   BRANCH(8)
1116    6:     EXACT <ac>(16)
1117    8:   BRANCH(11)
1118    9:     EXACT <ad>(16)
1119   11:   BRANCH(14)
1120   12:     EXACT <ab>(16)
1121   16:   SUCCEED(0)
1122   17:   NOTHING(18)
1123   18: END(0)
1124
1125 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1126 and should turn into:
1127
1128    1: CURLYM[1] {1,32767}(18)
1129    5:   TRIE(16)
1130         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1131           <ac>
1132           <ad>
1133           <ab>
1134   16:   SUCCEED(0)
1135   17:   NOTHING(18)
1136   18: END(0)
1137
Cases where tail != last would be like /(?:foo|bar)baz/:
1139
1140    1: BRANCH(4)
1141    2:   EXACT <foo>(8)
1142    4: BRANCH(7)
1143    5:   EXACT <bar>(8)
1144    7: TAIL(8)
1145    8: EXACT <baz>(10)
1146   10: END(0)
1147
1148 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1149 and would end up looking like:
1150
1151     1: TRIE(8)
1152       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1153         <foo>
1154         <bar>
1155    7: TAIL(8)
1156    8: EXACT <baz>(10)
1157   10: END(0)
1158
1159     d = uvuni_to_utf8_flags(d, uv, 0);
1160
1161 is the recommended Unicode-aware way of saying
1162
1163     *(d++) = uv;
1164 */
1165
/* Append to 'revcharmap' a new one-character SV for the code point held in
 * 'uvc'.  Under UTF the low 8 bits of uvc are stored UTF-8 encoded and the
 * SV is flagged as UTF-8; otherwise the single byte (char)uvc is stored.
 * Expects 'uvc' and 'revcharmap' to be in scope where the macro is used. */
#define TRIE_STORE_REVCHAR                                                 \
    STMT_START {                                                           \
        if (!UTF) {                                                        \
            char trie_rev_byte = (char)uvc;                                \
            av_push(revcharmap, newSVpvn(&trie_rev_byte, 1));              \
        } else {                                                           \
            SV *trie_rev_sv = newSV(2);                                    \
            unsigned char *trie_rev_buf                                    \
                = (unsigned char *) SvPVX(trie_rev_sv);                    \
            const unsigned char * const trie_rev_end                       \
                = uvuni_to_utf8(trie_rev_buf, uvc & 0xFF);                 \
            SvCUR_set(trie_rev_sv, trie_rev_end - trie_rev_buf);           \
            SvPOK_on(trie_rev_sv);                                         \
            SvUTF8_on(trie_rev_sv);                                        \
            av_push(revcharmap, trie_rev_sv);                              \
        }                                                                  \
    } STMT_END
1181
/* Read the next character of the word being scanned into 'uvc' and count
 * it in 'wordlen'.  Uses the caller's uc, len, scan, foldlen, foldbuf,
 * folder and uniflags.
 *
 *  - not UTF: one byte is taken from *uc and len is 1.
 *  - UTF, no folding: the character at uc is decoded; len is its length.
 *  - UTF with folding, fold output pending (foldlen > 0): the next folded
 *    character is taken from 'scan' and len is reset to 0, so a caller
 *    stepping with "uc += len" stays on the same source character.
 *  - UTF with folding, nothing pending: the character at uc is decoded and
 *    folded into foldbuf; uvc gets the value returned by to_uni_fold() and
 *    scan/foldlen are left describing whatever follows it in foldbuf. */
#define TRIE_READ_CHAR STMT_START {                                           \
    wordlen++;                                                                \
    if ( !UTF ) {                                                             \
        uvc = (U32)*uc;                                                       \
        len = 1;                                                              \
    }                                                                         \
    else if ( !folder ) {                                                     \
        uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);    \
    }                                                                         \
    else if ( foldlen > 0 ) {                                                 \
        uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );            \
        foldlen -= len;                                                       \
        scan += len;                                                          \
        len = 0;                                                              \
    }                                                                         \
    else {                                                                    \
        uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);    \
        uvc = to_uni_fold( uvc, foldbuf, &foldlen );                          \
        foldlen -= UNISKIP( uvc );                                            \
        scan = foldbuf + UNISKIP( uvc );                                      \
    }                                                                         \
} STMT_END
1205
1206
1207
/* Append the transition (fid -> ns) to the list of 'state'.  When the next
 * free slot has reached the recorded length, the length is doubled and the
 * list reallocated first. */
#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {                    \
    if ( TRIE_LIST_LEN( state ) <= TRIE_LIST_CUR( state ) ) {        \
        const U32 trie_list_grown = (TRIE_LIST_LEN( state ) *= 2);   \
        Renew( trie->states[ state ].trans.list,                     \
               trie_list_grown, reg_trie_trans_le );                 \
    }                                                                \
    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
    TRIE_LIST_CUR( state )++;                                        \
} STMT_END

/* Give 'state' a fresh, zero-filled list of 4 elements.  Element 0 is the
 * header, so the list starts with length 4 and next free slot 1. */
#define TRIE_LIST_NEW(state) STMT_START {                       \
    Newxz( trie->states[ state ].trans.list,                    \
           4, reg_trie_trans_le );                              \
    TRIE_LIST_LEN( state ) = 4;                                 \
    TRIE_LIST_CUR( state ) = 1;                                 \
} STMT_END
1224
/* Record that the word just scanned ends at 'state'.
 *
 * In order: store the word's length (when trie->wordlen exists), keep a
 * copy of the word for dumping under DEBUG_r, advance 'curword', record a
 * jump entry when the node after the word lies before 'tail', and finally
 * either mark 'state' as accepting 'curword' or, if the state already
 * accepts a word, append 'curword' to that word's trie->nextword chain.
 *
 * Relies on these caller variables: trie, noper, cur, wordlen, curword,
 * word_count, tail, convert, jumper, nextbranch and (DEBUG_r only)
 * trie_words. */
#define TRIE_HANDLE_WORD(state) STMT_START {                    \
    regnode * const noper_next = regnext( noper );              \
    U16 dupe = trie->states[ state ].wordnum;                   \
                                                                \
    if (trie->wordlen)                                          \
        trie->wordlen[ curword ] = wordlen;                     \
    DEBUG_r({                                                   \
        /* store the word for dumping */                        \
        SV * const tmp = (OP(noper) != NOTHING)                 \
            ? newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF) \
            : newSVpvn_utf8( "", 0, UTF );                      \
        av_push( trie_words, tmp );                             \
    });                                                         \
                                                                \
    curword++;                                                  \
                                                                \
    if ( noper_next < tail ) {                                  \
        /* the jump table is allocated on first use */          \
        if (!trie->jump)                                        \
            trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
        trie->jump[curword] = (U16)(noper_next - convert);      \
        if (!jumper)                                            \
            jumper = noper_next;                                \
        if (!nextbranch)                                        \
            nextbranch= regnext(cur);                           \
    }                                                           \
                                                                \
    if ( !dupe ) {                                              \
        /* we haven't inserted this word yet.                */ \
        trie->states[ state ].wordnum = curword;                \
    } else {                                                    \
        /* So it's a dupe. This means we need to maintain a   */\
        /* linked-list from the first to the next.            */\
        /* The nextword buffer is only allocated once there   */\
        /* is a dupe, so the first one does the allocation.   */\
        if (!trie->nextword)                                    \
            trie->nextword = (U16 *)                            \
                PerlMemShared_calloc( word_count + 1, sizeof(U16));     \
        /* walk to the end of the existing chain */             \
        for ( ; trie->nextword[dupe] ; dupe = trie->nextword[dupe] )    \
            ;                                                   \
        trie->nextword[dupe]= curword;                          \
    }                                                           \
} STMT_END
1269
1270
/* Look up the transition out of 'state' on character index 'charid' in the
 * compressed table.  Expects 'trie' and 'ubound' to be in scope.
 *
 * The slot base - ucharcount + charid is used only when base + charid lies
 * in [ucharcount, ubound), the slot's 'check' field equals 'state' and its
 * 'next' field is non-zero; the result is then that 'next' value.
 * Otherwise the result is 'special' for state 1 and 0 for any other state.
 *
 * Every parameter is parenthesized so that compound arguments (for example
 * "a ? b : c" or "x | y") are not re-associated by operator precedence.
 * Arguments are still evaluated more than once, so they must be free of
 * side effects. */
#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)               \
     ( ( (base) + (charid) >= (ucharcount)                                   \
         && (base) + (charid) < ubound                                       \
         && (state) == trie->trans[ (base) - (ucharcount) + (charid) ].check \
         && trie->trans[ (base) - (ucharcount) + (charid) ].next )           \
           ? trie->trans[ (base) - (ucharcount) + (charid) ].next            \
           : ( (state) == 1 ? (special) : 0 )                                \
      )

/* Distinct bit values; presumably OR-able result flags reported by
 * make_trie() -- confirm against its return statements. */
#define MADE_TRIE       1
#define MADE_JUMP_TRIE  2
#define MADE_EXACT_TRIE 4
1283
1284 STATIC I32
1285 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1286 {
1287     dVAR;
1288     /* first pass, loop through and scan words */
1289     reg_trie_data *trie;
1290     HV *widecharmap = NULL;
1291     AV *revcharmap = newAV();
1292     regnode *cur;
1293     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1294     STRLEN len = 0;
1295     UV uvc = 0;
1296     U16 curword = 0;
1297     U32 next_alloc = 0;
1298     regnode *jumper = NULL;
1299     regnode *nextbranch = NULL;
1300     regnode *convert = NULL;
1301     /* we just use folder as a flag in utf8 */
1302     const U8 * const folder = ( flags == EXACTF
1303                        ? PL_fold
1304                        : ( flags == EXACTFL
1305                            ? PL_fold_locale
1306                            : NULL
1307                          )
1308                      );
1309
1310 #ifdef DEBUGGING
1311     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1312     AV *trie_words = NULL;
1313     /* along with revcharmap, this only used during construction but both are
1314      * useful during debugging so we store them in the struct when debugging.
1315      */
1316 #else
1317     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1318     STRLEN trie_charcount=0;
1319 #endif
1320     SV *re_trie_maxbuff;
1321     GET_RE_DEBUG_FLAGS_DECL;
1322 #ifndef DEBUGGING
1323     PERL_UNUSED_ARG(depth);
1324 #endif
1325
1326     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1327     trie->refcount = 1;
1328     trie->startstate = 1;
1329     trie->wordcount = word_count;
1330     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1331     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1332     if (!(UTF && folder))
1333         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1334     DEBUG_r({
1335         trie_words = newAV();
1336     });
1337
1338     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1339     if (!SvIOK(re_trie_maxbuff)) {
1340         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1341     }
1342     DEBUG_OPTIMISE_r({
1343                 PerlIO_printf( Perl_debug_log,
1344                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1345                   (int)depth * 2 + 2, "",
1346                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1347                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1348                   (int)depth);
1349     });
1350
1351    /* Find the node we are going to overwrite */
1352     if ( first == startbranch && OP( last ) != BRANCH ) {
1353         /* whole branch chain */
1354         convert = first;
1355     } else {
1356         /* branch sub-chain */
1357         convert = NEXTOPER( first );
1358     }
1359
1360     /*  -- First loop and Setup --
1361
1362        We first traverse the branches and scan each word to determine if it
1363        contains widechars, and how many unique chars there are, this is
1364        important as we have to build a table with at least as many columns as we
1365        have unique chars.
1366
1367        We use an array of integers to represent the character codes 0..255
1368        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1369        native representation of the character value as the key and IV's for the
1370        coded index.
1371
1372        *TODO* If we keep track of how many times each character is used we can
1373        remap the columns so that the table compression later on is more
1374        efficient in terms of memory by ensuring most common value is in the
1375        middle and the least common are on the outside.  IMO this would be better
1376        than a most to least common mapping as theres a decent chance the most
1377        common letter will share a node with the least common, meaning the node
1378        will not be compressable. With a middle is most common approach the worst
1379        case is when we have the least common nodes twice.
1380
1381      */
1382
1383     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1384         regnode * const noper = NEXTOPER( cur );
1385         const U8 *uc = (U8*)STRING( noper );
1386         const U8 * const e  = uc + STR_LEN( noper );
1387         STRLEN foldlen = 0;
1388         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1389         const U8 *scan = (U8*)NULL;
1390         U32 wordlen      = 0;         /* required init */
1391         STRLEN chars = 0;
1392         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1393
1394         if (OP(noper) == NOTHING) {
1395             trie->minlen= 0;
1396             continue;
1397         }
1398         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1399             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1400                                           regardless of encoding */
1401
1402         for ( ; uc < e ; uc += len ) {
1403             TRIE_CHARCOUNT(trie)++;
1404             TRIE_READ_CHAR;
1405             chars++;
1406             if ( uvc < 256 ) {
1407                 if ( !trie->charmap[ uvc ] ) {
1408                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1409                     if ( folder )
1410                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1411                     TRIE_STORE_REVCHAR;
1412                 }
1413                 if ( set_bit ) {
1414                     /* store the codepoint in the bitmap, and if it's ASCII
1415                        also store its folded equivalent. */
1416                     TRIE_BITMAP_SET(trie,uvc);
1417
1418                     /* store the folded codepoint */
1419                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1420
1421                     if ( !UTF ) {
1422                         /* store first byte of utf8 representation of
1423                            codepoints in the 127 < uvc < 256 range */
1424                         if (127 < uvc && uvc < 192) {
1425                             TRIE_BITMAP_SET(trie,194);
1426                         } else if (191 < uvc ) {
1427                             TRIE_BITMAP_SET(trie,195);
1428                         /* && uvc < 256 -- we know uvc is < 256 already */
1429                         }
1430                     }
1431                     set_bit = 0; /* We've done our bit :-) */
1432                 }
1433             } else {
1434                 SV** svpp;
1435                 if ( !widecharmap )
1436                     widecharmap = newHV();
1437
1438                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1439
1440                 if ( !svpp )
1441                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1442
1443                 if ( !SvTRUE( *svpp ) ) {
1444                     sv_setiv( *svpp, ++trie->uniquecharcount );
1445                     TRIE_STORE_REVCHAR;
1446                 }
1447             }
1448         }
1449         if( cur == first ) {
1450             trie->minlen=chars;
1451             trie->maxlen=chars;
1452         } else if (chars < trie->minlen) {
1453             trie->minlen=chars;
1454         } else if (chars > trie->maxlen) {
1455             trie->maxlen=chars;
1456         }
1457
1458     } /* end first pass */
1459     DEBUG_TRIE_COMPILE_r(
1460         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1461                 (int)depth * 2 + 2,"",
1462                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1463                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1464                 (int)trie->minlen, (int)trie->maxlen )
1465     );
1466     trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1467
1468     /*
1469         We now know what we are dealing with in terms of unique chars and
1470         string sizes so we can calculate how much memory a naive
1471         representation using a flat table  will take. If it's over a reasonable
1472         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1473         conservative but potentially much slower representation using an array
1474         of lists.
1475
1476         At the end we convert both representations into the same compressed
1477         form that will be used in regexec.c for matching with. The latter
1478         is a form that cannot be used to construct with but has memory
1479         properties similar to the list form and access properties similar
1480         to the table form making it both suitable for fast searches and
1481         small enough that it's feasible to store for the duration of a program.
1482
1483         See the comment in the code where the compressed table is produced
1484         inplace from the flat table representation for an explanation of how
1485         the compression works.
1486
1487     */
1488
1489
1490     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1491         /*
1492             Second Pass -- Array Of Lists Representation
1493
1494             Each state will be represented by a list of charid:state records
1495             (reg_trie_trans_le) the first such element holds the CUR and LEN
1496             points of the allocated array. (See defines above).
1497
1498             We build the initial structure using the lists, and then convert
1499             it into the compressed table form which allows faster lookups
1500             (but can't be modified once converted).
1501         */
1502
1503         STRLEN transcount = 1;
1504
1505         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1506             "%*sCompiling trie using list compiler\n",
1507             (int)depth * 2 + 2, ""));
1508
1509         trie->states = (reg_trie_state *)
1510             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1511                                   sizeof(reg_trie_state) );
1512         TRIE_LIST_NEW(1);
1513         next_alloc = 2;
1514
1515         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1516
1517             regnode * const noper = NEXTOPER( cur );
1518             U8 *uc           = (U8*)STRING( noper );
1519             const U8 * const e = uc + STR_LEN( noper );
1520             U32 state        = 1;         /* required init */
1521             U16 charid       = 0;         /* sanity init */
1522             U8 *scan         = (U8*)NULL; /* sanity init */
1523             STRLEN foldlen   = 0;         /* required init */
1524             U32 wordlen      = 0;         /* required init */
1525             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1526
1527             if (OP(noper) != NOTHING) {
1528                 for ( ; uc < e ; uc += len ) {
1529
1530                     TRIE_READ_CHAR;
1531
1532                     if ( uvc < 256 ) {
1533                         charid = trie->charmap[ uvc ];
1534                     } else {
1535                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1536                         if ( !svpp ) {
1537                             charid = 0;
1538                         } else {
1539                             charid=(U16)SvIV( *svpp );
1540                         }
1541                     }
1542                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1543                     if ( charid ) {
1544
1545                         U16 check;
1546                         U32 newstate = 0;
1547
1548                         charid--;
1549                         if ( !trie->states[ state ].trans.list ) {
1550                             TRIE_LIST_NEW( state );
1551                         }
1552                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1553                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1554                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1555                                 break;
1556                             }
1557                         }
1558                         if ( ! newstate ) {
1559                             newstate = next_alloc++;
1560                             TRIE_LIST_PUSH( state, charid, newstate );
1561                             transcount++;
1562                         }
1563                         state = newstate;
1564                     } else {
1565                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1566                     }
1567                 }
1568             }
1569             TRIE_HANDLE_WORD(state);
1570
1571         } /* end second pass */
1572
1573         /* next alloc is the NEXT state to be allocated */
1574         trie->statecount = next_alloc;
1575         trie->states = (reg_trie_state *)
1576             PerlMemShared_realloc( trie->states,
1577                                    next_alloc
1578                                    * sizeof(reg_trie_state) );
1579
1580         /* and now dump it out before we compress it */
1581         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1582                                                          revcharmap, next_alloc,
1583                                                          depth+1)
1584         );
1585
1586         trie->trans = (reg_trie_trans *)
1587             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1588         {
1589             U32 state;
1590             U32 tp = 0;
1591             U32 zp = 0;
1592
1593
1594             for( state=1 ; state < next_alloc ; state ++ ) {
1595                 U32 base=0;
1596
1597                 /*
1598                 DEBUG_TRIE_COMPILE_MORE_r(
1599                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1600                 );
1601                 */
1602
1603                 if (trie->states[state].trans.list) {
1604                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1605                     U16 maxid=minid;
1606                     U16 idx;
1607
1608                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1609                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1610                         if ( forid < minid ) {
1611                             minid=forid;
1612                         } else if ( forid > maxid ) {
1613                             maxid=forid;
1614                         }
1615                     }
1616                     if ( transcount < tp + maxid - minid + 1) {
1617                         transcount *= 2;
1618                         trie->trans = (reg_trie_trans *)
1619                             PerlMemShared_realloc( trie->trans,
1620                                                      transcount
1621                                                      * sizeof(reg_trie_trans) );
1622                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1623                     }
1624                     base = trie->uniquecharcount + tp - minid;
1625                     if ( maxid == minid ) {
1626                         U32 set = 0;
1627                         for ( ; zp < tp ; zp++ ) {
1628                             if ( ! trie->trans[ zp ].next ) {
1629                                 base = trie->uniquecharcount + zp - minid;
1630                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1631                                 trie->trans[ zp ].check = state;
1632                                 set = 1;
1633                                 break;
1634                             }
1635                         }
1636                         if ( !set ) {
1637                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1638                             trie->trans[ tp ].check = state;
1639                             tp++;
1640                             zp = tp;
1641                         }
1642                     } else {
1643                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1644                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1645                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1646                             trie->trans[ tid ].check = state;
1647                         }
1648                         tp += ( maxid - minid + 1 );
1649                     }
1650                     Safefree(trie->states[ state ].trans.list);
1651                 }
1652                 /*
1653                 DEBUG_TRIE_COMPILE_MORE_r(
1654                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1655                 );
1656                 */
1657                 trie->states[ state ].trans.base=base;
1658             }
1659             trie->lasttrans = tp + 1;
1660         }
1661     } else {
1662         /*
1663            Second Pass -- Flat Table Representation.
1664
1665            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1666            We know that we will need Charcount+1 trans at most to store the data
1667            (one row per char at worst case) So we preallocate both structures
1668            assuming worst case.
1669
1670            We then construct the trie using only the .next slots of the entry
1671            structs.
1672
1673            We use the .check field of the first entry of the node  temporarily to
1674            make compression both faster and easier by keeping track of how many non
1675            zero fields are in the node.
1676
1677            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1678            transition.
1679
1680            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1681            number representing the first entry of the node, and state as a
1682            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1683            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1684            are 2 entries per node. eg:
1685
1686              A B       A B
1687           1. 2 4    1. 3 7
1688           2. 0 3    3. 0 5
1689           3. 0 0    5. 0 0
1690           4. 0 0    7. 0 0
1691
1692            The table is internally in the right hand, idx form. However as we also
1693            have to deal with the states array which is indexed by nodenum we have to
1694            use TRIE_NODENUM() to convert.
1695
1696         */
1697         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1698             "%*sCompiling trie using table compiler\n",
1699             (int)depth * 2 + 2, ""));
1700
1701         trie->trans = (reg_trie_trans *)
1702             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1703                                   * trie->uniquecharcount + 1,
1704                                   sizeof(reg_trie_trans) );
1705         trie->states = (reg_trie_state *)
1706             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1707                                   sizeof(reg_trie_state) );
1708         next_alloc = trie->uniquecharcount + 1;
1709
1710
1711         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1712
1713             regnode * const noper   = NEXTOPER( cur );
1714             const U8 *uc     = (U8*)STRING( noper );
1715             const U8 * const e = uc + STR_LEN( noper );
1716
1717             U32 state        = 1;         /* required init */
1718
1719             U16 charid       = 0;         /* sanity init */
1720             U32 accept_state = 0;         /* sanity init */
1721             U8 *scan         = (U8*)NULL; /* sanity init */
1722
1723             STRLEN foldlen   = 0;         /* required init */
1724             U32 wordlen      = 0;         /* required init */
1725             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1726
1727             if ( OP(noper) != NOTHING ) {
1728                 for ( ; uc < e ; uc += len ) {
1729
1730                     TRIE_READ_CHAR;
1731
1732                     if ( uvc < 256 ) {
1733                         charid = trie->charmap[ uvc ];
1734                     } else {
1735                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1736                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1737                     }
1738                     if ( charid ) {
1739                         charid--;
1740                         if ( !trie->trans[ state + charid ].next ) {
1741                             trie->trans[ state + charid ].next = next_alloc;
1742                             trie->trans[ state ].check++;
1743                             next_alloc += trie->uniquecharcount;
1744                         }
1745                         state = trie->trans[ state + charid ].next;
1746                     } else {
1747                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1748                     }
1749                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1750                 }
1751             }
1752             accept_state = TRIE_NODENUM( state );
1753             TRIE_HANDLE_WORD(accept_state);
1754
1755         } /* end second pass */
1756
1757         /* and now dump it out before we compress it */
1758         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1759                                                           revcharmap,
1760                                                           next_alloc, depth+1));
1761
1762         {
1763         /*
1764            * Inplace compress the table.*
1765
1766            For sparse data sets the table constructed by the trie algorithm will
1767            be mostly 0/FAIL transitions or to put it another way mostly empty.
1768            (Note that leaf nodes will not contain any transitions.)
1769
1770            This algorithm compresses the tables by eliminating most such
1771            transitions, at the cost of a modest bit of extra work during lookup:
1772
1773            - Each states[] entry contains a .base field which indicates the
1774            index in the trans[] array where its transition data is stored.
1775
1776            - If .base is 0 there are no  valid transitions from that node.
1777
1778            - If .base is nonzero then charid is added to it to find an entry in
1779            the trans array.
1780
1781            -If trans[states[state].base+charid].check!=state then the
1782            transition is taken to be a 0/Fail transition. Thus if there are fail
1783            transitions at the front of the node then the .base offset will point
1784            somewhere inside the previous nodes data (or maybe even into a node
1785            even earlier), but the .check field determines if the transition is
1786            valid.
1787
1788            XXX - wrong maybe?
1789            The following process inplace converts the table to the compressed
1790            table: We first do not compress the root node 1, and mark all its
1791            .check pointers as 1 and set its .base pointer as 1 as well. This
1792            allows to do a DFA construction from the compressed table later, and
1793            ensures that any .base pointers we calculate later are greater than
1794            0.
1795
1796            - We set 'pos' to indicate the first entry of the second node.
1797
1798            - We then iterate over the columns of the node, finding the first and
1799            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1800            and set the .check pointers accordingly, and advance pos
1801            appropriately and repeat for the next node. Note that when we copy
1802            the next pointers we have to convert them from the original
1803            NODEIDX form to NODENUM form as the former is not valid post
1804            compression.
1805
1806            - If a node has no transitions used we mark its base as 0 and do not
1807            advance the pos pointer.
1808
1809            - If a node only has one transition we use a second pointer into the
1810            structure to fill in allocated fail transitions from other states.
1811            This pointer is independent of the main pointer and scans forward
1812            looking for null transitions that are allocated to a state. When it
1813            finds one it writes the single transition into the "hole".  If the
1814            pointer doesnt find one the single transition is appended as normal.
1815
1816            - Once compressed we can Renew/realloc the structures to release the
1817            excess space.
1818
1819            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1820            specifically Fig 3.47 and the associated pseudocode.
1821
1822            demq
1823         */
1824         const U32 laststate = TRIE_NODENUM( next_alloc );
1825         U32 state, charid;
1826         U32 pos = 0, zp=0;
1827         trie->statecount = laststate;
1828
1829         for ( state = 1 ; state < laststate ; state++ ) {
1830             U8 flag = 0;
1831             const U32 stateidx = TRIE_NODEIDX( state );
1832             const U32 o_used = trie->trans[ stateidx ].check;
1833             U32 used = trie->trans[ stateidx ].check;
1834             trie->trans[ stateidx ].check = 0;
1835
1836             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1837                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1838                     if ( trie->trans[ stateidx + charid ].next ) {
1839                         if (o_used == 1) {
1840                             for ( ; zp < pos ; zp++ ) {
1841                                 if ( ! trie->trans[ zp ].next ) {
1842                                     break;
1843                                 }
1844                             }
1845                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1846                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1847                             trie->trans[ zp ].check = state;
1848                             if ( ++zp > pos ) pos = zp;
1849                             break;
1850                         }
1851                         used--;
1852                     }
1853                     if ( !flag ) {
1854                         flag = 1;
1855                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1856                     }
1857                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1858                     trie->trans[ pos ].check = state;
1859                     pos++;
1860                 }
1861             }
1862         }
1863         trie->lasttrans = pos + 1;
1864         trie->states = (reg_trie_state *)
1865             PerlMemShared_realloc( trie->states, laststate
1866                                    * sizeof(reg_trie_state) );
1867         DEBUG_TRIE_COMPILE_MORE_r(
1868                 PerlIO_printf( Perl_debug_log,
1869                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1870                     (int)depth * 2 + 2,"",
1871                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1872                     (IV)next_alloc,
1873                     (IV)pos,
1874                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1875             );
1876
1877         } /* end table compress */
1878     }
1879     DEBUG_TRIE_COMPILE_MORE_r(
1880             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1881                 (int)depth * 2 + 2, "",
1882                 (UV)trie->statecount,
1883                 (UV)trie->lasttrans)
1884     );
1885     /* resize the trans array to remove unused space */
1886     trie->trans = (reg_trie_trans *)
1887         PerlMemShared_realloc( trie->trans, trie->lasttrans
1888                                * sizeof(reg_trie_trans) );
1889
1890     /* and now dump out the compressed format */
1891     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1892
1893     {   /* Modify the program and insert the new TRIE node*/
1894         U8 nodetype =(U8)(flags & 0xFF);
1895         char *str=NULL;
1896
1897 #ifdef DEBUGGING
1898         regnode *optimize = NULL;
1899 #ifdef RE_TRACK_PATTERN_OFFSETS
1900
1901         U32 mjd_offset = 0;
1902         U32 mjd_nodelen = 0;
1903 #endif /* RE_TRACK_PATTERN_OFFSETS */
1904 #endif /* DEBUGGING */
1905         /*
1906            This means we convert either the first branch or the first Exact,
1907            depending on whether the thing following (in 'last') is a branch
1908            or not and whether first is the startbranch (ie is it a sub part of
1909            the alternation or is it the whole thing.)
1910            Assuming it's a sub part we convert the EXACT, otherwise we convert
1911            the whole branch sequence, including the first.
1912          */
1913         /* Find the node we are going to overwrite */
1914         if ( first != startbranch || OP( last ) == BRANCH ) {
1915             /* branch sub-chain */
1916             NEXT_OFF( first ) = (U16)(last - first);
1917 #ifdef RE_TRACK_PATTERN_OFFSETS
1918             DEBUG_r({
1919                 mjd_offset= Node_Offset((convert));
1920                 mjd_nodelen= Node_Length((convert));
1921             });
1922 #endif
1923             /* whole branch chain */
1924         }
1925 #ifdef RE_TRACK_PATTERN_OFFSETS
1926         else {
1927             DEBUG_r({
1928                 const  regnode *nop = NEXTOPER( convert );
1929                 mjd_offset= Node_Offset((nop));
1930                 mjd_nodelen= Node_Length((nop));
1931             });
1932         }
1933         DEBUG_OPTIMISE_r(
1934             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1935                 (int)depth * 2 + 2, "",
1936                 (UV)mjd_offset, (UV)mjd_nodelen)
1937         );
1938 #endif
1939         /* But first we check to see if there is a common prefix we can
1940            split out as an EXACT and put in front of the TRIE node.  */
1941         trie->startstate= 1;
1942         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1943             U32 state;
1944             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1945                 U32 ofs = 0;
1946                 I32 idx = -1;
1947                 U32 count = 0;
1948                 const U32 base = trie->states[ state ].trans.base;
1949
1950                 if ( trie->states[state].wordnum )
1951                         count = 1;
1952
1953                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1954                     if ( ( base + ofs >= trie->uniquecharcount ) &&
1955                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1956                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1957                     {
1958                         if ( ++count > 1 ) {
1959                             SV **tmp = av_fetch( revcharmap, ofs, 0);
1960                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1961                             if ( state == 1 ) break;
1962                             if ( count == 2 ) {
1963                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1964                                 DEBUG_OPTIMISE_r(
1965                                     PerlIO_printf(Perl_debug_log,
1966                                         "%*sNew Start State=%"UVuf" Class: [",
1967                                         (int)depth * 2 + 2, "",
1968                                         (UV)state));
1969                                 if (idx >= 0) {
1970                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
1971                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1972
1973                                     TRIE_BITMAP_SET(trie,*ch);
1974                                     if ( folder )
1975                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
1976                                     DEBUG_OPTIMISE_r(
1977                                         PerlIO_printf(Perl_debug_log, (char*)ch)
1978                                     );
1979                                 }
1980                             }
1981                             TRIE_BITMAP_SET(trie,*ch);
1982                             if ( folder )
1983                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1984                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1985                         }
1986                         idx = ofs;
1987                     }
1988                 }
1989                 if ( count == 1 ) {
1990                     SV **tmp = av_fetch( revcharmap, idx, 0);
1991                     STRLEN len;
1992                     char *ch = SvPV( *tmp, len );
1993                     DEBUG_OPTIMISE_r({
1994                         SV *sv=sv_newmortal();
1995                         PerlIO_printf( Perl_debug_log,
1996                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1997                             (int)depth * 2 + 2, "",
1998                             (UV)state, (UV)idx,
1999                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2000                                 PL_colors[0], PL_colors[1],
2001                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2002                                 PERL_PV_ESCAPE_FIRSTCHAR
2003                             )
2004                         );
2005                     });
2006                     if ( state==1 ) {
2007                         OP( convert ) = nodetype;
2008                         str=STRING(convert);
2009                         STR_LEN(convert)=0;
2010                     }
2011                     STR_LEN(convert) += len;
2012                     while (len--)
2013                         *str++ = *ch++;
2014                 } else {
2015 #ifdef DEBUGGING
2016                     if (state>1)
2017                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2018 #endif
2019                     break;
2020                 }
2021             }
2022             if (str) {
2023                 regnode *n = convert+NODE_SZ_STR(convert);
2024                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2025                 trie->startstate = state;
2026                 trie->minlen -= (state - 1);
2027                 trie->maxlen -= (state - 1);
2028 #ifdef DEBUGGING
2029                /* At least the UNICOS C compiler choked on this
2030                 * being argument to DEBUG_r(), so let's just have
2031                 * it right here. */
2032                if (
2033 #ifdef PERL_EXT_RE_BUILD
2034                    1
2035 #else
2036                    DEBUG_r_TEST
2037 #endif
2038                    ) {
2039                    regnode *fix = convert;
2040                    U32 word = trie->wordcount;
2041                    mjd_nodelen++;
2042                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2043                    while( ++fix < n ) {
2044                        Set_Node_Offset_Length(fix, 0, 0);
2045                    }
2046                    while (word--) {
2047                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2048                        if (tmp) {
2049                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2050                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2051                            else
2052                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2053                        }
2054                    }
2055                }
2056 #endif
2057                 if (trie->maxlen) {
2058                     convert = n;
2059                 } else {
2060                     NEXT_OFF(convert) = (U16)(tail - convert);
2061                     DEBUG_r(optimize= n);
2062                 }
2063             }
2064         }
2065         if (!jumper)
2066             jumper = last;
2067         if ( trie->maxlen ) {
2068             NEXT_OFF( convert ) = (U16)(tail - convert);
2069             ARG_SET( convert, data_slot );
2070             /* Store the offset to the first unabsorbed branch in
2071                jump[0], which is otherwise unused by the jump logic.
2072                We use this when dumping a trie and during optimisation. */
2073             if (trie->jump)
2074                 trie->jump[0] = (U16)(nextbranch - convert);
2075
2076             /* XXXX */
2077             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2078                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2079             {
2080                 OP( convert ) = TRIEC;
2081                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2082                 PerlMemShared_free(trie->bitmap);
2083                 trie->bitmap= NULL;
2084             } else
2085                 OP( convert ) = TRIE;
2086
2087             /* store the type in the flags */
2088             convert->flags = nodetype;
2089             DEBUG_r({
2090             optimize = convert
2091                       + NODE_STEP_REGNODE
2092                       + regarglen[ OP( convert ) ];
2093             });
2094             /* XXX We really should free up the resource in trie now,
2095                    as we won't use them - (which resources?) dmq */
2096         }
2097         /* needed for dumping*/
2098         DEBUG_r(if (optimize) {
2099             regnode *opt = convert;
2100
2101             while ( ++opt < optimize) {
2102                 Set_Node_Offset_Length(opt,0,0);
2103             }
2104             /*
2105                 Try to clean up some of the debris left after the
2106                 optimisation.
2107              */
2108             while( optimize < jumper ) {
2109                 mjd_nodelen += Node_Length((optimize));
2110                 OP( optimize ) = OPTIMIZED;
2111                 Set_Node_Offset_Length(optimize,0,0);
2112                 optimize++;
2113             }
2114             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2115         });
2116     } /* end node insert */
2117     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2118 #ifdef DEBUGGING
2119     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2120     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2121 #else
2122     SvREFCNT_dec(revcharmap);
2123 #endif
2124     return trie->jump
2125            ? MADE_JUMP_TRIE
2126            : trie->startstate>1
2127              ? MADE_EXACT_TRIE
2128              : MADE_TRIE;
2129 }
2130
2131 STATIC void
2132 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2133 {
2134 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2135
2136    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2137    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2138    ISBN 0-201-10088-6
2139
2140    We find the fail state for each state in the trie, this state is the longest proper
2141    suffix of the current states 'word' that is also a proper prefix of another word in our
2142    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2143    the DFA not to have to restart after its tried and failed a word at a given point, it
2144    simply continues as though it had been matching the other word in the first place.
2145    Consider
2146       'abcdgu'=~/abcdefg|cdgu/
2147    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2148    fail, which would bring use to the state representing 'd' in the second word where we would
2149    try 'g' and succeed, prodceding to match 'cdgu'.
2150  */
2151  /* add a fail transition */
2152     const U32 trie_offset = ARG(source);
2153     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2154     U32 *q;
2155     const U32 ucharcount = trie->uniquecharcount;
2156     const U32 numstates = trie->statecount;
2157     const U32 ubound = trie->lasttrans + ucharcount;
2158     U32 q_read = 0;
2159     U32 q_write = 0;
2160     U32 charid;
2161     U32 base = trie->states[ 1 ].trans.base;
2162     U32 *fail;
2163     reg_ac_data *aho;
2164     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2165     GET_RE_DEBUG_FLAGS_DECL;
2166 #ifndef DEBUGGING
2167     PERL_UNUSED_ARG(depth);
2168 #endif
2169
2170
2171     ARG_SET( stclass, data_slot );
2172     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2173     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2174     aho->trie=trie_offset;
2175     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2176     Copy( trie->states, aho->states, numstates, reg_trie_state );
2177     Newxz( q, numstates, U32);
2178     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2179     aho->refcount = 1;
2180     fail = aho->fail;
2181     /* initialize fail[0..1] to be 1 so that we always have
2182        a valid final fail state */
2183     fail[ 0 ] = fail[ 1 ] = 1;
2184
2185     for ( charid = 0; charid < ucharcount ; charid++ ) {
2186         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2187         if ( newstate ) {
2188             q[ q_write ] = newstate;
2189             /* set to point at the root */
2190             fail[ q[ q_write++ ] ]=1;
2191         }
2192     }
2193     while ( q_read < q_write) {
2194         const U32 cur = q[ q_read++ % numstates ];
2195         base = trie->states[ cur ].trans.base;
2196
2197         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2198             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2199             if (ch_state) {
2200                 U32 fail_state = cur;
2201                 U32 fail_base;
2202                 do {
2203                     fail_state = fail[ fail_state ];
2204                     fail_base = aho->states[ fail_state ].trans.base;
2205                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2206
2207                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2208                 fail[ ch_state ] = fail_state;
2209                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2210                 {
2211                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2212                 }
2213                 q[ q_write++ % numstates] = ch_state;
2214             }
2215         }
2216     }
2217     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2218        when we fail in state 1, this allows us to use the
2219        charclass scan to find a valid start char. This is based on the principle
2220        that theres a good chance the string being searched contains lots of stuff
2221        that cant be a start char.
2222      */
2223     fail[ 0 ] = fail[ 1 ] = 0;
2224     DEBUG_TRIE_COMPILE_r({
2225         PerlIO_printf(Perl_debug_log,
2226                       "%*sStclass Failtable (%"UVuf" states): 0",
2227                       (int)(depth * 2), "", (UV)numstates
2228         );
2229         for( q_read=1; q_read<numstates; q_read++ ) {
2230             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2231         }
2232         PerlIO_printf(Perl_debug_log, "\n");
2233     });
2234     Safefree(q);
2235     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2236 }
2237
2238
/*
 * gcc-2.95.2 causes strange code-generation bugs on sparc64.  The
 * workaround is switched on for every GCC older than 2.96 on that
 * platform; revisit this once a newer toolchain becomes available.
 */
#if defined(__sparc64__) && defined(__GNUC__) \
    && (__GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96))
#   undef  SPARC64_GCC_WORKAROUND
#   define SPARC64_GCC_WORKAROUND 1
#endif
2249
/* DEBUG_PEEP(str, scan, depth)
 *
 * Debugging aid for the peephole optimiser.  When 'scan' is non-NULL it
 * prints one line to Perl_debug_log: the label 'str' (which must be a
 * string literal, since it is pasted into the format), the node number of
 * 'scan', its regprop() description and the node number of regnext(scan)
 * (0 when there is none), indented by depth*2 columns.  The whole body is
 * wrapped in DEBUG_OPTIMISE_r().
 *
 * 'depth' is parenthesised so that an expression such as depth+1 is
 * doubled as a whole rather than having only its last term doubled.
 *
 * The expansion deliberately ends in a ';': at least one caller in this
 * file invokes the macro inside braces without a semicolon of its own.
 */
#define DEBUG_PEEP(str,scan,depth) \
    DEBUG_OPTIMISE_r({if (scan){ \
       SV * const mysv=sv_newmortal(); \
       regnode *Next = regnext(scan); \
       regprop(RExC_rx, mysv, (scan)); \
       PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
       (int)(depth)*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
       Next ? (REG_NODE_NUM(Next)) : 0 ); \
   }});
2259
2260
2261
2262
2263
/* JOIN_EXACT(scan, min, flags)
 *
 * Calls join_exact() on 'scan' when its regkind is EXACT, passing NULL as
 * the 'val' argument and depth+1 as the depth; the return value is
 * discarded.  Expects 'pRExC_state' and 'depth' to be in scope at the
 * point of use.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement: a
 * bare 'if' would capture an 'else' that follows the macro at the call
 * site.
 */
#define JOIN_EXACT(scan,min,flags) \
    do { \
        if (PL_regkind[OP(scan)] == EXACT) \
            join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1); \
    } while (0)
2267
2268 STATIC U32
2269 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2270     /* Merge several consecutive EXACTish nodes into one. */
2271     regnode *n = regnext(scan);
2272     U32 stringok = 1;
2273     regnode *next = scan + NODE_SZ_STR(scan);
2274     U32 merged = 0;
2275     U32 stopnow = 0;
2276 #ifdef DEBUGGING
2277     regnode *stop = scan;
2278     GET_RE_DEBUG_FLAGS_DECL;
2279 #else
2280     PERL_UNUSED_ARG(depth);
2281 #endif
2282 #ifndef EXPERIMENTAL_INPLACESCAN
2283     PERL_UNUSED_ARG(flags);
2284     PERL_UNUSED_ARG(val);
2285 #endif
2286     DEBUG_PEEP("join",scan,depth);
2287
2288     /* Skip NOTHING, merge EXACT*. */
2289     while (n &&
2290            ( PL_regkind[OP(n)] == NOTHING ||
2291              (stringok && (OP(n) == OP(scan))))
2292            && NEXT_OFF(n)
2293            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2294
2295         if (OP(n) == TAIL || n > next)
2296             stringok = 0;
2297         if (PL_regkind[OP(n)] == NOTHING) {
2298             DEBUG_PEEP("skip:",n,depth);
2299             NEXT_OFF(scan) += NEXT_OFF(n);
2300             next = n + NODE_STEP_REGNODE;
2301 #ifdef DEBUGGING
2302             if (stringok)
2303                 stop = n;
2304 #endif
2305             n = regnext(n);
2306         }
2307         else if (stringok) {
2308             const unsigned int oldl = STR_LEN(scan);
2309             regnode * const nnext = regnext(n);
2310
2311             DEBUG_PEEP("merg",n,depth);
2312
2313             merged++;
2314             if (oldl + STR_LEN(n) > U8_MAX)
2315                 break;
2316             NEXT_OFF(scan) += NEXT_OFF(n);
2317             STR_LEN(scan) += STR_LEN(n);
2318             next = n + NODE_SZ_STR(n);
2319             /* Now we can overwrite *n : */
2320             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2321 #ifdef DEBUGGING
2322             stop = next - 1;
2323 #endif
2324             n = nnext;
2325             if (stopnow) break;
2326         }
2327
2328 #ifdef EXPERIMENTAL_INPLACESCAN
2329         if (flags && !NEXT_OFF(n)) {
2330             DEBUG_PEEP("atch", val, depth);
2331             if (reg_off_by_arg[OP(n)]) {
2332                 ARG_SET(n, val - n);
2333             }
2334             else {
2335                 NEXT_OFF(n) = val - n;
2336             }
2337             stopnow = 1;
2338         }
2339 #endif
2340     }
2341
2342     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2343     /*
2344     Two problematic code points in Unicode casefolding of EXACT nodes:
2345
2346     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2347     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2348
2349     which casefold to
2350
2351     Unicode                      UTF-8
2352
2353     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2354     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2355
2356     This means that in case-insensitive matching (or "loose matching",
2357     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2358     length of the above casefolded versions) can match a target string
2359     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2360     This would rather mess up the minimum length computation.
2361
2362     What we'll do is to look for the tail four bytes, and then peek
2363     at the preceding two bytes to see whether we need to decrease
2364     the minimum length by four (six minus two).
2365
2366     Thanks to the design of UTF-8, there cannot be false matches:
2367     A sequence of valid UTF-8 bytes cannot be a subsequence of
2368     another valid sequence of UTF-8 bytes.
2369
2370     */
2371          char * const s0 = STRING(scan), *s, *t;
2372          char * const s1 = s0 + STR_LEN(scan) - 1;
2373          char * const s2 = s1 - 4;
2374 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2375          const char t0[] = "\xaf\x49\xaf\x42";
2376 #else
2377          const char t0[] = "\xcc\x88\xcc\x81";
2378 #endif
2379          const char * const t1 = t0 + 3;
2380
2381          for (s = s0 + 2;
2382               s < s2 && (t = ninstr(s, s1, t0, t1));
2383               s = t + 4) {
2384 #ifdef EBCDIC
2385               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2386                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2387 #else
2388               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2389                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2390 #endif
2391                    *min -= 4;
2392          }
2393     }
2394
2395 #ifdef DEBUGGING
2396     /* Allow dumping */
2397     n = scan + NODE_SZ_STR(scan);
2398     while (n <= stop) {
2399         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2400             OP(n) = OPTIMIZED;
2401             NEXT_OFF(n) = 0;
2402         }
2403         n++;
2404     }
2405 #endif
2406     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2407     return stopnow;
2408 }
2409
/* REx optimizer.  Converts nodes into quicker variants "in place".
   Finds fixed substrings.  */
2412
2413 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2414    to the position after last scanned or to NULL. */
2415
/* INIT_AND_WITHP
 *
 * Asserts that 'and_withp' is still NULL, allocates one
 * struct regnode_charclass_class for it with Newx() and hands the pointer
 * to SAVEFREEPV() (presumably so it is released when the enclosing save
 * scope is left -- confirm).  Expects 'and_withp' to be in scope.
 *
 * Wrapped in do { } while (0) so that all three steps stay together when
 * the macro is used as the body of an unbraced if/else or loop.
 */
#define INIT_AND_WITHP \
    do { \
        assert(!and_withp); \
        Newx(and_withp,1,struct regnode_charclass_class); \
        SAVEFREEPV(and_withp); \
    } while (0)
2420
2421 /* this is a chain of data about sub patterns we are processing that
2422    need to be handled seperately/specially in study_chunk. Its so
2423    we can simulate recursion without losing state.  */
2424 struct scan_frame;
2425 typedef struct scan_frame {
2426     regnode *last;  /* last node to process in this frame */
2427     regnode *next;  /* next node to process when last is reached */
2428     struct scan_frame *prev; /*previous frame*/
2429     I32 stop; /* what stopparen do we use */
2430 } scan_frame;
2431
2432
/* SCAN_COMMIT(s, data, m): shorthand for scan_commit() that always passes
   the 'is_inf' variable found in the caller's scope as the last argument. */
#define SCAN_COMMIT(s, data, m) scan_commit((s), (data), (m), is_inf)
2434
/* CASE_SYNST_FNC(nAmE)
 *
 * Expands to the two switch cases 'nAmE' and 'N<nAmE>' (the class and its
 * complement).  Each case walks the code points 0..255 and uses
 * is_<nAmE>_cp() to update the bitmap of data->start_class:
 *
 *   - with SCF_DO_STCLASS_AND set in 'flags', code points that do not
 *     belong to the case's class are cleared from the bitmap;
 *   - otherwise, code points that do belong to it are set.
 *
 * Expects 'flags', 'value' and 'data' to be in scope.  The expansion ends
 * in a 'break' without a semicolon; the caller supplies it.
 */
#define CASE_SYNST_FNC(nAmE)                                            \
case nAmE:                                                              \
    for (value = 0; value < 256; value++) {                             \
        if (flags & SCF_DO_STCLASS_AND) {                               \
            if (!is_ ## nAmE ## _cp(value)) {                           \
                ANYOF_BITMAP_CLEAR(data->start_class, value);           \
            }                                                           \
        }                                                               \
        else if (is_ ## nAmE ## _cp(value)) {                           \
            ANYOF_BITMAP_SET(data->start_class, value);                 \
        }                                                               \
    }                                                                   \
    break;                                                              \
case N ## nAmE:                                                         \
    for (value = 0; value < 256; value++) {                             \
        if (flags & SCF_DO_STCLASS_AND) {                               \
            if (is_ ## nAmE ## _cp(value)) {                            \
                ANYOF_BITMAP_CLEAR(data->start_class, value);           \
            }                                                           \
        }                                                               \
        else if (!is_ ## nAmE ## _cp(value)) {                          \
            ANYOF_BITMAP_SET(data->start_class, value);                 \
        }                                                               \
    }                                                                   \
    break
2460
2461
2462
2463 STATIC I32
2464 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2465                         I32 *minlenp, I32 *deltap,
2466                         regnode *last,
2467                         scan_data_t *data,
2468                         I32 stopparen,
2469                         U8* recursed,
2470                         struct regnode_charclass_class *and_withp,
2471                         U32 flags, U32 depth)
2472                         /* scanp: Start here (read-write). */
2473                         /* deltap: Write maxlen-minlen here. */
2474                         /* last: Stop before this one. */
2475                         /* data: string data about the pattern */
2476                         /* stopparen: treat close N as END */
2477                         /* recursed: which subroutines have we recursed into */
2478                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2479 {
2480     dVAR;
2481     I32 min = 0, pars = 0, code;
2482     regnode *scan = *scanp, *next;
2483     I32 delta = 0;
2484     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2485     int is_inf_internal = 0;            /* The studied chunk is infinite */
2486     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2487     scan_data_t data_fake;
2488     SV *re_trie_maxbuff = NULL;
2489     regnode *first_non_open = scan;
2490     I32 stopmin = I32_MAX;
2491     scan_frame *frame = NULL;
2492
2493     GET_RE_DEBUG_FLAGS_DECL;
2494
2495 #ifdef DEBUGGING
2496     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2497 #endif
2498
2499     if ( depth == 0 ) {
2500         while (first_non_open && OP(first_non_open) == OPEN)
2501             first_non_open=regnext(first_non_open);
2502     }
2503
2504
2505   fake_study_recurse:
2506     while ( scan && OP(scan) != END && scan < last ){
2507         /* Peephole optimizer: */
2508         DEBUG_STUDYDATA("Peep:", data,depth);
2509         DEBUG_PEEP("Peep",scan,depth);
2510         JOIN_EXACT(scan,&min,0);
2511
2512         /* Follow the next-chain of the current node and optimize
2513            away all the NOTHINGs from it.  */
2514         if (OP(scan) != CURLYX) {
2515             const int max = (reg_off_by_arg[OP(scan)]
2516                        ? I32_MAX
2517                        /* I32 may be smaller than U16 on CRAYs! */
2518                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2519             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2520             int noff;
2521             regnode *n = scan;
2522
2523             /* Skip NOTHING and LONGJMP. */
2524             while ((n = regnext(n))
2525                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2526                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2527                    && off + noff < max)
2528                 off += noff;
2529             if (reg_off_by_arg[OP(scan)])
2530                 ARG(scan) = off;
2531             else
2532                 NEXT_OFF(scan) = off;
2533         }
2534
2535
2536
2537         /* The principal pseudo-switch.  Cannot be a switch, since we
2538            look into several different things.  */
2539         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2540                    || OP(scan) == IFTHEN) {
2541             next = regnext(scan);
2542             code = OP(scan);
2543             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2544
2545             if (OP(next) == code || code == IFTHEN) {
2546                 /* NOTE - There is similar code to this block below for handling
2547                    TRIE nodes on a re-study.  If you change stuff here check there
2548                    too. */
2549                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2550                 struct regnode_charclass_class accum;
2551                 regnode * const startbranch=scan;
2552
2553                 if (flags & SCF_DO_SUBSTR)
2554                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2555                 if (flags & SCF_DO_STCLASS)
2556                     cl_init_zero(pRExC_state, &accum);
2557
2558                 while (OP(scan) == code) {
2559                     I32 deltanext, minnext, f = 0, fake;
2560                     struct regnode_charclass_class this_class;
2561
2562                     num++;
2563                     data_fake.flags = 0;
2564                     if (data) {
2565                         data_fake.whilem_c = data->whilem_c;
2566                         data_fake.last_closep = data->last_closep;
2567                     }
2568                     else
2569                         data_fake.last_closep = &fake;
2570
2571                     data_fake.pos_delta = delta;
2572                     next = regnext(scan);
2573                     scan = NEXTOPER(scan);
2574                     if (code != BRANCH)
2575                         scan = NEXTOPER(scan);
2576                     if (flags & SCF_DO_STCLASS) {
2577                         cl_init(pRExC_state, &this_class);
2578                         data_fake.start_class = &this_class;
2579                         f = SCF_DO_STCLASS_AND;
2580                     }
2581                     if (flags & SCF_WHILEM_VISITED_POS)
2582                         f |= SCF_WHILEM_VISITED_POS;
2583
2584                     /* we suppose the run is continuous, last=next...*/
2585                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2586                                           next, &data_fake,
2587                                           stopparen, recursed, NULL, f,depth+1);
2588                     if (min1 > minnext)
2589                         min1 = minnext;
2590                     if (max1 < minnext + deltanext)
2591                         max1 = minnext + deltanext;
2592                     if (deltanext == I32_MAX)
2593                         is_inf = is_inf_internal = 1;
2594                     scan = next;
2595                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2596                         pars++;
2597                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2598                         if ( stopmin > minnext)
2599                             stopmin = min + min1;
2600                         flags &= ~SCF_DO_SUBSTR;
2601                         if (data)
2602                             data->flags |= SCF_SEEN_ACCEPT;
2603                     }
2604                     if (data) {
2605                         if (data_fake.flags & SF_HAS_EVAL)
2606                             data->flags |= SF_HAS_EVAL;
2607                         data->whilem_c = data_fake.whilem_c;
2608                     }
2609                     if (flags & SCF_DO_STCLASS)
2610                         cl_or(pRExC_state, &accum, &this_class);
2611                 }
2612                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2613                     min1 = 0;
2614                 if (flags & SCF_DO_SUBSTR) {
2615                     data->pos_min += min1;
2616                     data->pos_delta += max1 - min1;
2617                     if (max1 != min1 || is_inf)
2618                         data->longest = &(data->longest_float);
2619                 }
2620                 min += min1;
2621                 delta += max1 - min1;
2622                 if (flags & SCF_DO_STCLASS_OR) {
2623                     cl_or(pRExC_state, data->start_class, &accum);
2624                     if (min1) {
2625                         cl_and(data->start_class, and_withp);
2626                         flags &= ~SCF_DO_STCLASS;
2627                     }
2628                 }
2629                 else if (flags & SCF_DO_STCLASS_AND) {
2630                     if (min1) {
2631                         cl_and(data->start_class, &accum);
2632                         flags &= ~SCF_DO_STCLASS;
2633                     }
2634                     else {
2635                         /* Switch to OR mode: cache the old value of
2636                          * data->start_class */
2637                         INIT_AND_WITHP;
2638                         StructCopy(data->start_class, and_withp,
2639                                    struct regnode_charclass_class);
2640                         flags &= ~SCF_DO_STCLASS_AND;
2641                         StructCopy(&accum, data->start_class,
2642                                    struct regnode_charclass_class);
2643                         flags |= SCF_DO_STCLASS_OR;
2644                         data->start_class->flags |= ANYOF_EOS;
2645                     }
2646                 }
2647
2648                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2649                 /* demq.
2650
2651                    Assuming this was/is a branch we are dealing with: 'scan' now
2652                    points at the item that follows the branch sequence, whatever
2653                    it is. We now start at the beginning of the sequence and look
2654                    for subsequences of
2655
2656                    BRANCH->EXACT=>x1
2657                    BRANCH->EXACT=>x2
2658                    tail
2659
2660                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2661
                   If we can find such a subsequence we need to turn the first
                   element into a trie and then add the subsequent branch exact
                   strings to the trie.
2665
2666                    We have two cases
2667
2668                      1. patterns where the whole set of branch can be converted.
2669
2670                      2. patterns where only a subset can be converted.
2671
2672                    In case 1 we can replace the whole set with a single regop
2673                    for the trie. In case 2 we need to keep the start and end
2674                    branchs so
2675
2676                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2677                      becomes BRANCH TRIE; BRANCH X;
2678
2679                   There is an additional case, that being where there is a
2680                   common prefix, which gets split out into an EXACT like node
2681                   preceding the TRIE node.
2682
                  If x(1..n)==tail then we can do a simple trie, if not we make
                  a "jump" trie, such that when we match the appropriate word
                  we "jump" to the appropriate tail node. Essentially we turn
                  a nested if into a case structure of sorts.
2687
2688                 */
2689
2690                     int made=0;
2691                     if (!re_trie_maxbuff) {
2692                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2693                         if (!SvIOK(re_trie_maxbuff))
2694                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2695                     }
2696                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2697                         regnode *cur;
2698                         regnode *first = (regnode *)NULL;
2699                         regnode *last = (regnode *)NULL;
2700                         regnode *tail = scan;
2701                         U8 optype = 0;
2702                         U32 count=0;
2703
2704 #ifdef DEBUGGING
2705                         SV * const mysv = sv_newmortal();       /* for dumping */
2706 #endif
2707                         /* var tail is used because there may be a TAIL
2708                            regop in the way. Ie, the exacts will point to the
2709                            thing following the TAIL, but the last branch will
2710                            point at the TAIL. So we advance tail. If we
2711                            have nested (?:) we may have to move through several
2712                            tails.
2713                          */
2714
2715                         while ( OP( tail ) == TAIL ) {
2716                             /* this is the TAIL generated by (?:) */
2717                             tail = regnext( tail );
2718                         }
2719
2720
2721                         DEBUG_OPTIMISE_r({
2722                             regprop(RExC_rx, mysv, tail );
2723                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2724                                 (int)depth * 2 + 2, "",
2725                                 "Looking for TRIE'able sequences. Tail node is: ",
2726                                 SvPV_nolen_const( mysv )
2727                             );
2728                         });
2729
2730                         /*
2731
2732                            step through the branches, cur represents each
2733                            branch, noper is the first thing to be matched
2734                            as part of that branch and noper_next is the
2735                            regnext() of that node. if noper is an EXACT
2736                            and noper_next is the same as scan (our current
2737                            position in the regex) then the EXACT branch is
2738                            a possible optimization target. Once we have
2739                            two or more consequetive such branches we can
2740                            create a trie of the EXACT's contents and stich
2741                            it in place. If the sequence represents all of
2742                            the branches we eliminate the whole thing and
2743                            replace it with a single TRIE. If it is a
2744                            subsequence then we need to stitch it in. This
2745                            means the first branch has to remain, and needs
2746                            to be repointed at the item on the branch chain
2747                            following the last branch optimized. This could
2748                            be either a BRANCH, in which case the
2749                            subsequence is internal, or it could be the
2750                            item following the branch sequence in which
2751                            case the subsequence is at the end.
2752
2753                         */
2754
2755                         /* dont use tail as the end marker for this traverse */
2756                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2757                             regnode * const noper = NEXTOPER( cur );
2758 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2759                             regnode * const noper_next = regnext( noper );
2760 #endif
2761
2762                             DEBUG_OPTIMISE_r({
2763                                 regprop(RExC_rx, mysv, cur);
2764                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2765                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2766
2767                                 regprop(RExC_rx, mysv, noper);
2768                                 PerlIO_printf( Perl_debug_log, " -> %s",
2769                                     SvPV_nolen_const(mysv));
2770
2771                                 if ( noper_next ) {
2772                                   regprop(RExC_rx, mysv, noper_next );
2773                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2774                                     SvPV_nolen_const(mysv));
2775                                 }
2776                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2777                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2778                             });
2779                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2780                                          : PL_regkind[ OP( noper ) ] == EXACT )
2781                                   || OP(noper) == NOTHING )
2782 #ifdef NOJUMPTRIE
2783                                   && noper_next == tail
2784 #endif
2785                                   && count < U16_MAX)
2786                             {
2787                                 count++;
2788                                 if ( !first || optype == NOTHING ) {
2789                                     if (!first) first = cur;
2790                                     optype = OP( noper );
2791                                 } else {
2792                                     last = cur;
2793                                 }
2794                             } else {
2795 /*
2796     Currently we assume that the trie can handle Unicode and ASCII
2797     matches, including fold-cased matches. If this proves untrue then the
2798     following define will prevent tries in this situation.
2799
2800     #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2801 */
2802 #define TRIE_TYPE_IS_SAFE 1
2803                                 if ( last && TRIE_TYPE_IS_SAFE ) {
2804                                     make_trie( pRExC_state,
2805                                             startbranch, first, cur, tail, count,
2806                                             optype, depth+1 );
2807                                 }
2808                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2809 #ifdef NOJUMPTRIE
2810                                      && noper_next == tail
2811 #endif
2812                                 ){
2813                                     count = 1;
2814                                     first = cur;
2815                                     optype = OP( noper );
2816                                 } else {
2817                                     count = 0;
2818                                     first = NULL;
2819                                     optype = 0;
2820                                 }
2821                                 last = NULL;
2822                             }
2823                         }
2824                         DEBUG_OPTIMISE_r({
2825                             regprop(RExC_rx, mysv, cur);
2826                             PerlIO_printf( Perl_debug_log,
2827                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2828                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2829
2830                         });
2831
2832                         if ( last && TRIE_TYPE_IS_SAFE ) {
2833                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2834 #ifdef TRIE_STUDY_OPT
2835                             if ( ((made == MADE_EXACT_TRIE &&
2836                                  startbranch == first)
2837                                  || ( first_non_open == first )) &&
2838                                  depth==0 ) {
2839                                 flags |= SCF_TRIE_RESTUDY;
2840                                 if ( startbranch == first
2841                                      && scan == tail )
2842                                 {
2843                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2844                                 }
2845                             }
2846 #endif
2847                         }
2848                     }
2849
2850                 } /* do trie */
2851
2852             }
2853             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2854                 scan = NEXTOPER(NEXTOPER(scan));
2855             } else                      /* single branch is optimized. */
2856                 scan = NEXTOPER(scan);
2857             continue;
2858         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2859             scan_frame *newframe = NULL;
2860             I32 paren;
2861             regnode *start;
2862             regnode *end;
2863
2864             if (OP(scan) != SUSPEND) {
2865             /* set the pointer */
2866                 if (OP(scan) == GOSUB) {
2867                     paren = ARG(scan);
2868                     RExC_recurse[ARG2L(scan)] = scan;
2869                     start = RExC_open_parens[paren-1];
2870                     end   = RExC_close_parens[paren-1];
2871                 } else {
2872                     paren = 0;
2873                     start = RExC_rxi->program + 1;
2874                     end   = RExC_opend;
2875                 }
2876                 if (!recursed) {
2877                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2878                     SAVEFREEPV(recursed);
2879                 }
2880                 if (!PAREN_TEST(recursed,paren+1)) {
2881                     PAREN_SET(recursed,paren+1);
2882                     Newx(newframe,1,scan_frame);
2883                 } else {
2884                     if (flags & SCF_DO_SUBSTR) {
2885                         SCAN_COMMIT(pRExC_state,data,minlenp);
2886                         data->longest = &(data->longest_float);
2887                     }
2888                     is_inf = is_inf_internal = 1;
2889                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2890                         cl_anything(pRExC_state, data->start_class);
2891                     flags &= ~SCF_DO_STCLASS;
2892                 }
2893             } else {
2894                 Newx(newframe,1,scan_frame);
2895                 paren = stopparen;
2896                 start = scan+2;
2897                 end = regnext(scan);
2898             }
2899             if (newframe) {
2900                 assert(start);
2901                 assert(end);
2902                 SAVEFREEPV(newframe);
2903                 newframe->next = regnext(scan);
2904                 newframe->last = last;
2905                 newframe->stop = stopparen;
2906                 newframe->prev = frame;
2907
2908                 frame = newframe;
2909                 scan =  start;
2910                 stopparen = paren;
2911                 last = end;
2912
2913                 continue;
2914             }
2915         }
2916         else if (OP(scan) == EXACT) {
2917             I32 l = STR_LEN(scan);
2918             UV uc;
2919             if (UTF) {
2920                 const U8 * const s = (U8*)STRING(scan);
2921                 l = utf8_length(s, s + l);
2922                 uc = utf8_to_uvchr(s, NULL);
2923             } else {
2924                 uc = *((U8*)STRING(scan));
2925             }
2926             min += l;
2927             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2928                 /* The code below prefers earlier match for fixed
2929                    offset, later match for variable offset.  */
2930                 if (data->last_end == -1) { /* Update the start info. */
2931                     data->last_start_min = data->pos_min;
2932                     data->last_start_max = is_inf
2933                         ? I32_MAX : data->pos_min + data->pos_delta;
2934                 }
2935                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2936                 if (UTF)
2937                     SvUTF8_on(data->last_found);
2938                 {
2939                     SV * const sv = data->last_found;
2940                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2941                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
2942                     if (mg && mg->mg_len >= 0)
2943                         mg->mg_len += utf8_length((U8*)STRING(scan),
2944                                                   (U8*)STRING(scan)+STR_LEN(scan));
2945                 }
2946                 data->last_end = data->pos_min + l;
2947                 data->pos_min += l; /* As in the first entry. */
2948                 data->flags &= ~SF_BEFORE_EOL;
2949             }
2950             if (flags & SCF_DO_STCLASS_AND) {
2951                 /* Check whether it is compatible with what we know already! */
2952                 int compat = 1;
2953
2954                 if (uc >= 0x100 ||
2955                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2956                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
2957                     && (!(data->start_class->flags & ANYOF_FOLD)
2958                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2959                     )
2960                     compat = 0;
2961                 ANYOF_CLASS_ZERO(data->start_class);
2962                 ANYOF_BITMAP_ZERO(data->start_class);
2963                 if (compat)
2964                     ANYOF_BITMAP_SET(data->start_class, uc);
2965                 data->start_class->flags &= ~ANYOF_EOS;
2966                 if (uc < 0x100)
2967                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2968             }
2969             else if (flags & SCF_DO_STCLASS_OR) {
2970                 /* false positive possible if the class is case-folded */
2971                 if (uc < 0x100)
2972                     ANYOF_BITMAP_SET(data->start_class, uc);
2973                 else
2974                     data->start_class->flags |= ANYOF_UNICODE_ALL;
2975                 data->start_class->flags &= ~ANYOF_EOS;
2976                 cl_and(data->start_class, and_withp);
2977             }
2978             flags &= ~SCF_DO_STCLASS;
2979         }
2980         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2981             I32 l = STR_LEN(scan);
2982             UV uc = *((U8*)STRING(scan));
2983
2984             /* Search for fixed substrings supports EXACT only. */
2985             if (flags & SCF_DO_SUBSTR) {
2986                 assert(data);
2987                 SCAN_COMMIT(pRExC_state, data, minlenp);
2988             }
2989             if (UTF) {
2990                 const U8 * const s = (U8 *)STRING(scan);
2991                 l = utf8_length(s, s + l);
2992                 uc = utf8_to_uvchr(s, NULL);
2993             }
2994             min += l;
2995             if (flags & SCF_DO_SUBSTR)
2996                 data->pos_min += l;
2997             if (flags & SCF_DO_STCLASS_AND) {
2998                 /* Check whether it is compatible with what we know already! */
2999                 int compat = 1;
3000
3001                 if (uc >= 0x100 ||
3002                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3003                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3004                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3005                     compat = 0;
3006                 ANYOF_CLASS_ZERO(data->start_class);
3007                 ANYOF_BITMAP_ZERO(data->start_class);
3008                 if (compat) {
3009                     ANYOF_BITMAP_SET(data->start_class, uc);
3010                     data->start_class->flags &= ~ANYOF_EOS;
3011                     data->start_class->flags |= ANYOF_FOLD;
3012                     if (OP(scan) == EXACTFL)
3013                         data->start_class->flags |= ANYOF_LOCALE;
3014                 }
3015             }
3016             else if (flags & SCF_DO_STCLASS_OR) {
3017                 if (data->start_class->flags & ANYOF_FOLD) {
3018                     /* false positive possible if the class is case-folded.
3019                        Assume that the locale settings are the same... */
3020                     if (uc < 0x100)
3021                         ANYOF_BITMAP_SET(data->start_class, uc);
3022                     data->start_class->flags &= ~ANYOF_EOS;
3023                 }
3024                 cl_and(data->start_class, and_withp);
3025             }
3026             flags &= ~SCF_DO_STCLASS;
3027         }
3028         else if (strchr((const char*)PL_varies,OP(scan))) {
3029             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3030             I32 f = flags, pos_before = 0;
3031             regnode * const oscan = scan;
3032             struct regnode_charclass_class this_class;
3033             struct regnode_charclass_class *oclass = NULL;
3034             I32 next_is_eval = 0;
3035
3036             switch (PL_regkind[OP(scan)]) {
3037             case WHILEM:                /* End of (?:...)* . */
3038                 scan = NEXTOPER(scan);
3039                 goto finish;
3040             case PLUS:
3041                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3042                     next = NEXTOPER(scan);
3043                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3044                         mincount = 1;
3045                         maxcount = REG_INFTY;
3046                         next = regnext(scan);
3047                         scan = NEXTOPER(scan);
3048                         goto do_curly;
3049                     }
3050                 }
3051                 if (flags & SCF_DO_SUBSTR)
3052                     data->pos_min++;
3053                 min++;
3054                 /* Fall through. */
3055             case STAR:
3056                 if (flags & SCF_DO_STCLASS) {
3057                     mincount = 0;
3058                     maxcount = REG_INFTY;
3059                     next = regnext(scan);
3060                     scan = NEXTOPER(scan);
3061                     goto do_curly;
3062                 }
3063                 is_inf = is_inf_internal = 1;
3064                 scan = regnext(scan);
3065                 if (flags & SCF_DO_SUBSTR) {
3066                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3067                     data->longest = &(data->longest_float);
3068                 }
3069                 goto optimize_curly_tail;
3070             case CURLY:
3071                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3072                     && (scan->flags == stopparen))
3073                 {
3074                     mincount = 1;
3075                     maxcount = 1;
3076                 } else {
3077                     mincount = ARG1(scan);
3078                     maxcount = ARG2(scan);
3079                 }
3080                 next = regnext(scan);
3081                 if (OP(scan) == CURLYX) {
3082                     I32 lp = (data ? *(data->last_closep) : 0);
3083                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3084                 }
3085                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3086                 next_is_eval = (OP(scan) == EVAL);
3087               do_curly:
3088                 if (flags & SCF_DO_SUBSTR) {
3089                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3090                     pos_before = data->pos_min;
3091                 }
3092                 if (data) {
3093                     fl = data->flags;
3094                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3095                     if (is_inf)
3096                         data->flags |= SF_IS_INF;
3097                 }
3098                 if (flags & SCF_DO_STCLASS) {
3099                     cl_init(pRExC_state, &this_class);
3100                     oclass = data->start_class;
3101                     data->start_class = &this_class;
3102                     f |= SCF_DO_STCLASS_AND;
3103                     f &= ~SCF_DO_STCLASS_OR;
3104                 }
3105                 /* These are the cases when once a subexpression
3106                    fails at a particular position, it cannot succeed
3107                    even after backtracking at the enclosing scope.
3108
3109                    XXXX what if minimal match and we are at the
3110                         initial run of {n,m}? */
3111                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3112                     f &= ~SCF_WHILEM_VISITED_POS;
3113
3114                 /* This will finish on WHILEM, setting scan, or on NULL: */
3115                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3116                                       last, data, stopparen, recursed, NULL,
3117                                       (mincount == 0
3118                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3119
3120                 if (flags & SCF_DO_STCLASS)
3121                     data->start_class = oclass;
3122                 if (mincount == 0 || minnext == 0) {
3123                     if (flags & SCF_DO_STCLASS_OR) {
3124                         cl_or(pRExC_state, data->start_class, &this_class);
3125                     }
3126                     else if (flags & SCF_DO_STCLASS_AND) {
3127                         /* Switch to OR mode: cache the old value of
3128                          * data->start_class */
3129                         INIT_AND_WITHP;
3130                         StructCopy(data->start_class, and_withp,
3131                                    struct regnode_charclass_class);
3132                         flags &= ~SCF_DO_STCLASS_AND;
3133                         StructCopy(&this_class, data->start_class,
3134                                    struct regnode_charclass_class);
3135                         flags |= SCF_DO_STCLASS_OR;
3136                         data->start_class->flags |= ANYOF_EOS;
3137                     }
3138                 } else {                /* Non-zero len */
3139                     if (flags & SCF_DO_STCLASS_OR) {
3140                         cl_or(pRExC_state, data->start_class, &this_class);
3141                         cl_and(data->start_class, and_withp);
3142                     }
3143                     else if (flags & SCF_DO_STCLASS_AND)
3144                         cl_and(data->start_class, &this_class);
3145                     flags &= ~SCF_DO_STCLASS;
3146                 }
3147                 if (!scan)              /* It was not CURLYX, but CURLY. */
3148                     scan = next;
3149                 if ( /* ? quantifier ok, except for (?{ ... }) */
3150                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3151                     && (minnext == 0) && (deltanext == 0)
3152                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3153                     && maxcount <= REG_INFTY/3 /* Complement check for big count */
3154                     && ckWARN(WARN_REGEXP))
3155                 {
3156                     vWARN(RExC_parse,
3157                           "Quantifier unexpected on zero-length expression");
3158                 }
3159
3160                 min += minnext * mincount;
3161                 is_inf_internal |= ((maxcount == REG_INFTY
3162                                      && (minnext + deltanext) > 0)
3163                                     || deltanext == I32_MAX);
3164                 is_inf |= is_inf_internal;
3165                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3166
3167                 /* Try powerful optimization CURLYX => CURLYN. */
3168                 if (  OP(oscan) == CURLYX && data
3169                       && data->flags & SF_IN_PAR
3170                       && !(data->flags & SF_HAS_EVAL)
3171                       && !deltanext && minnext == 1 ) {
3172                     /* Try to optimize to CURLYN.  */
3173                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3174                     regnode * const nxt1 = nxt;
3175 #ifdef DEBUGGING
3176                     regnode *nxt2;
3177 #endif
3178
3179                     /* Skip open. */
3180                     nxt = regnext(nxt);
3181                     if (!strchr((const char*)PL_simple,OP(nxt))
3182                         && !(PL_regkind[OP(nxt)] == EXACT
3183                              && STR_LEN(nxt) == 1))
3184                         goto nogo;
3185 #ifdef DEBUGGING
3186                     nxt2 = nxt;
3187 #endif
3188                     nxt = regnext(nxt);
3189                     if (OP(nxt) != CLOSE)
3190                         goto nogo;
3191                     if (RExC_open_parens) {
3192                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3193                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3194                     }
3195                     /* Now we know that nxt2 is the only contents: */
3196                     oscan->flags = (U8)ARG(nxt);
3197                     OP(oscan) = CURLYN;
3198                     OP(nxt1) = NOTHING; /* was OPEN. */
3199
3200 #ifdef DEBUGGING
3201                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3202                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3203                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3204                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3205                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3206                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3207 #endif
3208                 }
3209               nogo:
3210
3211                 /* Try optimization CURLYX => CURLYM. */
3212                 if (  OP(oscan) == CURLYX && data
3213                       && !(data->flags & SF_HAS_PAR)
3214                       && !(data->flags & SF_HAS_EVAL)
3215                       && !deltanext     /* atom is fixed width */
3216                       && minnext != 0   /* CURLYM can't handle zero width */
3217                 ) {
3218                     /* XXXX How to optimize if data == 0? */
3219                     /* Optimize to a simpler form.  */
3220                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3221                     regnode *nxt2;
3222
3223                     OP(oscan) = CURLYM;
3224                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3225                             && (OP(nxt2) != WHILEM))
3226                         nxt = nxt2;
3227                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3228                     /* Need to optimize away parenths. */
3229                     if (data->flags & SF_IN_PAR) {
3230                         /* Set the parenth number.  */
3231                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3232
3233                         if (OP(nxt) != CLOSE)
3234                             FAIL("Panic opt close");
3235                         oscan->flags = (U8)ARG(nxt);
3236                         if (RExC_open_parens) {
3237                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3238                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3239                         }
3240                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3241                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3242
3243 #ifdef DEBUGGING
3244                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3245                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3246                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3247                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3248 #endif
3249 #if 0
3250                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3251                             regnode *nnxt = regnext(nxt1);
3252
3253                             if (nnxt == nxt) {
3254                                 if (reg_off_by_arg[OP(nxt1)])
3255                                     ARG_SET(nxt1, nxt2 - nxt1);
3256                                 else if (nxt2 - nxt1 < U16_MAX)
3257                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3258                                 else
3259                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3260                             }
3261                             nxt1 = nnxt;
3262                         }
3263 #endif
3264                         /* Optimize again: */
3265                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3266                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3267                     }
3268                     else
3269                         oscan->flags = 0;
3270                 }
3271                 else if ((OP(oscan) == CURLYX)
3272                          && (flags & SCF_WHILEM_VISITED_POS)
3273                          /* See the comment on a similar expression above.
3274                             However, this time it is not a subexpression
3275                             we care about, but the expression itself. */
3276                          && (maxcount == REG_INFTY)
3277                          && data && ++data->whilem_c < 16) {
3278                     /* This stays as CURLYX, we can put the count/of pair. */
3279                     /* Find WHILEM (as in regexec.c) */
3280                     regnode *nxt = oscan + NEXT_OFF(oscan);
3281
3282                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3283                         nxt += ARG(nxt);
3284                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3285                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3286                 }
3287                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3288                     pars++;
3289                 if (flags & SCF_DO_SUBSTR) {
3290                     SV *last_str = NULL;
3291                     int counted = mincount != 0;
3292
3293                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3294 #if defined(SPARC64_GCC_WORKAROUND)
3295                         I32 b = 0;
3296                         STRLEN l = 0;
3297                         const char *s = NULL;
3298                         I32 old = 0;
3299
3300                         if (pos_before >= data->last_start_min)
3301                             b = pos_before;
3302                         else
3303                             b = data->last_start_min;
3304
3305                         l = 0;
3306                         s = SvPV_const(data->last_found, l);
3307                         old = b - data->last_start_min;
3308
3309 #else
3310                         I32 b = pos_before >= data->last_start_min
3311                             ? pos_before : data->last_start_min;
3312                         STRLEN l;
3313                         const char * const s = SvPV_const(data->last_found, l);
3314                         I32 old = b - data->last_start_min;
3315 #endif
3316
3317                         if (UTF)
3318                             old = utf8_hop((U8*)s, old) - (U8*)s;
3319
3320                         l -= old;
3321                         /* Get the added string: */
3322                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3323                         if (deltanext == 0 && pos_before == b) {
3324                             /* What was added is a constant string */
3325                             if (mincount > 1) {
3326                                 SvGROW(last_str, (mincount * l) + 1);
3327                                 repeatcpy(SvPVX(last_str) + l,
3328                                           SvPVX_const(last_str), l, mincount - 1);
3329                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3330                                 /* Add additional parts. */
3331                                 SvCUR_set(data->last_found,
3332                                           SvCUR(data->last_found) - l);
3333                                 sv_catsv(data->last_found, last_str);
3334                                 {
3335                                     SV * sv = data->last_found;
3336                                     MAGIC *mg =
3337                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3338                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3339                                     if (mg && mg->mg_len >= 0)
3340                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3341                                 }
3342                                 data->last_end += l * (mincount - 1);
3343                             }
3344                         } else {
3345                             /* start offset must point into the last copy */
3346                             data->last_start_min += minnext * (mincount - 1);
3347                             data->last_start_max += is_inf ? I32_MAX
3348                                 : (maxcount - 1) * (minnext + data->pos_delta);
3349                         }
3350                     }
3351                     /* It is counted once already... */
3352                     data->pos_min += minnext * (mincount - counted);
3353                     data->pos_delta += - counted * deltanext +
3354                         (minnext + deltanext) * maxcount - minnext * mincount;
3355                     if (mincount != maxcount) {
3356                          /* Cannot extend fixed substrings found inside
3357                             the group.  */
3358                         SCAN_COMMIT(pRExC_state,data,minlenp);
3359                         if (mincount && last_str) {
3360                             SV * const sv = data->last_found;
3361                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3362                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3363
3364                             if (mg)
3365                                 mg->mg_len = -1;
3366                             sv_setsv(sv, last_str);
3367                             data->last_end = data->pos_min;
3368                             data->last_start_min =
3369                                 data->pos_min - CHR_SVLEN(last_str);
3370                             data->last_start_max = is_inf
3371                                 ? I32_MAX
3372                                 : data->pos_min + data->pos_delta
3373                                 - CHR_SVLEN(last_str);
3374                         }
3375                         data->longest = &(data->longest_float);
3376                     }
3377                     SvREFCNT_dec(last_str);
3378                 }
3379                 if (data && (fl & SF_HAS_EVAL))
3380                     data->flags |= SF_HAS_EVAL;
3381               optimize_curly_tail:
3382                 if (OP(oscan) != CURLYX) {
3383                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3384                            && NEXT_OFF(next))
3385                         NEXT_OFF(oscan) += NEXT_OFF(next);
3386                 }
3387                 continue;
3388             default:                    /* REF and CLUMP only? */
3389                 if (flags & SCF_DO_SUBSTR) {
3390                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3391                     data->longest = &(data->longest_float);
3392                 }
3393                 is_inf = is_inf_internal = 1;
3394                 if (flags & SCF_DO_STCLASS_OR)
3395                     cl_anything(pRExC_state, data->start_class);
3396                 flags &= ~SCF_DO_STCLASS;
3397                 break;
3398             }
3399         }
3400         else if (OP(scan) == LNBREAK) {
3401             if (flags & SCF_DO_STCLASS) {
3402                 int value = 0;
3403                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3404                 if (flags & SCF_DO_STCLASS_AND) {
3405                     for (value = 0; value < 256; value++)
3406                         if (!is_VERTWS_cp(value))
3407                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3408                 }
3409                 else {
3410                     for (value = 0; value < 256; value++)
3411                         if (is_VERTWS_cp(value))
3412                             ANYOF_BITMAP_SET(data->start_class, value);
3413                 }
3414                 if (flags & SCF_DO_STCLASS_OR)
3415                     cl_and(data->start_class, and_withp);
3416                 flags &= ~SCF_DO_STCLASS;
3417             }
3418             min += 1;
3419             delta += 1;
3420             if (flags & SCF_DO_SUBSTR) {
3421                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3422                 data->pos_min += 1;
3423                 data->pos_delta += 1;
3424                 data->longest = &(data->longest_float);
3425             }
3426
3427         }
3428         else if (OP(scan) == FOLDCHAR) {
3429             int d = ARG(scan)==0xDF ? 1 : 2;
3430             flags &= ~SCF_DO_STCLASS;
3431             min += 1;
3432             delta += d;
3433             if (flags & SCF_DO_SUBSTR) {
3434                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3435                 data->pos_min += 1;
3436                 data->pos_delta += d;
3437                 data->longest = &(data->longest_float);
3438             }
3439         }
3440         else if (strchr((const char*)PL_simple,OP(scan))) {
3441             int value = 0;
3442
3443             if (flags & SCF_DO_SUBSTR) {
3444                 SCAN_COMMIT(pRExC_state,data,minlenp);
3445                 data->pos_min++;
3446             }
3447             min++;
3448             if (flags & SCF_DO_STCLASS) {
3449                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3450
3451                 /* Some of the logic below assumes that switching
3452                    locale on will only add false positives. */
3453                 switch (PL_regkind[OP(scan)]) {
3454                 case SANY:
3455                 default:
3456                   do_default:
3457                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3458                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3459                         cl_anything(pRExC_state, data->start_class);
3460                     break;
3461                 case REG_ANY:
3462                     if (OP(scan) == SANY)
3463                         goto do_default;
3464                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3465                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3466                                  || (data->start_class->flags & ANYOF_CLASS));
3467                         cl_anything(pRExC_state, data->start_class);
3468                     }
3469                     if (flags & SCF_DO_STCLASS_AND || !value)
3470                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3471                     break;
3472                 case ANYOF:
3473                     if (flags & SCF_DO_STCLASS_AND)
3474                         cl_and(data->start_class,
3475                                (struct regnode_charclass_class*)scan);
3476                     else
3477                         cl_or(pRExC_state, data->start_class,
3478                               (struct regnode_charclass_class*)scan);
3479                     break;
3480                 case ALNUM:
3481                     if (flags & SCF_DO_STCLASS_AND) {
3482                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3483                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3484                             for (value = 0; value < 256; value++)
3485                                 if (!isALNUM(value))
3486                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3487                         }
3488                     }
3489                     else {
3490                         if (data->start_class->flags & ANYOF_LOCALE)
3491                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3492                         else {
3493                             for (value = 0; value < 256; value++)
3494                                 if (isALNUM(value))
3495                                     ANYOF_BITMAP_SET(data->start_class, value);
3496                         }
3497                     }
3498                     break;
3499                 case ALNUML:
3500                     if (flags & SCF_DO_STCLASS_AND) {
3501                         if (data->start_class->flags & ANYOF_LOCALE)
3502                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3503                     }
3504                     else {
3505                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3506                         data->start_class->flags |= ANYOF_LOCALE;
3507                     }
3508                     break;
3509                 case NALNUM:
3510                     if (flags & SCF_DO_STCLASS_AND) {
3511                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3512                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3513                             for (value = 0; value < 256; value++)
3514                                 if (isALNUM(value))
3515                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3516                         }
3517                     }
3518                     else {
3519                         if (data->start_class->flags & ANYOF_LOCALE)
3520                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3521                         else {
3522                             for (value = 0; value < 256; value++)
3523                                 if (!isALNUM(value))
3524                                     ANYOF_BITMAP_SET(data->start_class, value);
3525                         }
3526                     }
3527                     break;
3528                 case NALNUML:
3529                     if (flags & SCF_DO_STCLASS_AND) {
3530                         if (data->start_class->flags & ANYOF_LOCALE)
3531                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3532                     }
3533                     else {
3534                         data->start_class->flags |= ANYOF_LOCALE;
3535                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3536                     }
3537                     break;
3538                 case SPACE:
3539                     if (flags & SCF_DO_STCLASS_AND) {
3540                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3541                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3542                             for (value = 0; value < 256; value++)
3543                                 if (!isSPACE(value))
3544                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3545                         }
3546                     }
3547                     else {
3548                         if (data->start_class->flags & ANYOF_LOCALE)
3549                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3550                         else {
3551                             for (value = 0; value < 256; value++)
3552                                 if (isSPACE(value))
3553                                     ANYOF_BITMAP_SET(data->start_class, value);
3554                         }
3555                     }
3556                     break;
3557                 case SPACEL:
3558                     if (flags & SCF_DO_STCLASS_AND) {
3559                         if (data->start_class->flags & ANYOF_LOCALE)
3560                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3561                     }
3562                     else {
3563                         data->start_class->flags |= ANYOF_LOCALE;
3564                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3565                     }
3566                     break;
3567                 case NSPACE:
3568                     if (flags & SCF_DO_STCLASS_AND) {
3569                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3570                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3571                             for (value = 0; value < 256; value++)
3572                                 if (isSPACE(value))
3573                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3574                         }
3575                     }
3576                     else {
3577                         if (data->start_class->flags & ANYOF_LOCALE)
3578                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3579                         else {
3580                             for (value = 0; value < 256; value++)
3581                                 if (!isSPACE(value))
3582                                     ANYOF_BITMAP_SET(data->start_class, value);
3583                         }
3584                     }
3585                     break;
3586                 case NSPACEL:
3587                     if (flags & SCF_DO_STCLASS_AND) {
3588                         if (data->start_class->flags & ANYOF_LOCALE) {
3589                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3590                             for (value = 0; value < 256; value++)
3591                                 if (!isSPACE(value))
3592                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3593                         }
3594                     }
3595                     else {
3596                         data->start_class->flags |= ANYOF_LOCALE;
3597                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3598                     }
3599                     break;
3600                 case DIGIT:
3601                     if (flags & SCF_DO_STCLASS_AND) {
3602                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3603                         for (value = 0; value < 256; value++)
3604                             if (!isDIGIT(value))
3605                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3606                     }
3607                     else {
3608                         if (data->start_class->flags & ANYOF_LOCALE)
3609                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3610                         else {
3611                             for (value = 0; value < 256; value++)
3612                                 if (isDIGIT(value))
3613                                     ANYOF_BITMAP_SET(data->start_class, value);
3614                         }
3615                     }
3616                     break;
3617                 case NDIGIT:
3618                     if (flags & SCF_DO_STCLASS_AND) {
3619                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3620                         for (value = 0; value < 256; value++)
3621                             if (isDIGIT(value))
3622                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3623                     }
3624                     else {
3625                         if (data->start_class->flags & ANYOF_LOCALE)
3626                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3627                         else {
3628                             for (value = 0; value < 256; value++)
3629                                 if (!isDIGIT(value))
3630                                     ANYOF_BITMAP_SET(data->start_class, value);
3631                         }
3632                     }
3633                     break;
3634                 CASE_SYNST_FNC(VERTWS);
3635                 CASE_SYNST_FNC(HORIZWS);
3636
3637                 }
3638                 if (flags & SCF_DO_STCLASS_OR)
3639                     cl_and(data->start_class, and_withp);
3640                 flags &= ~SCF_DO_STCLASS;
3641             }
3642         }
3643         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3644             data->flags |= (OP(scan) == MEOL
3645                             ? SF_BEFORE_MEOL
3646                             : SF_BEFORE_SEOL);
3647         }
3648         else if (  PL_regkind[OP(scan)] == BRANCHJ
3649                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3650                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3651                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3652             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3653                 || OP(scan) == UNLESSM )
3654             {
3655                 /* Negative Lookahead/lookbehind
3656                    In this case we can't do fixed string optimisation.
3657                 */
3658
3659                 I32 deltanext, minnext, fake = 0;
3660                 regnode *nscan;
3661                 struct regnode_charclass_class intrnl;
3662                 int f = 0;
3663
3664                 data_fake.flags = 0;
3665                 if (data) {
3666                     data_fake.whilem_c = data->whilem_c;
3667                     data_fake.last_closep = data->last_closep;
3668                 }
3669                 else
3670                     data_fake.last_closep = &fake;
3671                 data_fake.pos_delta = delta;
3672                 if ( flags & SCF_DO_STCLASS && !scan->flags
3673                      && OP(scan) == IFMATCH ) { /* Lookahead */
3674                     cl_init(pRExC_state, &intrnl);
3675                     data_fake.start_class = &intrnl;
3676                     f |= SCF_DO_STCLASS_AND;
3677                 }
3678                 if (flags & SCF_WHILEM_VISITED_POS)
3679                     f |= SCF_WHILEM_VISITED_POS;
3680                 next = regnext(scan);
3681                 nscan = NEXTOPER(NEXTOPER(scan));
3682                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3683                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3684                 if (scan->flags) {
3685                     if (deltanext) {
3686                         FAIL("Variable length lookbehind not implemented");
3687                     }
3688                     else if (minnext > (I32)U8_MAX) {
3689                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3690                     }
3691                     scan->flags = (U8)minnext;
3692                 }
3693                 if (data) {
3694                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3695                         pars++;
3696                     if (data_fake.flags & SF_HAS_EVAL)
3697                         data->flags |= SF_HAS_EVAL;
3698                     data->whilem_c = data_fake.whilem_c;
3699                 }
3700                 if (f & SCF_DO_STCLASS_AND) {
3701                     const int was = (data->start_class->flags & ANYOF_EOS);
3702
3703                     cl_and(data->start_class, &intrnl);
3704                     if (was)
3705                         data->start_class->flags |= ANYOF_EOS;
3706                 }
3707             }
3708 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3709             else {
3710                 /* Positive Lookahead/lookbehind
3711                    In this case we can do fixed string optimisation,
3712                    but we must be careful about it. Note in the case of
3713                    lookbehind the positions will be offset by the minimum
3714                    length of the pattern, something we won't know about
3715                    until after the recurse.
3716                 */
3717                 I32 deltanext, fake = 0;
3718                 regnode *nscan;
3719                 struct regnode_charclass_class intrnl;
3720                 int f = 0;
3721                 /* We use SAVEFREEPV so that when the full compile
3722                     is finished perl will clean up the allocated
3723                     minlens when it's all done. This way we don't
3724                     have to worry about freeing them when we know
3725                     they won't be used, which would be a pain.
3726                  */
3727                 I32 *minnextp;
3728                 Newx( minnextp, 1, I32 );
3729                 SAVEFREEPV(minnextp);
3730
3731                 if (data) {
3732                     StructCopy(data, &data_fake, scan_data_t);
3733                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3734                         f |= SCF_DO_SUBSTR;
3735                         if (scan->flags)
3736                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3737                         data_fake.last_found=newSVsv(data->last_found);
3738                     }
3739                 }
3740                 else
3741                     data_fake.last_closep = &fake;
3742                 data_fake.flags = 0;
3743                 data_fake.pos_delta = delta;
3744                 if (is_inf)
3745                     data_fake.flags |= SF_IS_INF;
3746                 if ( flags & SCF_DO_STCLASS && !scan->flags
3747                      && OP(scan) == IFMATCH ) { /* Lookahead */
3748                     cl_init(pRExC_state, &intrnl);
3749                     data_fake.start_class = &intrnl;
3750                     f |= SCF_DO_STCLASS_AND;
3751                 }
3752                 if (flags & SCF_WHILEM_VISITED_POS)
3753                     f |= SCF_WHILEM_VISITED_POS;
3754                 next = regnext(scan);
3755                 nscan = NEXTOPER(NEXTOPER(scan));
3756
3757                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3758                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3759                 if (scan->flags) {
3760                     if (deltanext) {
3761                         FAIL("Variable length lookbehind not implemented");
3762                     }
3763                     else if (*minnextp > (I32)U8_MAX) {
3764                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3765                     }
3766                     scan->flags = (U8)*minnextp;
3767                 }
3768
3769                 *minnextp += min;
3770
3771                 if (f & SCF_DO_STCLASS_AND) {
3772                     const int was = (data->start_class->flags & ANYOF_EOS);
3773
3774                     cl_and(data->start_class, &intrnl);
3775                     if (was)
3776                         data->start_class->flags |= ANYOF_EOS;
3777                 }
3778                 if (data) {
3779                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3780                         pars++;
3781                     if (data_fake.flags & SF_HAS_EVAL)
3782                         data->flags |= SF_HAS_EVAL;
3783                     data->whilem_c = data_fake.whilem_c;
3784                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3785                         if (RExC_rx->minlen<*minnextp)
3786                             RExC_rx->minlen=*minnextp;
3787                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3788                         SvREFCNT_dec(data_fake.last_found);
3789
3790                         if ( data_fake.minlen_fixed != minlenp )
3791                         {
3792                             data->offset_fixed= data_fake.offset_fixed;
3793                             data->minlen_fixed= data_fake.minlen_fixed;
3794                             data->lookbehind_fixed+= scan->flags;
3795                         }
3796                         if ( data_fake.minlen_float != minlenp )
3797                         {
3798                             data->minlen_float= data_fake.minlen_float;
3799                             data->offset_float_min=data_fake.offset_float_min;
3800                             data->offset_float_max=data_fake.offset_float_max;
3801                             data->lookbehind_float+= scan->flags;
3802                         }
3803                     }
3804                 }
3805
3806
3807             }
3808 #endif
3809         }
3810         else if (OP(scan) == OPEN) {
3811             if (stopparen != (I32)ARG(scan))
3812                 pars++;
3813         }
3814         else if (OP(scan) == CLOSE) {
3815             if (stopparen == (I32)ARG(scan)) {
3816                 break;
3817             }
3818             if ((I32)ARG(scan) == is_par) {
3819                 next = regnext(scan);
3820
3821                 if ( next && (OP(next) != WHILEM) && next < last)
3822                     is_par = 0;         /* Disable optimization */
3823             }
3824             if (data)
3825                 *(data->last_closep) = ARG(scan);
3826         }
3827         else if (OP(scan) == EVAL) {
3828                 if (data)
3829                     data->flags |= SF_HAS_EVAL;
3830         }
3831         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3832             if (flags & SCF_DO_SUBSTR) {
3833                 SCAN_COMMIT(pRExC_state,data,minlenp);
3834                 flags &= ~SCF_DO_SUBSTR;
3835             }
3836             if (data && OP(scan)==ACCEPT) {
3837                 data->flags |= SCF_SEEN_ACCEPT;
3838                 if (stopmin > min)
3839                     stopmin = min;
3840             }
3841         }
3842         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3843         {
3844                 if (flags & SCF_DO_SUBSTR) {
3845                     SCAN_COMMIT(pRExC_state,data,minlenp);
3846                     data->longest = &(data->longest_float);
3847                 }
3848                 is_inf = is_inf_internal = 1;
3849                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3850                     cl_anything(pRExC_state, data->start_class);
3851                 flags &= ~SCF_DO_STCLASS;
3852         }
3853         else if (OP(scan) == GPOS) {
3854             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3855                 !(delta || is_inf || (data && data->pos_delta)))
3856             {
3857                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3858                     RExC_rx->extflags |= RXf_ANCH_GPOS;
3859                 if (RExC_rx->gofs < (U32)min)
3860                     RExC_rx->gofs = min;
3861             } else {
3862                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3863                 RExC_rx->gofs = 0;
3864             }
3865         }
3866 #ifdef TRIE_STUDY_OPT
3867 #ifdef FULL_TRIE_STUDY
3868         else if (PL_regkind[OP(scan)] == TRIE) {
3869             /* NOTE - There is similar code to this block above for handling
3870                BRANCH nodes on the initial study.  If you change stuff here
3871                check there too. */
3872             regnode *trie_node= scan;
3873             regnode *tail= regnext(scan);
3874             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3875             I32 max1 = 0, min1 = I32_MAX;
3876             struct regnode_charclass_class accum;
3877
3878             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3879                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3880             if (flags & SCF_DO_STCLASS)
3881                 cl_init_zero(pRExC_state, &accum);
3882
3883             if (!trie->jump) {
3884                 min1= trie->minlen;
3885                 max1= trie->maxlen;
3886             } else {
3887                 const regnode *nextbranch= NULL;
3888                 U32 word;
3889
3890                 for ( word=1 ; word <= trie->wordcount ; word++)
3891                 {
3892                     I32 deltanext=0, minnext=0, f = 0, fake;
3893                     struct regnode_charclass_class this_class;
3894
3895                     data_fake.flags = 0;
3896                     if (data) {
3897                         data_fake.whilem_c = data->whilem_c;
3898                         data_fake.last_closep = data->last_closep;
3899                     }
3900                     else
3901                         data_fake.last_closep = &fake;
3902                     data_fake.pos_delta = delta;
3903                     if (flags & SCF_DO_STCLASS) {
3904                         cl_init(pRExC_state, &this_class);
3905                         data_fake.start_class = &this_class;
3906                         f = SCF_DO_STCLASS_AND;
3907                     }
3908                     if (flags & SCF_WHILEM_VISITED_POS)
3909                         f |= SCF_WHILEM_VISITED_POS;
3910
3911                     if (trie->jump[word]) {
3912                         if (!nextbranch)
3913                             nextbranch = trie_node + trie->jump[0];
3914                         scan= trie_node + trie->jump[word];
3915                         /* We go from the jump point to the branch that follows
3916                            it. Note this means we need the vestigial unused branches
3917                            even though they aren't otherwise used.
3918                          */
3919                         minnext = study_chunk(pRExC_state, &scan, minlenp,
3920                             &deltanext, (regnode *)nextbranch, &data_fake,
3921                             stopparen, recursed, NULL, f,depth+1);
3922                     }
3923                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3924                         nextbranch= regnext((regnode*)nextbranch);
3925
3926                     if (min1 > (I32)(minnext + trie->minlen))
3927                         min1 = minnext + trie->minlen;
3928                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3929                         max1 = minnext + deltanext + trie->maxlen;
3930                     if (deltanext == I32_MAX)
3931                         is_inf = is_inf_internal = 1;
3932
3933                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3934                         pars++;
3935                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3936                         if ( stopmin > min + min1)
3937                             stopmin = min + min1;
3938                         flags &= ~SCF_DO_SUBSTR;
3939                         if (data)
3940                             data->flags |= SCF_SEEN_ACCEPT;
3941                     }
3942                     if (data) {
3943                         if (data_fake.flags & SF_HAS_EVAL)
3944                             data->flags |= SF_HAS_EVAL;
3945                         data->whilem_c = data_fake.whilem_c;
3946                     }
3947                     if (flags & SCF_DO_STCLASS)
3948                         cl_or(pRExC_state, &accum, &this_class);
3949                 }
3950             }
3951             if (flags & SCF_DO_SUBSTR) {
3952                 data->pos_min += min1;
3953                 data->pos_delta += max1 - min1;
3954                 if (max1 != min1 || is_inf)
3955                     data->longest = &(data->longest_float);
3956             }
3957             min += min1;
3958             delta += max1 - min1;
3959             if (flags & SCF_DO_STCLASS_OR) {
3960                 cl_or(pRExC_state, data->start_class, &accum);
3961                 if (min1) {
3962                     cl_and(data->start_class, and_withp);
3963                     flags &= ~SCF_DO_STCLASS;
3964                 }
3965             }
3966             else if (flags & SCF_DO_STCLASS_AND) {
3967                 if (min1) {
3968                     cl_and(data->start_class, &accum);
3969                     flags &= ~SCF_DO_STCLASS;
3970                 }
3971                 else {
3972                     /* Switch to OR mode: cache the old value of
3973                      * data->start_class */
3974                     INIT_AND_WITHP;
3975                     StructCopy(data->start_class, and_withp,
3976                                struct regnode_charclass_class);
3977                     flags &= ~SCF_DO_STCLASS_AND;
3978                     StructCopy(&accum, data->start_class,
3979                                struct regnode_charclass_class);
3980                     flags |= SCF_DO_STCLASS_OR;
3981                     data->start_class->flags |= ANYOF_EOS;
3982                 }
3983             }
3984             scan= tail;
3985             continue;
3986         }
3987 #else
3988         else if (PL_regkind[OP(scan)] == TRIE) {
3989             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3990             U8*bang=NULL;
3991
3992             min += trie->minlen;
3993             delta += (trie->maxlen - trie->minlen);
3994             flags &= ~SCF_DO_STCLASS; /* xxx */
3995             if (flags & SCF_DO_SUBSTR) {
3996                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3997                 data->pos_min += trie->minlen;
3998                 data->pos_delta += (trie->maxlen - trie->minlen);
3999                 if (trie->maxlen != trie->minlen)
4000                     data->longest = &(data->longest_float);
4001             }
4002             if (trie->jump) /* no more substrings -- for now /grr*/
4003                 flags &= ~SCF_DO_SUBSTR;
4004         }
4005 #endif /* old or new */
4006 #endif /* TRIE_STUDY_OPT */
4007
4008         /* Else: zero-length, ignore. */
4009         scan = regnext(scan);
4010     }
4011     if (frame) {
4012         last = frame->last;
4013         scan = frame->next;
4014         stopparen = frame->stop;
4015         frame = frame->prev;
4016         goto fake_study_recurse;
4017     }
4018
4019   finish:
4020     assert(!frame);
4021     DEBUG_STUDYDATA("pre-fin:",data,depth);
4022
4023     *scanp = scan;
4024     *deltap = is_inf_internal ? I32_MAX : delta;
4025     if (flags & SCF_DO_SUBSTR && is_inf)
4026         data->pos_delta = I32_MAX - data->pos_min;
4027     if (is_par > (I32)U8_MAX)
4028         is_par = 0;
4029     if (is_par && pars==1 && data) {
4030         data->flags |= SF_IN_PAR;
4031         data->flags &= ~SF_HAS_PAR;
4032     }
4033     else if (pars && data) {
4034         data->flags |= SF_HAS_PAR;
4035         data->flags &= ~SF_IN_PAR;
4036     }
4037     if (flags & SCF_DO_STCLASS_OR)
4038         cl_and(data->start_class, and_withp);
4039     if (flags & SCF_TRIE_RESTUDY)
4040         data->flags |=  SCF_TRIE_RESTUDY;
4041
4042     DEBUG_STUDYDATA("post-fin:",data,depth);
4043
4044     return min < stopmin ? min : stopmin;
4045 }
4046
4047 STATIC U32
4048 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4049 {
4050     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4051
4052     Renewc(RExC_rxi->data,
4053            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4054            char, struct reg_data);
4055     if(count)
4056         Renew(RExC_rxi->data->what, count + n, U8);
4057     else
4058         Newx(RExC_rxi->data->what, n, U8);
4059     RExC_rxi->data->count = count + n;
4060     Copy(s, RExC_rxi->data->what + count, n, U8);
4061     return count;
4062 }
4063
4064 /* XXX: TODO - exclude this function from non-debugging builds of perl. */
4065 #ifndef PERL_IN_XSUB_RE
4066 void
4067 Perl_reginitcolors(pTHX)
4068 {
4069     dVAR;
4070     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4071     if (s) {
4072         char *t = savepv(s);
4073         int i = 0;
4074         PL_colors[0] = t;
4075         while (++i < 6) {
4076             t = strchr(t, '\t');
4077             if (t) {
4078                 *t = '\0';
4079                 PL_colors[i] = ++t;
4080             }
4081             else
4082                 PL_colors[i] = t = (char *)"";
4083         }
4084     } else {
4085         int i = 0;
4086         while (i < 6)
4087             PL_colors[i++] = (char *)"";
4088     }
4089     PL_colorset = 1;
4090 }
4091 #endif
4092
4093
#ifdef TRIE_STUDY_OPT
/* Jump to the reStudy label if the study data has SCF_TRIE_RESTUDY set and
 * no restudy has been taken yet; 'restudied' is incremented whenever the flag
 * is seen, so the jump happens at most once.  Expects 'data', 'restudied' and
 * a 'reStudy' label to be in scope at the point of use.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as one
 * statement and its 'if' cannot capture an 'else' that follows the use. */
#define CHECK_RESTUDY_GOTO                                  \
    do {                                                    \
        if (                                                \
              (data.flags & SCF_TRIE_RESTUDY)               \
              && ! restudied++                              \
        )     goto reStudy;                                 \
    } while (0)
#else
/* Without TRIE_STUDY_OPT there is never anything to restudy. */
#define CHECK_RESTUDY_GOTO
#endif
4103
4104 /*
4105  - pregcomp - compile a regular expression into internal code
4106  *
4107  * We can't allocate space until we know how big the compiled form will be,
4108  * but we can't compile it (and thus know how big it is) until we've got a
4109  * place to put the code.  So we cheat:  we compile it twice, once with code
4110  * generation turned off and size counting turned on, and once "for real".
4111  * This also means that we don't allocate space until we are sure that the
4112  * thing really will compile successfully, and we never have to move the
4113  * code and thus invalidate pointers into it.  (Note that it has to be in
4114  * one piece because free() must be able to free it all.) [NB: not true in perl]
4115  *
4116  * Beware that the optimization-preparation code in here knows about some
4117  * of the structure of the compiled regexp.  [I'll say.]
4118  */
4119
4120
4121
/* RE_ENGINE_PTR: address of the regexp engine structure to record in a
 * compiled regexp -- PL_core_reg_engine in the normal build, my_reg_engine
 * when PERL_IN_XSUB_RE is defined.  The expansion is parenthesized so that
 * the macro can be used safely inside larger expressions such as
 * RE_ENGINE_PTR->member. */
#ifndef PERL_IN_XSUB_RE
#define RE_ENGINE_PTR (&PL_core_reg_engine)
#else
extern const struct regexp_engine my_reg_engine;
#define RE_ENGINE_PTR (&my_reg_engine)
#endif
4128
4129 #ifndef PERL_IN_XSUB_RE
4130 REGEXP *
4131 Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags)
4132 {
4133     dVAR;
4134     HV * const table = GvHV(PL_hintgv);
4135     /* Dispatch a request to compile a regexp to correct
4136        regexp engine. */
4137     if (table) {
4138         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4139         GET_RE_DEBUG_FLAGS_DECL;
4140         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4141             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4142             DEBUG_COMPILE_r({
4143                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4144                     SvIV(*ptr));
4145             });
4146             return CALLREGCOMP_ENG(eng, pattern, flags);
4147         }
4148     }
4149     return Perl_re_compile(aTHX_ pattern, flags);
4150 }
4151 #endif
4152
4153 REGEXP *
4154 Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags)
4155 {
4156     dVAR;
4157     REGEXP *rx;
4158     struct regexp *r;
4159     register regexp_internal *ri;
4160     STRLEN plen;
4161     char*  exp = SvPV((SV*)pattern, plen);
4162     char* xend = exp + plen;
4163     regnode *scan;
4164     I32 flags;
4165     I32 minlen = 0;
4166     I32 sawplus = 0;
4167     I32 sawopen = 0;
4168     scan_data_t data;
4169     RExC_state_t RExC_state;
4170     RExC_state_t * const pRExC_state = &RExC_state;
4171 #ifdef TRIE_STUDY_OPT
4172     int restudied= 0;
4173     RExC_state_t copyRExC_state;
4174 #endif
4175     GET_RE_DEBUG_FLAGS_DECL;
4176     DEBUG_r(if (!PL_colorset) reginitcolors());
4177
4178     RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8;
4179
4180     DEBUG_COMPILE_r({
4181         SV *dsv= sv_newmortal();
4182         RE_PV_QUOTED_DECL(s, RExC_utf8,
4183             dsv, exp, plen, 60);
4184         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4185                        PL_colors[4],PL_colors[5],s);
4186     });
4187
4188 redo_first_pass:
4189     RExC_precomp = exp;
4190     RExC_flags = pm_flags;
4191     RExC_sawback = 0;
4192
4193     RExC_seen = 0;
4194     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4195     RExC_seen_evals = 0;
4196     RExC_extralen = 0;
4197
4198     /* First pass: determine size, legality. */
4199     RExC_parse = exp;
4200     RExC_start = exp;
4201     RExC_end = xend;
4202     RExC_naughty = 0;
4203     RExC_npar = 1;
4204     RExC_nestroot = 0;
4205     RExC_size = 0L;
4206     RExC_emit = &PL_regdummy;
4207     RExC_whilem_seen = 0;
4208     RExC_charnames = NULL;
4209     RExC_open_parens = NULL;
4210     RExC_close_parens = NULL;
4211     RExC_opend = NULL;
4212     RExC_paren_names = NULL;
4213 #ifdef DEBUGGING
4214     RExC_paren_name_list = NULL;
4215 #endif
4216     RExC_recurse = NULL;
4217     RExC_recurse_count = 0;
4218
4219 #if 0 /* REGC() is (currently) a NOP at the first pass.
4220        * Clever compilers notice this and complain. --jhi */
4221     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4222 #endif
4223     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4224     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4225         RExC_precomp = NULL;
4226         return(NULL);
4227     }
4228     if (RExC_utf8 && !RExC_orig_utf8) {
4229         /* It's possible to write a regexp in ascii that represents Unicode
4230         codepoints outside of the byte range, such as via \x{100}. If we
4231         detect such a sequence we have to convert the entire pattern to utf8
4232         and then recompile, as our sizing calculation will have been based
4233         on 1 byte == 1 character, but we will need to use utf8 to encode
4234         at least some part of the pattern, and therefore must convert the whole
4235         thing.
4236         XXX: somehow figure out how to make this less expensive...
4237         -- dmq */
4238         STRLEN len = plen;
4239         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4240             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4241         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4242         xend = exp + len;
4243         RExC_orig_utf8 = RExC_utf8;
4244         SAVEFREEPV(exp);
4245         goto redo_first_pass;
4246     }
4247     DEBUG_PARSE_r({
4248         PerlIO_printf(Perl_debug_log,
4249             "Required size %"IVdf" nodes\n"
4250             "Starting second pass (creation)\n",
4251             (IV)RExC_size);
4252         RExC_lastnum=0;
4253         RExC_lastparse=NULL;
4254     });
4255     /* Small enough for pointer-storage convention?
4256        If extralen==0, this means that we will not need long jumps. */
4257     if (RExC_size >= 0x10000L && RExC_extralen)
4258         RExC_size += RExC_extralen;
4259     else
4260         RExC_extralen = 0;
4261     if (RExC_whilem_seen > 15)
4262         RExC_whilem_seen = 15;
4263
4264     /* Allocate space, then zero-initialize it.  Note that the zeroing is a
4265        separate second step (and covers the whole program buffer in
4266        DEBUGGING builds), so anything assigned has to happen after that. */
4267     rx = newSV_type(SVt_REGEXP);
4268     r = (struct regexp*)SvANY(rx);
4269     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4270          char, regexp_internal);
4271     if ( r == NULL || ri == NULL )
4272         FAIL("Regexp out of space");
4273 #ifdef DEBUGGING
4274     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4275     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4276 #else
4277     /* bulk initialize base fields with 0. */
4278     Zero(ri, sizeof(regexp_internal), char);
4279 #endif
4280
4281     /* non-zero initialization begins here */
4282     RXi_SET( r, ri );
4283     r->engine= RE_ENGINE_PTR;
4284     r->extflags = pm_flags;
4285     {
4286         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4287         bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4288         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4289         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4290                             >> RXf_PMf_STD_PMMOD_SHIFT);
4291         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4292         char *p;
4293         RXp_WRAPLEN(r) = plen + has_minus + has_p + has_runon
4294             + (sizeof(STD_PAT_MODS) - 1)
4295             + (sizeof("(?:)") - 1);
4296
4297         p = sv_grow(rx, RXp_WRAPLEN(r) + 1);
4298         SvCUR_set(rx, RXp_WRAPLEN(r));
4299         SvPOK_on(rx);
4300         *p++='('; *p++='?';
4301         if (has_p)
4302             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4303         {
4304             char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4305             char *colon = r + 1;
4306             char ch;
4307
4308             while((ch = *fptr++)) {
4309                 if(reganch & 1)
4310                     *p++ = ch;
4311                 else
4312                     *r-- = ch;
4313                 reganch >>= 1;
4314             }
4315             if(has_minus) {
4316                 *r = '-';
4317                 p = colon;
4318             }
4319         }
4320
4321         *p++ = ':';
4322         Copy(RExC_precomp, p, plen, char);
4323         assert ((RX_WRAPPED(rx) - p) < 16);
4324         r->pre_prefix = p - RX_WRAPPED(rx);
4325         p += plen;
4326         if (has_runon)
4327             *p++ = '\n';
4328         *p++ = ')';
4329         *p = 0;
4330     }
4331
4332     r->intflags = 0;
4333     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4334
4335     if (RExC_seen & REG_SEEN_RECURSE) {
4336         Newxz(RExC_open_parens, RExC_npar,regnode *);
4337         SAVEFREEPV(RExC_open_parens);
4338         Newxz(RExC_close_parens,RExC_npar,regnode *);
4339         SAVEFREEPV(RExC_close_parens);
4340     }
4341
4342     /* Useful during FAIL. */
4343 #ifdef RE_TRACK_PATTERN_OFFSETS
4344     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4345     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4346                           "%s %"UVuf" bytes for offset annotations.\n",
4347                           ri->u.offsets ? "Got" : "Couldn't get",
4348                           (UV)((2*RExC_size+1) * sizeof(U32))));
4349 #endif
4350     SetProgLen(ri,RExC_size);
4351     RExC_rx_sv = rx;
4352     RExC_rx = r;
4353     RExC_rxi = ri;
4354
4355     /* Second pass: emit code. */
4356     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4357     RExC_parse = exp;
4358     RExC_end = xend;
4359     RExC_naughty = 0;
4360     RExC_npar = 1;
4361     RExC_emit_start = ri->program;
4362     RExC_emit = ri->program;
4363     RExC_emit_bound = ri->program + RExC_size + 1;
4364
4365     /* Store the count of eval-groups for security checks: */
4366     RExC_rx->seen_evals = RExC_seen_evals;
4367     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4368     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4369         ReREFCNT_dec(rx);
4370         return(NULL);
4371     }
4372     /* XXXX To minimize changes to RE engine we always allocate
4373        3-units-long substrs field. */
4374     Newx(r->substrs, 1, struct reg_substr_data);
4375     if (RExC_recurse_count) {
4376         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4377         SAVEFREEPV(RExC_recurse);
4378     }
4379
4380 reStudy:
4381     r->minlen = minlen = sawplus = sawopen = 0;
4382     Zero(r->substrs, 1, struct reg_substr_data);
4383
4384 #ifdef TRIE_STUDY_OPT
4385     if ( restudied ) {
4386         U32 seen=RExC_seen;
4387         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4388
4389         RExC_state = copyRExC_state;
4390         if (seen & REG_TOP_LEVEL_BRANCHES)
4391             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4392         else
4393             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4394         if (data.last_found) {
4395             SvREFCNT_dec(data.longest_fixed);
4396             SvREFCNT_dec(data.longest_float);
4397             SvREFCNT_dec(data.last_found);
4398         }
4399         StructCopy(&zero_scan_data, &data, scan_data_t);
4400     } else {
4401         StructCopy(&zero_scan_data, &data, scan_data_t);
4402         copyRExC_state = RExC_state;
4403     }
4404 #else
4405     StructCopy(&zero_scan_data, &data, scan_data_t);
4406 #endif
4407
4408     /* Dig out information for optimizations. */
4409     r->extflags = RExC_flags; /* was pm_op */
4410     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4411
4412     if (UTF)
4413         r->extflags |= RXf_UTF8;        /* Unicode in it? */
4414     ri->regstclass = NULL;
4415     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4416         r->intflags |= PREGf_NAUGHTY;
4417     scan = ri->program + 1;             /* First BRANCH. */
4418
4419     /* testing for BRANCH here tells us whether there is "must appear"
4420        data in the pattern. If there is then we can use it for optimisations */
4421     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4422         I32 fake;
4423         STRLEN longest_float_length, longest_fixed_length;
4424         struct regnode_charclass_class ch_class; /* pointed to by data */
4425         int stclass_flag;
4426         I32 last_close = 0; /* pointed to by data */
4427         regnode *first= scan;
4428         regnode *first_next= regnext(first);
4429
4430         /* Skip introductions and multiplicators >= 1. */
4431         while ((OP(first) == OPEN && (sawopen = 1)) ||
4432                /* An OR of *one* alternative - should not happen now. */
4433             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4434             /* for now we can't handle lookbehind IFMATCH*/
4435             (OP(first) == IFMATCH && !first->flags) ||
4436             (OP(first) == PLUS) ||
4437             (OP(first) == MINMOD) ||
4438                /* An {n,m} with n>0 */
4439             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4440             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4441         {
4442
4443                 if (OP(first) == PLUS)
4444                     sawplus = 1;
4445                 else
4446                     first += regarglen[OP(first)];
4447                 if (OP(first) == IFMATCH) {
4448                     first = NEXTOPER(first);
4449                     first += EXTRA_STEP_2ARGS;
4450                 } else  /* XXX possible optimisation for /(?=)/  */
4451                     first = NEXTOPER(first);
4452                 first_next= regnext(first);
4453         }
4454
4455         /* Starting-point info. */
4456       again:
4457         DEBUG_PEEP("first:",first,0);
4458         /* Ignore EXACT as we deal with it later. */
4459         if (PL_regkind[OP(first)] == EXACT) {
4460             if (OP(first) == EXACT)
4461                 NOOP;   /* Empty, get anchored substr later. */
4462             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4463                 ri->regstclass = first;
4464         }
4465 #ifdef TRIE_STCLASS
4466         else if (PL_regkind[OP(first)] == TRIE &&
4467                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4468         {
4469             regnode *trie_op;
4470             /* this can happen only on restudy */
4471             if ( OP(first) == TRIE ) {
4472                 struct regnode_1 *trieop = (struct regnode_1 *)
4473                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4474                 StructCopy(first,trieop,struct regnode_1);
4475                 trie_op=(regnode *)trieop;
4476             } else {
4477                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4478                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4479                 StructCopy(first,trieop,struct regnode_charclass);
4480                 trie_op=(regnode *)trieop;
4481             }
4482             OP(trie_op)+=2;
4483             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4484             ri->regstclass = trie_op;
4485         }
4486 #endif
4487         else if (strchr((const char*)PL_simple,OP(first)))
4488             ri->regstclass = first;
4489         else if (PL_regkind[OP(first)] == BOUND ||
4490                  PL_regkind[OP(first)] == NBOUND)
4491             ri->regstclass = first;
4492         else if (PL_regkind[OP(first)] == BOL) {
4493             r->extflags |= (OP(first) == MBOL
4494                            ? RXf_ANCH_MBOL
4495                            : (OP(first) == SBOL
4496                               ? RXf_ANCH_SBOL
4497                               : RXf_ANCH_BOL));
4498             first = NEXTOPER(first);
4499             goto again;
4500         }
4501         else if (OP(first) == GPOS) {
4502             r->extflags |= RXf_ANCH_GPOS;
4503             first = NEXTOPER(first);
4504             goto again;
4505         }
4506         else if ((!sawopen || !RExC_sawback) &&
4507             (OP(first) == STAR &&
4508             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4509             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4510         {
4511             /* turn .* into ^.* with an implied $*=1 */
4512             const int type =
4513                 (OP(NEXTOPER(first)) == REG_ANY)
4514                     ? RXf_ANCH_MBOL
4515                     : RXf_ANCH_SBOL;
4516             r->extflags |= type;
4517             r->intflags |= PREGf_IMPLICIT;
4518             first = NEXTOPER(first);
4519             goto again;
4520         }
4521         if (sawplus && (!sawopen || !RExC_sawback)
4522             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4523             /* x+ must match at the 1st pos of run of x's */
4524             r->intflags |= PREGf_SKIP;
4525
4526         /* Scan is after the zeroth branch, first is atomic matcher. */
4527 #ifdef TRIE_STUDY_OPT
4528         DEBUG_PARSE_r(
4529             if (!restudied)
4530                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4531                               (IV)(first - scan + 1))
4532         );
4533 #else
4534         DEBUG_PARSE_r(
4535             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4536                 (IV)(first - scan + 1))
4537         );
4538 #endif
4539
4540
4541         /*
4542         * If there's something expensive in the r.e., find the
4543         * longest literal string that must appear and make it the
4544         * regmust.  Resolve ties in favor of later strings, since
4545         * the regstart check works with the beginning of the r.e.
4546         * and avoiding duplication strengthens checking.  Not a
4547         * strong reason, but sufficient in the absence of others.
4548         * [Now we resolve ties in favor of the earlier string if
4549         * it happens that c_offset_min has been invalidated, since the
4550         * earlier string may buy us something the later one won't.]
4551         */
4552
4553         data.longest_fixed = newSVpvs("");
4554         data.longest_float = newSVpvs("");
4555         data.last_found = newSVpvs("");
4556         data.longest = &(data.longest_fixed);
4557         first = scan;
4558         if (!ri->regstclass) {
4559             cl_init(pRExC_state, &ch_class);
4560             data.start_class = &ch_class;
4561             stclass_flag = SCF_DO_STCLASS_AND;
4562         } else                          /* XXXX Check for BOUND? */
4563             stclass_flag = 0;
4564         data.last_closep = &last_close;
4565
4566         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4567             &data, -1, NULL, NULL,
4568             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4569
4570
4571         CHECK_RESTUDY_GOTO;
4572
4573
4574         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4575              && data.last_start_min == 0 && data.last_end > 0
4576              && !RExC_seen_zerolen
4577              && !(RExC_seen & REG_SEEN_VERBARG)
4578              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4579             r->extflags |= RXf_CHECK_ALL;
4580         scan_commit(pRExC_state, &data,&minlen,0);
4581         SvREFCNT_dec(data.last_found);
4582
4583         /* Note that code very similar to this but for anchored string
4584            follows immediately below, changes may need to be made to both.
4585            Be careful.
4586          */
4587         longest_float_length = CHR_SVLEN(data.longest_float);
4588         if (longest_float_length
4589             || (data.flags & SF_FL_BEFORE_EOL
4590                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4591                     || (RExC_flags & RXf_PMf_MULTILINE))))
4592         {
4593             I32 t,ml;
4594
4595             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4596                 && data.offset_fixed == data.offset_float_min
4597                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4598                     goto remove_float;          /* As in (a)+. */
4599
4600             /* copy the information about the longest float from the reg_scan_data
4601                over to the program. */
4602             if (SvUTF8(data.longest_float)) {
4603                 r->float_utf8 = data.longest_float;
4604                 r->float_substr = NULL;
4605             } else {
4606                 r->float_substr = data.longest_float;
4607                 r->float_utf8 = NULL;
4608             }
4609             /* float_end_shift is how many chars that must be matched that
4610                follow this item. We calculate it ahead of time as once the
4611                lookbehind offset is added in we lose the ability to correctly
4612                calculate it.*/
4613             ml = data.minlen_float ? *(data.minlen_float)
4614                                    : (I32)longest_float_length;
4615             r->float_end_shift = ml - data.offset_float_min
4616                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4617                 + data.lookbehind_float;
4618             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4619             r->float_max_offset = data.offset_float_max;
4620             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4621                 r->float_max_offset -= data.lookbehind_float;
4622
4623             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4624                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4625                            || (RExC_flags & RXf_PMf_MULTILINE)));
4626             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4627         }
4628         else {
4629           remove_float:
4630             r->float_substr = r->float_utf8 = NULL;
4631             SvREFCNT_dec(data.longest_float);
4632             longest_float_length = 0;
4633         }
4634
4635         /* Note that code very similar to this but for floating string
4636            is immediately above, changes may need to be made to both.
4637            Be careful.
4638          */
4639         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4640         if (longest_fixed_length
4641             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4642                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4643                     || (RExC_flags & RXf_PMf_MULTILINE))))
4644         {
4645             I32 t,ml;
4646
4647             /* copy the information about the longest fixed
4648                from the reg_scan_data over to the program. */
4649             if (SvUTF8(data.longest_fixed)) {
4650                 r->anchored_utf8 = data.longest_fixed;
4651                 r->anchored_substr = NULL;
4652             } else {
4653                 r->anchored_substr = data.longest_fixed;
4654                 r->anchored_utf8 = NULL;
4655             }
4656             /* fixed_end_shift is how many chars that must be matched that
4657                follow this item. We calculate it ahead of time as once the
4658                lookbehind offset is added in we lose the ability to correctly
4659                calculate it.*/
4660             ml = data.minlen_fixed ? *(data.minlen_fixed)
4661                                    : (I32)longest_fixed_length;
4662             r->anchored_end_shift = ml - data.offset_fixed
4663                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4664                 + data.lookbehind_fixed;
4665             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4666
4667             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4668                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4669                      || (RExC_flags & RXf_PMf_MULTILINE)));
4670             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4671         }
4672         else {
4673             r->anchored_substr = r->anchored_utf8 = NULL;
4674             SvREFCNT_dec(data.longest_fixed);
4675             longest_fixed_length = 0;
4676         }
4677         if (ri->regstclass
4678             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4679             ri->regstclass = NULL;
4680         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4681             && stclass_flag
4682             && !(data.start_class->flags & ANYOF_EOS)
4683             && !cl_is_anything(data.start_class))
4684         {
4685             const U32 n = add_data(pRExC_state, 1, "f");
4686
4687             Newx(RExC_rxi->data->data[n], 1,
4688                 struct regnode_charclass_class);
4689             StructCopy(data.start_class,
4690                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4691                        struct regnode_charclass_class);
4692             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4693             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4694             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4695                       regprop(r, sv, (regnode*)data.start_class);
4696                       PerlIO_printf(Perl_debug_log,
4697                                     "synthetic stclass \"%s\".\n",
4698                                     SvPVX_const(sv));});
4699         }
4700
4701         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4702         if (longest_fixed_length > longest_float_length) {
4703             r->check_end_shift = r->anchored_end_shift;
4704             r->check_substr = r->anchored_substr;
4705             r->check_utf8 = r->anchored_utf8;
4706             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4707             if (r->extflags & RXf_ANCH_SINGLE)
4708                 r->extflags |= RXf_NOSCAN;
4709         }
4710         else {
4711             r->check_end_shift = r->float_end_shift;
4712             r->check_substr = r->float_substr;
4713             r->check_utf8 = r->float_utf8;
4714             r->check_offset_min = r->float_min_offset;
4715             r->check_offset_max = r->float_max_offset;
4716         }
4717         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4718            This should be changed ASAP!  */
4719         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4720             r->extflags |= RXf_USE_INTUIT;
4721             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4722                 r->extflags |= RXf_INTUIT_TAIL;
4723         }
4724         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4725         if ( (STRLEN)minlen < longest_float_length )
4726             minlen= longest_float_length;
4727         if ( (STRLEN)minlen < longest_fixed_length )
4728             minlen= longest_fixed_length;
4729         */
4730     }
4731     else {
4732         /* Several top-level alternatives.  The best we can do is to set minlen. */
4733         I32 fake;
4734         struct regnode_charclass_class ch_class;
4735         I32 last_close = 0;
4736
4737         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4738
4739         scan = ri->program + 1;
4740         cl_init(pRExC_state, &ch_class);
4741         data.start_class = &ch_class;
4742         data.last_closep = &last_close;
4743
4744
4745         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4746             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4747
4748         CHECK_RESTUDY_GOTO;
4749
4750         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4751                 = r->float_substr = r->float_utf8 = NULL;
4752         if (!(data.start_class->flags & ANYOF_EOS)
4753             && !cl_is_anything(data.start_class))
4754         {
4755             const U32 n = add_data(pRExC_state, 1, "f");
4756
4757             Newx(RExC_rxi->data->data[n], 1,
4758                 struct regnode_charclass_class);
4759             StructCopy(data.start_class,
4760                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4761                        struct regnode_charclass_class);
4762             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4763             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4764             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4765                       regprop(r, sv, (regnode*)data.start_class);
4766                       PerlIO_printf(Perl_debug_log,
4767                                     "synthetic stclass \"%s\".\n",
4768                                     SvPVX_const(sv));});
4769         }
4770     }
4771
4772     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4773        the "real" pattern. */
4774     DEBUG_OPTIMISE_r({
4775         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4776                       (IV)minlen, (IV)r->minlen);
4777     });
4778     r->minlenret = minlen;
4779     if (r->minlen < minlen)
4780         r->minlen = minlen;
4781
4782     if (RExC_seen & REG_SEEN_GPOS)
4783         r->extflags |= RXf_GPOS_SEEN;
4784     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4785         r->extflags |= RXf_LOOKBEHIND_SEEN;
4786     if (RExC_seen & REG_SEEN_EVAL)
4787         r->extflags |= RXf_EVAL_SEEN;
4788     if (RExC_seen & REG_SEEN_CANY)
4789         r->extflags |= RXf_CANY_SEEN;
4790     if (RExC_seen & REG_SEEN_VERBARG)
4791         r->intflags |= PREGf_VERBARG_SEEN;
4792     if (RExC_seen & REG_SEEN_CUTGROUP)
4793         r->intflags |= PREGf_CUTGROUP_SEEN;
4794     if (RExC_paren_names)
4795         r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4796     else
4797         r->paren_names = NULL;
4798
4799 #ifdef STUPID_PATTERN_CHECKS
4800     if (RX_PRELEN(r) == 0)
4801         r->extflags |= RXf_NULL;
4802     if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(rx)[0] == ' ')
4803         /* XXX: this should happen BEFORE we compile */
4804         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4805     else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RXp_PRECOMP(r), 3))
4806         r->extflags |= RXf_WHITE;
4807     else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^')
4808         r->extflags |= RXf_START_ONLY;
4809 #else
4810     if (r->extflags & RXf_SPLIT && RXp_PRELEN(r) == 1 && RX_PRECOMP(rx)[0] == ' ')
4811             /* XXX: this should happen BEFORE we compile */
4812             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4813     else {
4814         regnode *first = ri->program + 1;
4815         U8 fop = OP(first);
4816         U8 nop = OP(NEXTOPER(first));
4817
4818         if (PL_regkind[fop] == NOTHING && nop == END)
4819             r->extflags |= RXf_NULL;
4820         else if (PL_regkind[fop] == BOL && nop == END)
4821             r->extflags |= RXf_START_ONLY;
4822         else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4823             r->extflags |= RXf_WHITE;
4824     }
4825 #endif
4826 #ifdef DEBUGGING
4827     if (RExC_paren_names) {
4828         ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4829         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4830     } else
4831 #endif
4832         ri->name_list_idx = 0;
4833
4834     if (RExC_recurse_count) {
4835         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4836             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4837             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4838         }
4839     }
4840     Newxz(r->offs, RExC_npar, regexp_paren_pair);
4841     /* assume we don't need to swap parens around before we match */
4842
4843     DEBUG_DUMP_r({
4844         PerlIO_printf(Perl_debug_log,"Final program:\n");
4845         regdump(r);
4846     });
4847 #ifdef RE_TRACK_PATTERN_OFFSETS
4848     DEBUG_OFFSETS_r(if (ri->u.offsets) {
4849         const U32 len = ri->u.offsets[0];
4850         U32 i;
4851         GET_RE_DEBUG_FLAGS_DECL;
4852         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
4853         for (i = 1; i <= len; i++) {
4854             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
4855                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4856                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
4857             }
4858         PerlIO_printf(Perl_debug_log, "\n");
4859     });
4860 #endif
4861     return rx;
4862 }
4863
4864 #undef RE_ENGINE_PTR
4865
4866
4867 SV*
4868 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4869                     const U32 flags)
4870 {
4871     PERL_UNUSED_ARG(value);
4872
4873     if (flags & RXapif_FETCH) {
4874         return reg_named_buff_fetch(rx, key, flags);
4875     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
4876         Perl_croak(aTHX_ PL_no_modify);
4877         return NULL;
4878     } else if (flags & RXapif_EXISTS) {
4879         return reg_named_buff_exists(rx, key, flags)
4880             ? &PL_sv_yes
4881             : &PL_sv_no;
4882     } else if (flags & RXapif_REGNAMES) {
4883         return reg_named_buff_all(rx, flags);
4884     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
4885         return reg_named_buff_scalar(rx, flags);
4886     } else {
4887         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4888         return NULL;
4889     }
4890 }
4891
4892 SV*
4893 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4894                          const U32 flags)
4895 {
4896     PERL_UNUSED_ARG(lastkey);
4897
4898     if (flags & RXapif_FIRSTKEY)
4899         return reg_named_buff_firstkey(rx, flags);
4900     else if (flags & RXapif_NEXTKEY)
4901         return reg_named_buff_nextkey(rx, flags);
4902     else {
4903         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4904         return NULL;
4905     }
4906 }
4907
4908 SV*
4909 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4910                           const U32 flags)
4911 {
4912     AV *retarray = NULL;
4913     SV *ret;
4914     struct regexp *const rx = (struct regexp *)SvANY(r);
4915     if (flags & RXapif_ALL)
4916         retarray=newAV();
4917
4918     if (rx && rx->paren_names) {
4919         HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4920         if (he_str) {
4921             IV i;
4922             SV* sv_dat=HeVAL(he_str);
4923             I32 *nums=(I32*)SvPVX(sv_dat);
4924             for ( i=0; i<SvIVX(sv_dat); i++ ) {
4925                 if ((I32)(rx->nparens) >= nums[i]
4926                     && rx->offs[nums[i]].start != -1
4927                     && rx->offs[nums[i]].end != -1)
4928                 {
4929                     ret = newSVpvs("");
4930                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
4931                     if (!retarray)
4932                         return ret;
4933                 } else {
4934                     ret = newSVsv(&PL_sv_undef);
4935                 }
4936                 if (retarray) {
4937                     SvREFCNT_inc_simple_void(ret);
4938                     av_push(retarray, ret);
4939                 }
4940             }
4941             if (retarray)
4942                 return newRV((SV*)retarray);
4943         }
4944     }
4945     return NULL;
4946 }
4947
4948 bool
4949 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
4950                            const U32 flags)
4951 {
4952     struct regexp *const rx = (struct regexp *)SvANY(r);
4953     if (rx && rx->paren_names) {
4954         if (flags & RXapif_ALL) {
4955             return hv_exists_ent(rx->paren_names, key, 0);
4956         } else {
4957             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
4958             if (sv) {
4959                 SvREFCNT_dec(sv);
4960                 return TRUE;
4961             } else {
4962                 return FALSE;
4963             }
4964         }
4965     } else {
4966         return FALSE;
4967     }
4968 }
4969
4970 SV*
4971 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
4972 {
4973     struct regexp *const rx = (struct regexp *)SvANY(r);
4974     if ( rx && rx->paren_names ) {
4975         (void)hv_iterinit(rx->paren_names);
4976
4977         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
4978     } else {
4979         return FALSE;
4980     }
4981 }
4982
4983 SV*
4984 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
4985 {
4986     struct regexp *const rx = (struct regexp *)SvANY(r);
4987     if (rx && rx->paren_names) {
4988         HV *hv = rx->paren_names;
4989         HE *temphe;
4990         while ( (temphe = hv_iternext_flags(hv,0)) ) {
4991             IV i;
4992             IV parno = 0;
4993             SV* sv_dat = HeVAL(temphe);
4994             I32 *nums = (I32*)SvPVX(sv_dat);
4995             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
4996                 if ((I32)(rx->lastcloseparen) >= nums[i] &&
4997                     rx->offs[nums[i]].start != -1 &&
4998                     rx->offs[nums[i]].end != -1)
4999                 {
5000                     parno = nums[i];
5001                     break;
5002                 }
5003             }
5004             if (parno || flags & RXapif_ALL) {
5005                 return newSVhek(HeKEY_hek(temphe));
5006             }
5007         }
5008     }
5009     return NULL;
5010 }
5011
5012 SV*
5013 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5014 {
5015     SV *ret;
5016     AV *av;
5017     I32 length;
5018     struct regexp *const rx = (struct regexp *)SvANY(r);
5019
5020     if (rx && rx->paren_names) {
5021         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5022             return newSViv(HvTOTALKEYS(rx->paren_names));
5023         } else if (flags & RXapif_ONE) {
5024             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5025             av = (AV*)SvRV(ret);
5026             length = av_len(av);
5027             return newSViv(length + 1);
5028         } else {
5029             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5030             return NULL;
5031         }
5032     }
5033     return &PL_sv_undef;
5034 }
5035
5036 SV*
5037 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5038 {
5039     struct regexp *const rx = (struct regexp *)SvANY(r);
5040     AV *av = newAV();
5041
5042     if (rx && rx->paren_names) {
5043         HV *hv= rx->paren_names;
5044         HE *temphe;
5045         (void)hv_iterinit(hv);
5046         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5047             IV i;
5048             IV parno = 0;
5049             SV* sv_dat = HeVAL(temphe);
5050             I32 *nums = (I32*)SvPVX(sv_dat);
5051             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5052                 if ((I32)(rx->lastcloseparen) >= nums[i] &&
5053                     rx->offs[nums[i]].start != -1 &&
5054                     rx->offs[nums[i]].end != -1)
5055                 {
5056                     parno = nums[i];
5057                     break;
5058                 }
5059             }
5060             if (parno || flags & RXapif_ALL) {
5061                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5062             }
5063         }
5064     }
5065
5066     return newRV((SV*)av);
5067 }
5068
5069 void
5070 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5071                              SV * const sv)
5072 {
5073     struct regexp *const rx = (struct regexp *)SvANY(r);
5074     char *s = NULL;
5075     I32 i = 0;
5076     I32 s1, t1;
5077
5078     if (!rx->subbeg) {
5079         sv_setsv(sv,&PL_sv_undef);
5080         return;
5081     }
5082     else
5083     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5084         /* $` */
5085         i = rx->offs[0].start;
5086         s = rx->subbeg;
5087     }
5088     else
5089     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5090         /* $' */
5091         s = rx->subbeg + rx->offs[0].end;
5092         i = rx->sublen - rx->offs[0].end;
5093     }
5094     else
5095     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5096         (s1 = rx->offs[paren].start) != -1 &&
5097         (t1 = rx->offs[paren].end) != -1)
5098     {
5099         /* $& $1 ... */
5100         i = t1 - s1;
5101         s = rx->subbeg + s1;
5102     } else {
5103         sv_setsv(sv,&PL_sv_undef);
5104         return;
5105     }
5106     assert(rx->sublen >= (s - rx->subbeg) + i );
5107     if (i >= 0) {
5108         const int oldtainted = PL_tainted;
5109         TAINT_NOT;
5110         sv_setpvn(sv, s, i);
5111         PL_tainted = oldtainted;
5112         if ( (rx->extflags & RXf_CANY_SEEN)
5113             ? (RXp_MATCH_UTF8(rx)
5114                         && (!i || is_utf8_string((U8*)s, i)))
5115             : (RXp_MATCH_UTF8(rx)) )
5116         {
5117             SvUTF8_on(sv);
5118         }
5119         else
5120             SvUTF8_off(sv);
5121         if (PL_tainting) {
5122             if (RXp_MATCH_TAINTED(rx)) {
5123                 if (SvTYPE(sv) >= SVt_PVMG) {
5124                     MAGIC* const mg = SvMAGIC(sv);
5125                     MAGIC* mgt;
5126                     PL_tainted = 1;
5127                     SvMAGIC_set(sv, mg->mg_moremagic);
5128                     SvTAINT(sv);
5129                     if ((mgt = SvMAGIC(sv))) {
5130                         mg->mg_moremagic = mgt;
5131                         SvMAGIC_set(sv, mg);
5132                     }
5133                 } else {
5134                     PL_tainted = 1;
5135                     SvTAINT(sv);
5136                 }
5137             } else
5138                 SvTAINTED_off(sv);
5139         }
5140     } else {
5141         sv_setsv(sv,&PL_sv_undef);
5142         return;
5143     }
5144 }
5145
5146 void
5147 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5148                                                          SV const * const value)
5149 {
5150     PERL_UNUSED_ARG(rx);
5151     PERL_UNUSED_ARG(paren);
5152     PERL_UNUSED_ARG(value);
5153
5154     if (!PL_localizing)
5155         Perl_croak(aTHX_ PL_no_modify);
5156 }
5157
5158 I32
5159 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5160                               const I32 paren)
5161 {
5162     struct regexp *const rx = (struct regexp *)SvANY(r);
5163     I32 i;
5164     I32 s1, t1;
5165
5166     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5167         switch (paren) {
5168       /* $` / ${^PREMATCH} */
5169       case RX_BUFF_IDX_PREMATCH:
5170         if (rx->offs[0].start != -1) {
5171                         i = rx->offs[0].start;
5172                         if (i > 0) {
5173                                 s1 = 0;
5174                                 t1 = i;
5175                                 goto getlen;
5176                         }
5177             }
5178         return 0;
5179       /* $' / ${^POSTMATCH} */
5180       case RX_BUFF_IDX_POSTMATCH:
5181             if (rx->offs[0].end != -1) {
5182                         i = rx->sublen - rx->offs[0].end;
5183                         if (i > 0) {
5184                                 s1 = rx->offs[0].end;
5185                                 t1 = rx->sublen;
5186                                 goto getlen;
5187                         }
5188             }
5189         return 0;
5190       /* $& / ${^MATCH}, $1, $2, ... */
5191       default:
5192             if (paren <= (I32)rx->nparens &&
5193             (s1 = rx->offs[paren].start) != -1 &&
5194             (t1 = rx->offs[paren].end) != -1)
5195             {
5196             i = t1 - s1;
5197             goto getlen;
5198         } else {
5199             if (ckWARN(WARN_UNINITIALIZED))
5200                 report_uninit((SV*)sv);
5201             return 0;
5202         }
5203     }
5204   getlen:
5205     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5206         const char * const s = rx->subbeg + s1;
5207         const U8 *ep;
5208         STRLEN el;
5209
5210         i = t1 - s1;
5211         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5212                         i = el;
5213     }
5214     return i;
5215 }
5216
5217 SV*
5218 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5219 {
5220         PERL_UNUSED_ARG(rx);
5221         return NULL;
5222 }
5223
5224 /* Scans the name of a named buffer from the pattern.
5225  * If flags is REG_RSN_RETURN_NULL returns null.
5226  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5227  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5228  * to the parsed name as looked up in the RExC_paren_names hash.
5229  * If there is an error throws a vFAIL().. type exception.
5230  */
5231
5232 #define REG_RSN_RETURN_NULL    0
5233 #define REG_RSN_RETURN_NAME    1
5234 #define REG_RSN_RETURN_DATA    2
5235
5236 STATIC SV*
5237 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
5238     char *name_start = RExC_parse;
5239
5240     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5241          /* skip IDFIRST by using do...while */
5242         if (UTF)
5243             do {
5244                 RExC_parse += UTF8SKIP(RExC_parse);
5245             } while (isALNUM_utf8((U8*)RExC_parse));
5246         else
5247             do {
5248                 RExC_parse++;
5249             } while (isALNUM(*RExC_parse));
5250     }
5251
5252     if ( flags ) {
5253         SV* sv_name
5254             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5255                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5256         if ( flags == REG_RSN_RETURN_NAME)
5257             return sv_name;
5258         else if (flags==REG_RSN_RETURN_DATA) {
5259             HE *he_str = NULL;
5260             SV *sv_dat = NULL;
5261             if ( ! sv_name )      /* should not happen*/
5262                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5263             if (RExC_paren_names)
5264                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5265             if ( he_str )
5266                 sv_dat = HeVAL(he_str);
5267             if ( ! sv_dat )
5268                 vFAIL("Reference to nonexistent named group");
5269             return sv_dat;
5270         }
5271         else {
5272             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5273         }
5274         /* NOT REACHED */
5275     }
5276     return NULL;
5277 }
5278
/*
 * Parse-trace helpers; their bodies are wrapped in DEBUG_PARSE_r().
 *
 * DEBUG_PARSE_MSG(funcname) prints one trace prefix to Perl_debug_log:
 *   - up to 10 characters of the unparsed pattern (followed by "..." when
 *     more remain, "<" otherwise), or blank padding if the parse position
 *     has not moved since the previous trace line;
 *   - the current node number (RExC_size + 1 during the sizing pass,
 *     REG_NODE_NUM(RExC_emit) otherwise), or blank padding if unchanged;
 *   - `funcname`, indented by twice the caller's `depth` variable.
 * It then records the position and node number in RExC_lastparse and
 * RExC_lastnum for the next call.
 *
 * DEBUG_PARSE(funcname) prints that prefix and ends the line.
 * DEBUG_PARSE_FMT(funcname,fmt,args) prints the prefix followed by `fmt`
 * formatted with `args` and a newline.
 *
 * Note: no comma may appear outside parentheses inside these bodies, since
 * the whole brace block is passed as a single macro argument.
 */
#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({                       \
    const int dbg_left  = (int)(RExC_end - RExC_parse);                     \
    const int dbg_trunc = (dbg_left > 10);                                  \
    const int dbg_show  = dbg_trunc ? 10 : dbg_left;                        \
    const int dbg_node  = SIZE_ONLY ? (int)(RExC_size + 1)                  \
                                    : REG_NODE_NUM(RExC_emit);              \
                                                                            \
    if (RExC_lastparse == RExC_parse)                                       \
        PerlIO_printf(Perl_debug_log,"%16s","");                            \
    else                                                                    \
        PerlIO_printf(Perl_debug_log," >%.*s%-*s",                          \
            dbg_show, RExC_parse,                                           \
            (10 - dbg_show) + 4,                                            \
            dbg_trunc ? "..." : "<"                                         \
        );                                                                  \
                                                                            \
    if (RExC_lastnum == dbg_node)                                           \
        PerlIO_printf(Perl_debug_log,"|%4s","");                            \
    else                                                                    \
        PerlIO_printf(Perl_debug_log,"|%4d",dbg_node);                      \
                                                                            \
    PerlIO_printf(Perl_debug_log,"|%*s%-4s",                                \
        (int)((depth*2)), "",                                               \
        (funcname)                                                          \
    );                                                                      \
    RExC_lastnum = dbg_node;                                                \
    RExC_lastparse = RExC_parse;                                            \
})

#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({                           \
    DEBUG_PARSE_MSG((funcname));                                            \
    PerlIO_printf(Perl_debug_log,"%4s","\n");                               \
})

#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({              \
    DEBUG_PARSE_MSG((funcname));                                            \
    PerlIO_printf(Perl_debug_log,fmt "\n",args);                            \
})
5324 /*
5325  - reg - regular expression, i.e. main body or parenthesized thing
5326  *
5327  * Caller must absorb opening parenthesis.
5328  *
5329  * Combining parenthesis handling with the base level of regular expression
5330  * is a trifle forced, but the need to tie the tails of the branches to what
5331  * follows makes it hard to avoid.
5332  */
/* Tail-linking shorthands: both forward their three arguments and add the
 * caller's `depth` variable plus one as the fourth.  REGTAIL_STUDY calls
 * regtail_study() in DEBUGGING builds and plain regtail() otherwise. */
#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
#ifndef DEBUGGING
#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
#else
#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
#endif
5339
5340 STATIC regnode *
5341 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5342     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5343 {
5344     dVAR;
5345     register regnode *ret;              /* Will be the head of the group. */
5346     register regnode *br;
5347     register regnode *lastbr;
5348     register regnode *ender = NULL;
5349     register I32 parno = 0;
5350     I32 flags;
5351     U32 oregflags = RExC_flags;
5352     bool have_branch = 0;
5353     bool is_open = 0;
5354     I32 freeze_paren = 0;
5355     I32 after_freeze = 0;
5356
5357     /* for (?g), (?gc), and (?o) warnings; warning
5358        about (?c) will warn about (?g) -- japhy    */
5359
5360 #define WASTED_O  0x01
5361 #define WASTED_G  0x02
5362 #define WASTED_C  0x04
5363 #define WASTED_GC (0x02|0x04)
5364     I32 wastedflags = 0x00;
5365
5366     char * parse_start = RExC_parse; /* MJD */
5367     char * const oregcomp_parse = RExC_parse;
5368
5369     GET_RE_DEBUG_FLAGS_DECL;
5370     DEBUG_PARSE("reg ");
5371
5372     *flagp = 0;                         /* Tentatively. */
5373
5374
5375     /* Make an OPEN node, if parenthesized. */
5376     if (paren) {
5377         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5378             char *start_verb = RExC_parse;
5379             STRLEN verb_len = 0;
5380             char *start_arg = NULL;
5381             unsigned char op = 0;
5382             int argok = 1;
5383             int internal_argval = 0; /* internal_argval is only useful if !argok */
5384             while ( *RExC_parse && *RExC_parse != ')' ) {
5385                 if ( *RExC_parse == ':' ) {
5386                     start_arg = RExC_parse + 1;
5387                     break;
5388                 }
5389                 RExC_parse++;
5390             }
5391             ++start_verb;
5392             verb_len = RExC_parse - start_verb;
5393             if ( start_arg ) {
5394                 RExC_parse++;
5395                 while ( *RExC_parse && *RExC_parse != ')' )
5396                     RExC_parse++;
5397                 if ( *RExC_parse != ')' )
5398                     vFAIL("Unterminated verb pattern argument");
5399                 if ( RExC_parse == start_arg )
5400                     start_arg = NULL;
5401             } else {
5402                 if ( *RExC_parse != ')' )
5403                     vFAIL("Unterminated verb pattern");
5404             }
5405
5406             switch ( *start_verb ) {
5407             case 'A':  /* (*ACCEPT) */
5408                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5409                     op = ACCEPT;
5410                     internal_argval = RExC_nestroot;
5411                 }
5412                 break;
5413             case 'C':  /* (*COMMIT) */
5414                 if ( memEQs(start_verb,verb_len,"COMMIT") )
5415                     op = COMMIT;
5416                 break;
5417             case 'F':  /* (*FAIL) */
5418                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5419                     op = OPFAIL;
5420                     argok = 0;
5421                 }
5422                 break;
5423             case ':':  /* (*:NAME) */
5424             case 'M':  /* (*MARK:NAME) */
5425                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5426                     op = MARKPOINT;
5427                     argok = -1;
5428                 }
5429                 break;
5430             case 'P':  /* (*PRUNE) */
5431                 if ( memEQs(start_verb,verb_len,"PRUNE") )
5432                     op = PRUNE;
5433                 break;
5434             case 'S':   /* (*SKIP) */
5435                 if ( memEQs(start_verb,verb_len,"SKIP") )
5436                     op = SKIP;
5437                 break;
5438             case 'T':  /* (*THEN) */
5439                 /* [19:06] <TimToady> :: is then */
5440                 if ( memEQs(start_verb,verb_len,"THEN") ) {
5441                     op = CUTGROUP;
5442                     RExC_seen |= REG_SEEN_CUTGROUP;
5443                 }
5444                 break;
5445             }
5446             if ( ! op ) {
5447                 RExC_parse++;
5448                 vFAIL3("Unknown verb pattern '%.*s'",
5449                     verb_len, start_verb);
5450             }
5451             if ( argok ) {
5452                 if ( start_arg && internal_argval ) {
5453                     vFAIL3("Verb pattern '%.*s' may not have an argument",
5454                         verb_len, start_verb);
5455                 } else if ( argok < 0 && !start_arg ) {
5456                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5457                         verb_len, start_verb);
5458                 } else {
5459                     ret = reganode(pRExC_state, op, internal_argval);
5460                     if ( ! internal_argval && ! SIZE_ONLY ) {
5461                         if (start_arg) {
5462                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5463                             ARG(ret) = add_data( pRExC_state, 1, "S" );
5464                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5465                             ret->flags = 0;
5466                         } else {
5467                             ret->flags = 1;
5468                         }
5469                     }
5470                 }
5471                 if (!internal_argval)
5472                     RExC_seen |= REG_SEEN_VERBARG;
5473             } else if ( start_arg ) {
5474                 vFAIL3("Verb pattern '%.*s' may not have an argument",
5475                         verb_len, start_verb);
5476             } else {
5477                 ret = reg_node(pRExC_state, op);
5478             }
5479             nextchar(pRExC_state);
5480             return ret;
5481         } else
5482         if (*RExC_parse == '?') { /* (?...) */
5483             bool is_logical = 0;
5484             const char * const seqstart = RExC_parse;
5485
5486             RExC_parse++;
5487             paren = *RExC_parse++;
5488             ret = NULL;                 /* For look-ahead/behind. */
5489             switch (paren) {
5490
5491             case 'P':   /* (?P...) variants for those used to PCRE/Python */
5492                 paren = *RExC_parse++;
5493                 if ( paren == '<')         /* (?P<...>) named capture */
5494                     goto named_capture;
5495                 else if (paren == '>') {   /* (?P>name) named recursion */
5496                     goto named_recursion;
5497                 }
5498                 else if (paren == '=') {   /* (?P=...)  named backref */
5499                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
5500                        you change this make sure you change that */
5501                     char* name_start = RExC_parse;
5502                     U32 num = 0;
5503                     SV *sv_dat = reg_scan_name(pRExC_state,
5504                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5505                     if (RExC_parse == name_start || *RExC_parse != ')')
5506                         vFAIL2("Sequence %.3s... not terminated",parse_start);
5507
5508                     if (!SIZE_ONLY) {
5509                         num = add_data( pRExC_state, 1, "S" );
5510                         RExC_rxi->data->data[num]=(void*)sv_dat;
5511                         SvREFCNT_inc_simple_void(sv_dat);
5512                     }
5513                     RExC_sawback = 1;
5514                     ret = reganode(pRExC_state,
5515                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5516                            num);
5517                     *flagp |= HASWIDTH;
5518
5519                     Set_Node_Offset(ret, parse_start+1);
5520                     Set_Node_Cur_Length(ret); /* MJD */
5521
5522                     nextchar(pRExC_state);
5523                     return ret;
5524                 }
5525                 RExC_parse++;
5526                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5527                 /*NOTREACHED*/
5528             case '<':           /* (?<...) */
5529                 if (*RExC_parse == '!')
5530                     paren = ',';
5531                 else if (*RExC_parse != '=')
5532               named_capture:
5533                 {               /* (?<...>) */
5534                     char *name_start;
5535                     SV *svname;
5536                     paren= '>';
5537             case '\'':          /* (?'...') */
5538                     name_start= RExC_parse;
5539                     svname = reg_scan_name(pRExC_state,
5540                         SIZE_ONLY ?  /* reverse test from the others */
5541                         REG_RSN_RETURN_NAME :
5542                         REG_RSN_RETURN_NULL);
5543                     if (RExC_parse == name_start) {
5544                         RExC_parse++;
5545                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5546                         /*NOTREACHED*/
5547                     }
5548                     if (*RExC_parse != paren)
5549                         vFAIL2("Sequence (?%c... not terminated",
5550                             paren=='>' ? '<' : paren);
5551                     if (SIZE_ONLY) {
5552                         HE *he_str;
5553                         SV *sv_dat = NULL;
5554                         if (!svname) /* shouldnt happen */
5555                             Perl_croak(aTHX_
5556                                 "panic: reg_scan_name returned NULL");
5557                         if (!RExC_paren_names) {
5558                             RExC_paren_names= newHV();
5559                             sv_2mortal((SV*)RExC_paren_names);
5560 #ifdef DEBUGGING
5561                             RExC_paren_name_list= newAV();
5562                             sv_2mortal((SV*)RExC_paren_name_list);
5563 #endif
5564                         }
5565                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5566                         if ( he_str )
5567                             sv_dat = HeVAL(he_str);
5568                         if ( ! sv_dat ) {
5569                             /* croak baby croak */
5570                             Perl_croak(aTHX_
5571                                 "panic: paren_name hash element allocation failed");
5572                         } else if ( SvPOK(sv_dat) ) {
5573                             /* (?|...) can mean we have dupes so scan to check
5574                                its already been stored. Maybe a flag indicating
5575                                we are inside such a construct would be useful,
5576                                but the arrays are likely to be quite small, so
5577                                for now we punt -- dmq */
5578                             IV count = SvIV(sv_dat);
5579                             I32 *pv = (I32*)SvPVX(sv_dat);
5580                             IV i;
5581                             for ( i = 0 ; i < count ; i++ ) {
5582                                 if ( pv[i] == RExC_npar ) {
5583                                     count = 0;
5584                                     break;
5585                                 }
5586                             }
5587                             if ( count ) {
5588                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5589                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5590                                 pv[count] = RExC_npar;
5591                                 SvIVX(sv_dat)++;
5592                             }
5593                         } else {
5594                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
5595                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5596                             SvIOK_on(sv_dat);
5597                             SvIVX(sv_dat)= 1;
5598                         }
5599 #ifdef DEBUGGING
5600                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5601                             SvREFCNT_dec(svname);
5602 #endif
5603
5604                         /*sv_dump(sv_dat);*/
5605                     }
5606                     nextchar(pRExC_state);
5607                     paren = 1;
5608                     goto capturing_parens;
5609                 }
5610                 RExC_seen |= REG_SEEN_LOOKBEHIND;
5611                 RExC_parse++;
5612             case '=':           /* (?=...) */
5613             case '!':           /* (?!...) */
5614                 RExC_seen_zerolen++;
5615                 if (*RExC_parse == ')') {
5616                     ret=reg_node(pRExC_state, OPFAIL);
5617                     nextchar(pRExC_state);
5618                     return ret;
5619                 }
5620                 break;
5621             case '|':           /* (?|...) */
5622                 /* branch reset, behave like a (?:...) except that
5623                    buffers in alternations share the same numbers */
5624                 paren = ':';
5625                 after_freeze = freeze_paren = RExC_npar;
5626                 break;
5627             case ':':           /* (?:...) */
5628             case '>':           /* (?>...) */
5629                 break;
5630             case '$':           /* (?$...) */
5631             case '@':           /* (?@...) */
5632                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5633                 break;
5634             case '#':           /* (?#...) */
5635                 while (*RExC_parse && *RExC_parse != ')')
5636                     RExC_parse++;
5637                 if (*RExC_parse != ')')
5638                     FAIL("Sequence (?#... not terminated");
5639                 nextchar(pRExC_state);
5640                 *flagp = TRYAGAIN;
5641                 return NULL;
5642             case '0' :           /* (?0) */
5643             case 'R' :           /* (?R) */
5644                 if (*RExC_parse != ')')
5645                     FAIL("Sequence (?R) not terminated");
5646                 ret = reg_node(pRExC_state, GOSTART);
5647                 *flagp |= POSTPONED;
5648                 nextchar(pRExC_state);
5649                 return ret;
5650                 /*notreached*/
5651             { /* named and numeric backreferences */
5652                 I32 num;
5653             case '&':            /* (?&NAME) */
5654                 parse_start = RExC_parse - 1;
5655               named_recursion:
5656                 {
5657                     SV *sv_dat = reg_scan_name(pRExC_state,
5658                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5659                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5660                 }
5661                 goto gen_recurse_regop;
5662                 /* NOT REACHED */
5663             case '+':
5664                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5665                     RExC_parse++;
5666                     vFAIL("Illegal pattern");
5667                 }
5668                 goto parse_recursion;
5669                 /* NOT REACHED*/
5670             case '-': /* (?-1) */
5671                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5672                     RExC_parse--; /* rewind to let it be handled later */
5673                     goto parse_flags;
5674                 }
5675                 /*FALLTHROUGH */
5676             case '1': case '2': case '3': case '4': /* (?1) */
5677             case '5': case '6': case '7': case '8': case '9':
5678                 RExC_parse--;
5679               parse_recursion:
5680                 num = atoi(RExC_parse);
5681                 parse_start = RExC_parse - 1; /* MJD */
5682                 if (*RExC_parse == '-')
5683                     RExC_parse++;
5684                 while (isDIGIT(*RExC_parse))
5685                         RExC_parse++;
5686                 if (*RExC_parse!=')')
5687                     vFAIL("Expecting close bracket");
5688
5689               gen_recurse_regop:
5690                 if ( paren == '-' ) {
5691                     /*
5692                     Diagram of capture buffer numbering.
5693                     Top line is the normal capture buffer numbers
5694                     Bottom line is the negative indexing as from
5695                     the X (the (?-2))
5696
5697                     +   1 2    3 4 5 X          6 7
5698                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5699                     -   5 4    3 2 1 X          x x
5700
5701                     */
5702                     num = RExC_npar + num;
5703                     if (num < 1)  {
5704                         RExC_parse++;
5705                         vFAIL("Reference to nonexistent group");
5706                     }
5707                 } else if ( paren == '+' ) {
5708                     num = RExC_npar + num - 1;
5709                 }
5710
5711                 ret = reganode(pRExC_state, GOSUB, num);
5712                 if (!SIZE_ONLY) {
5713                     if (num > (I32)RExC_rx->nparens) {
5714                         RExC_parse++;
5715                         vFAIL("Reference to nonexistent group");
5716                     }
5717                     ARG2L_SET( ret, RExC_recurse_count++);
5718                     RExC_emit++;
5719                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5720                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5721                 } else {
5722                     RExC_size++;
5723                 }
5724                 RExC_seen |= REG_SEEN_RECURSE;
5725                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5726                 Set_Node_Offset(ret, parse_start); /* MJD */
5727
5728                 *flagp |= POSTPONED;
5729                 nextchar(pRExC_state);
5730                 return ret;
5731             } /* named and numeric backreferences */
5732             /* NOT REACHED */
5733
5734             case '?':           /* (??...) */
5735                 is_logical = 1;
5736                 if (*RExC_parse != '{') {
5737                     RExC_parse++;
5738                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5739                     /*NOTREACHED*/
5740                 }
5741                 *flagp |= POSTPONED;
5742                 paren = *RExC_parse++;
5743                 /* FALL THROUGH */
5744             case '{':           /* (?{...}) */
5745             {
5746                 I32 count = 1;
5747                 U32 n = 0;
5748                 char c;
5749                 char *s = RExC_parse;
5750
5751                 RExC_seen_zerolen++;
5752                 RExC_seen |= REG_SEEN_EVAL;
5753                 while (count && (c = *RExC_parse)) {
5754                     if (c == '\\') {
5755                         if (RExC_parse[1])
5756                             RExC_parse++;
5757                     }
5758                     else if (c == '{')
5759                         count++;
5760                     else if (c == '}')
5761                         count--;
5762                     RExC_parse++;
5763                 }
5764                 if (*RExC_parse != ')') {
5765                     RExC_parse = s;
5766                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5767                 }
5768                 if (!SIZE_ONLY) {
5769                     PAD *pad;
5770                     OP_4tree *sop, *rop;
5771                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5772
5773                     ENTER;
5774                     Perl_save_re_context(aTHX);
5775                     rop = sv_compile_2op(sv, &sop, "re", &pad);
5776                     sop->op_private |= OPpREFCOUNTED;
5777                     /* re_dup will OpREFCNT_inc */
5778                     OpREFCNT_set(sop, 1);
5779                     LEAVE;
5780
5781                     n = add_data(pRExC_state, 3, "nop");
5782                     RExC_rxi->data->data[n] = (void*)rop;
5783                     RExC_rxi->data->data[n+1] = (void*)sop;
5784                     RExC_rxi->data->data[n+2] = (void*)pad;
5785                     SvREFCNT_dec(sv);
5786                 }
5787                 else {                                          /* First pass */
5788                     if (PL_reginterp_cnt < ++RExC_seen_evals
5789                         && IN_PERL_RUNTIME)
5790                         /* No compiled RE interpolated, has runtime
5791                            components ===> unsafe.  */
5792                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
5793                     if (PL_tainting && PL_tainted)
5794                         FAIL("Eval-group in insecure regular expression");
5795 #if PERL_VERSION > 8
5796                     if (IN_PERL_COMPILETIME)
5797                         PL_cv_has_eval = 1;
5798 #endif
5799                 }
5800
5801                 nextchar(pRExC_state);
5802                 if (is_logical) {
5803                     ret = reg_node(pRExC_state, LOGICAL);
5804                     if (!SIZE_ONLY)
5805                         ret->flags = 2;
5806                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5807                     /* deal with the length of this later - MJD */
5808                     return ret;
5809                 }
5810                 ret = reganode(pRExC_state, EVAL, n);
5811                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5812                 Set_Node_Offset(ret, parse_start);
5813                 return ret;
5814             }
5815             case '(':           /* (?(?{...})...) and (?(?=...)...) */
5816             {
5817                 int is_define= 0;
5818                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
5819                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5820                         || RExC_parse[1] == '<'
5821                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
5822                         I32 flag;
5823
5824                         ret = reg_node(pRExC_state, LOGICAL);
5825                         if (!SIZE_ONLY)
5826                             ret->flags = 1;
5827                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5828                         goto insert_if;
5829                     }
5830                 }
5831                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
5832                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5833                 {
5834                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
5835                     char *name_start= RExC_parse++;
5836                     U32 num = 0;
5837                     SV *sv_dat=reg_scan_name(pRExC_state,
5838                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5839                     if (RExC_parse == name_start || *RExC_parse != ch)
5840                         vFAIL2("Sequence (?(%c... not terminated",
5841                             (ch == '>' ? '<' : ch));
5842                     RExC_parse++;
5843                     if (!SIZE_ONLY) {
5844                         num = add_data( pRExC_state, 1, "S" );
5845                         RExC_rxi->data->data[num]=(void*)sv_dat;
5846                         SvREFCNT_inc_simple_void(sv_dat);
5847                     }
5848                     ret = reganode(pRExC_state,NGROUPP,num);
5849                     goto insert_if_check_paren;
5850                 }
5851                 else if (RExC_parse[0] == 'D' &&
5852                          RExC_parse[1] == 'E' &&
5853                          RExC_parse[2] == 'F' &&
5854                          RExC_parse[3] == 'I' &&
5855                          RExC_parse[4] == 'N' &&
5856                          RExC_parse[5] == 'E')
5857                 {
5858                     ret = reganode(pRExC_state,DEFINEP,0);
5859                     RExC_parse +=6 ;
5860                     is_define = 1;
5861                     goto insert_if_check_paren;
5862                 }
5863                 else if (RExC_parse[0] == 'R') {
5864                     RExC_parse++;
5865                     parno = 0;
5866                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5867                         parno = atoi(RExC_parse++);
5868                         while (isDIGIT(*RExC_parse))
5869                             RExC_parse++;
5870                     } else if (RExC_parse[0] == '&') {
5871                         SV *sv_dat;
5872                         RExC_parse++;
5873                         sv_dat = reg_scan_name(pRExC_state,
5874                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5875                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5876                     }
5877                     ret = reganode(pRExC_state,INSUBP,parno);
5878                     goto insert_if_check_paren;
5879                 }
5880                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5881                     /* (?(1)...) */
5882                     char c;
5883                     parno = atoi(RExC_parse++);
5884
5885                     while (isDIGIT(*RExC_parse))
5886                         RExC_parse++;
5887                     ret = reganode(pRExC_state, GROUPP, parno);
5888
5889                  insert_if_check_paren:
5890                     if ((c = *nextchar(pRExC_state)) != ')')
5891                         vFAIL("Switch condition not recognized");
5892                   insert_if:
5893                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5894                     br = regbranch(pRExC_state, &flags, 1,depth+1);
5895                     if (br == NULL)
5896                         br = reganode(pRExC_state, LONGJMP, 0);
5897                     else
5898                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5899                     c = *nextchar(pRExC_state);
5900                     if (flags&HASWIDTH)
5901                         *flagp |= HASWIDTH;
5902                     if (c == '|') {
5903                         if (is_define)
5904                             vFAIL("(?(DEFINE)....) does not allow branches");
5905                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5906                         regbranch(pRExC_state, &flags, 1,depth+1);
5907                         REGTAIL(pRExC_state, ret, lastbr);
5908                         if (flags&HASWIDTH)
5909                             *flagp |= HASWIDTH;
5910                         c = *nextchar(pRExC_state);
5911                     }
5912                     else
5913                         lastbr = NULL;
5914                     if (c != ')')
5915                         vFAIL("Switch (?(condition)... contains too many branches");
5916                     ender = reg_node(pRExC_state, TAIL);
5917                     REGTAIL(pRExC_state, br, ender);
5918                     if (lastbr) {
5919                         REGTAIL(pRExC_state, lastbr, ender);
5920                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5921                     }
5922                     else
5923                         REGTAIL(pRExC_state, ret, ender);
5924                     RExC_size++; /* XXX WHY do we need this?!!
5925                                     For large programs it seems to be required
5926                                     but I can't figure out why. -- dmq*/
5927                     return ret;
5928                 }
5929                 else {
5930                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5931                 }
5932             }
5933             case 0:
5934                 RExC_parse--; /* for vFAIL to print correctly */
5935                 vFAIL("Sequence (? incomplete");
5936                 break;
5937             default:
5938                 --RExC_parse;
5939                 parse_flags:      /* (?i) */
5940             {
5941                 U32 posflags = 0, negflags = 0;
5942                 U32 *flagsp = &posflags;
5943
5944                 while (*RExC_parse) {
5945                     /* && strchr("iogcmsx", *RExC_parse) */
5946                     /* (?g), (?gc) and (?o) are useless here
5947                        and must be globally applied -- japhy */
5948                     switch (*RExC_parse) {
5949                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5950                     case ONCE_PAT_MOD: /* 'o' */
5951                     case GLOBAL_PAT_MOD: /* 'g' */
5952                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5953                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5954                             if (! (wastedflags & wflagbit) ) {
5955                                 wastedflags |= wflagbit;
5956                                 vWARN5(
5957                                     RExC_parse + 1,
5958                                     "Useless (%s%c) - %suse /%c modifier",
5959                                     flagsp == &negflags ? "?-" : "?",
5960                                     *RExC_parse,
5961                                     flagsp == &negflags ? "don't " : "",
5962                                     *RExC_parse
5963                                 );
5964                             }
5965                         }
5966                         break;
5967
5968                     case CONTINUE_PAT_MOD: /* 'c' */
5969                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5970                             if (! (wastedflags & WASTED_C) ) {
5971                                 wastedflags |= WASTED_GC;
5972                                 vWARN3(
5973                                     RExC_parse + 1,
5974                                     "Useless (%sc) - %suse /gc modifier",
5975                                     flagsp == &negflags ? "?-" : "?",
5976                                     flagsp == &negflags ? "don't " : ""
5977                                 );
5978                             }
5979                         }
5980                         break;
5981                     case KEEPCOPY_PAT_MOD: /* 'p' */
5982                         if (flagsp == &negflags) {
5983                             if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5984                                 vWARN(RExC_parse + 1,"Useless use of (?-p)");
5985                         } else {
5986                             *flagsp |= RXf_PMf_KEEPCOPY;
5987                         }
5988                         break;
5989                     case '-':
5990                         if (flagsp == &negflags) {
5991                             RExC_parse++;
5992                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5993                             /*NOTREACHED*/
5994                         }
5995                         flagsp = &negflags;
5996                         wastedflags = 0;  /* reset so (?g-c) warns twice */
5997                         break;
5998                     case ':':
5999                         paren = ':';
6000                         /*FALLTHROUGH*/
6001                     case ')':
6002                         RExC_flags |= posflags;
6003                         RExC_flags &= ~negflags;
6004                         if (paren != ':') {
6005                             oregflags |= posflags;
6006                             oregflags &= ~negflags;
6007                         }
6008                         nextchar(pRExC_state);
6009                         if (paren != ':') {
6010                             *flagp = TRYAGAIN;
6011                             return NULL;
6012                         } else {
6013                             ret = NULL;
6014                             goto parse_rest;
6015                         }
6016                         /*NOTREACHED*/
6017                     default:
6018                         RExC_parse++;
6019                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6020                         /*NOTREACHED*/
6021                     }
6022                     ++RExC_parse;
6023                 }
6024             }} /* one for the default block, one for the switch */
6025         }
6026         else {                  /* (...) */
6027           capturing_parens:
6028             parno = RExC_npar;
6029             RExC_npar++;
6030
6031             ret = reganode(pRExC_state, OPEN, parno);
6032             if (!SIZE_ONLY ){
6033                 if (!RExC_nestroot)
6034                     RExC_nestroot = parno;
6035                 if (RExC_seen & REG_SEEN_RECURSE
6036                     && !RExC_open_parens[parno-1])
6037                 {
6038                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6039                         "Setting open paren #%"IVdf" to %d\n",
6040                         (IV)parno, REG_NODE_NUM(ret)));
6041                     RExC_open_parens[parno-1]= ret;
6042                 }
6043             }
6044             Set_Node_Length(ret, 1); /* MJD */
6045             Set_Node_Offset(ret, RExC_parse); /* MJD */
6046             is_open = 1;
6047         }
6048     }
6049     else                        /* ! paren */
6050         ret = NULL;
6051
6052    parse_rest:
6053     /* Pick up the branches, linking them together. */
6054     parse_start = RExC_parse;   /* MJD */
6055     br = regbranch(pRExC_state, &flags, 1,depth+1);
6056     /*     branch_len = (paren != 0); */
6057
6058     if (br == NULL)
6059         return(NULL);
6060     if (*RExC_parse == '|') {
6061         if (!SIZE_ONLY && RExC_extralen) {
6062             reginsert(pRExC_state, BRANCHJ, br, depth+1);
6063         }
6064         else {                  /* MJD */
6065             reginsert(pRExC_state, BRANCH, br, depth+1);
6066             Set_Node_Length(br, paren != 0);
6067             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6068         }
6069         have_branch = 1;
6070         if (SIZE_ONLY)
6071             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
6072     }
6073     else if (paren == ':') {
6074         *flagp |= flags&SIMPLE;
6075     }
6076     if (is_open) {                              /* Starts with OPEN. */
6077         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
6078     }
6079     else if (paren != '?')              /* Not Conditional */
6080         ret = br;
6081     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6082     lastbr = br;
6083     while (*RExC_parse == '|') {
6084         if (!SIZE_ONLY && RExC_extralen) {
6085             ender = reganode(pRExC_state, LONGJMP,0);
6086             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6087         }
6088         if (SIZE_ONLY)
6089             RExC_extralen += 2;         /* Account for LONGJMP. */
6090         nextchar(pRExC_state);
6091         if (freeze_paren) {
6092             if (RExC_npar > after_freeze)
6093                 after_freeze = RExC_npar;
6094             RExC_npar = freeze_paren;
6095         }
6096         br = regbranch(pRExC_state, &flags, 0, depth+1);
6097
6098         if (br == NULL)
6099             return(NULL);
6100         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
6101         lastbr = br;
6102         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6103     }
6104
6105     if (have_branch || paren != ':') {
6106         /* Make a closing node, and hook it on the end. */
6107         switch (paren) {
6108         case ':':
6109             ender = reg_node(pRExC_state, TAIL);
6110             break;
6111         case 1:
6112             ender = reganode(pRExC_state, CLOSE, parno);
6113             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6114                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6115                         "Setting close paren #%"IVdf" to %d\n",
6116                         (IV)parno, REG_NODE_NUM(ender)));
6117                 RExC_close_parens[parno-1]= ender;
6118                 if (RExC_nestroot == parno)
6119                     RExC_nestroot = 0;
6120             }
6121             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6122             Set_Node_Length(ender,1); /* MJD */
6123             break;
6124         case '<':
6125         case ',':
6126         case '=':
6127         case '!':
6128             *flagp &= ~HASWIDTH;
6129             /* FALL THROUGH */
6130         case '>':
6131             ender = reg_node(pRExC_state, SUCCEED);
6132             break;
6133         case 0:
6134             ender = reg_node(pRExC_state, END);
6135             if (!SIZE_ONLY) {
6136                 assert(!RExC_opend); /* there can only be one! */
6137                 RExC_opend = ender;
6138             }
6139             break;
6140         }
6141         REGTAIL(pRExC_state, lastbr, ender);
6142
6143         if (have_branch && !SIZE_ONLY) {
6144             if (depth==1)
6145                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6146
6147             /* Hook the tails of the branches to the closing node. */
6148             for (br = ret; br; br = regnext(br)) {
6149                 const U8 op = PL_regkind[OP(br)];
6150                 if (op == BRANCH) {
6151                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6152                 }
6153                 else if (op == BRANCHJ) {
6154                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6155                 }
6156             }
6157         }
6158     }
6159
6160     {
6161         const char *p;
6162         static const char parens[] = "=!<,>";
6163
6164         if (paren && (p = strchr(parens, paren))) {
6165             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6166             int flag = (p - parens) > 1;
6167
6168             if (paren == '>')
6169                 node = SUSPEND, flag = 0;
6170             reginsert(pRExC_state, node,ret, depth+1);
6171             Set_Node_Cur_Length(ret);
6172             Set_Node_Offset(ret, parse_start + 1);
6173             ret->flags = flag;
6174             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6175         }
6176     }
6177
6178     /* Check for proper termination. */
6179     if (paren) {
6180         RExC_flags = oregflags;
6181         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6182             RExC_parse = oregcomp_parse;
6183             vFAIL("Unmatched (");
6184         }
6185     }
6186     else if (!paren && RExC_parse < RExC_end) {
6187         if (*RExC_parse == ')') {
6188             RExC_parse++;
6189             vFAIL("Unmatched )");
6190         }
6191         else
6192             FAIL("Junk on end of regexp");      /* "Can't happen". */
6193         /* NOTREACHED */
6194     }
6195     if (after_freeze)
6196         RExC_npar = after_freeze;
6197     return(ret);
6198 }
6199
6200 /*
6201  - regbranch - one alternative of an | operator
6202  *
6203  * Implements the concatenation operator: parses pieces via regpiece() and chains them with REGTAIL.
6204  * Returns the head node of the branch, or NULL if a piece fails; *flagp receives the branch flags. */
6205 STATIC regnode *
6206 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6207 {
6208     dVAR;
6209     register regnode *ret;            /* head of this branch; the value returned */
6210     register regnode *chain = NULL;   /* last piece chained so far; the next piece is attached to it */
6211     register regnode *latest;         /* node just returned by regpiece() */
6212     I32 flags = 0, c = 0;             /* flags: filled in by regpiece(); c: count of pieces chained */
6213     GET_RE_DEBUG_FLAGS_DECL;
6214     DEBUG_PARSE("brnc");
6215
6216     if (first)                        /* 'first' set: no BRANCH/BRANCHJ node is emitted by this call */
6217         ret = NULL;
6218     else {
6219         if (!SIZE_ONLY && RExC_extralen)    /* outside the sizing pass, with extralen set: use BRANCHJ */
6220             ret = reganode(pRExC_state, BRANCHJ,0);
6221         else {
6222             ret = reg_node(pRExC_state, BRANCH);
6223             Set_Node_Length(ret, 1);
6224         }
6225     }
6226
6227     if (!first && SIZE_ONLY)
6228         RExC_extralen += 1;                     /* BRANCHJ: count extra room during the sizing pass */
6229
6230     *flagp = WORST;                     /* Tentatively; bits are OR-ed in below as pieces are parsed. */
6231
6232     RExC_parse--;                       /* NOTE(review): step back so nextchar() lands on the current */
6233     nextchar(pRExC_state);              /* position again, presumably applying its skipping - confirm */
6234     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6235         flags &= ~TRYAGAIN;             /* clear any retry request left over from the previous piece */
6236         latest = regpiece(pRExC_state, &flags,depth+1);
6237         if (latest == NULL) {
6238             if (flags & TRYAGAIN)       /* no node produced, but not an error: go on to the next piece */
6239                 continue;
6240             return(NULL);               /* genuine failure: hand NULL back to the caller */
6241         }
6242         else if (ret == NULL)           /* no BRANCH node was emitted: the first piece heads the branch */
6243             ret = latest;
6244         *flagp |= flags&(HASWIDTH|POSTPONED);   /* these two bits accumulate over every piece */
6245         if (chain == NULL)      /* First piece: only it may contribute SPSTART. */
6246             *flagp |= flags&SPSTART;
6247         else {
6248             RExC_naughty++;
6249             REGTAIL(pRExC_state, chain, latest);    /* hook the new piece after the previous one */
6250         }
6251         chain = latest;
6252         c++;
6253     }
6254     if (chain == NULL) {        /* No piece was chained: emit a NOTHING node as the branch body. */
6255         chain = reg_node(pRExC_state, NOTHING);
6256         if (ret == NULL)
6257             ret = chain;
6258     }
6259     if (c == 1) {                       /* SIMPLE is passed up only when exactly one piece was chained */
6260         *flagp |= flags&SIMPLE;
6261     }
6262
6263     return ret;
6264 }
6265
6266 /*
6267  - regpiece - something followed by possible [*+?]
6268  *
6269  * Note that the branching code sequences used for ? and the general cases
6270  * of * and + are somewhat optimized:  they use the same NOTHING node as
6271  * both the endmarker for their branch list and the body of the last branch.
6272  * It might seem that this node could be dispensed with entirely, but the
6273  * endmarker role is not redundant.
6274  */
6275 STATIC regnode *
6276 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6277 {
         /* Parses one atom (via regatom()) followed by an optional quantifier:
          * '*', '+', '?', or a '{min}' / '{min,}' / '{min,max}' range, which
          * may itself be followed by a '?' or '+' modifier.
          *
          * pRExC_state  compile state; RExC_parse is advanced past whatever
          *              is consumed here
          * flagp        out: flags describing the piece.  With no quantifier
          *              it receives the atom's own flags; with a quantifier
          *              it is built from WORST/HASWIDTH/SPSTART below.  When
          *              regatom() returns NULL, TRYAGAIN is OR-ed in if the
          *              atom asked for it.
          * depth        nesting depth; handed on as depth+1 to the helpers
          *
          * Returns the node for the whole piece, or NULL if regatom()
          * returned NULL.  Syntax errors are raised through vFAIL()/vFAIL2().
          */
6278     dVAR;
6279     register regnode *ret;
6280     register char op;
6281     register char *next;
6282     I32 flags;
6283     const char * const origparse = RExC_parse; /* start of the piece; used for the warning text */
6284     I32 min;
6285     I32 max = REG_INFTY; /* stays REG_INFTY for '*' and '+' */
6286     char *parse_start;
6287     const char *maxpos = NULL;
6288     GET_RE_DEBUG_FLAGS_DECL;
6289     DEBUG_PARSE("piec");
6290
6291     ret = regatom(pRExC_state, &flags,depth+1);
6292     if (ret == NULL) {
6293         if (flags & TRYAGAIN)
6294             *flagp |= TRYAGAIN;
6295         return(NULL);
6296     }
6297
6298     op = *RExC_parse;
6299
         /* A '{' is only treated as a quantifier when regcurly() accepts it. */
6300     if (op == '{' && regcurly(RExC_parse)) {
6301         maxpos = NULL;
6302         parse_start = RExC_parse; /* MJD */
6303         next = RExC_parse + 1;
             /* Walk over digits and commas: remember where the first comma
              * is, and stop at a second one. */
6304         while (isDIGIT(*next) || *next == ',') {
6305             if (*next == ',') {
6306                 if (maxpos)
6307                     break;
6308                 else
6309                     maxpos = next;
6310             }
6311             next++;
6312         }
6313         if (*next == '}') {             /* got one */
6314             if (!maxpos)
6315                 maxpos = next;
6316             RExC_parse++;
                 /* min is read from just after the '{'. */
6317             min = atoi(RExC_parse);
                 /* max is read from just after the comma; with no comma it is
                  * read from the same digits as min, so {n} gives min == max. */
6318             if (*maxpos == ',')
6319                 maxpos++;
6320             else
6321                 maxpos = RExC_parse;
6322             max = atoi(maxpos);
                 /* atoi() gave 0 but the text is not a literal '0': the max
                  * field is empty ("{n,}"), i.e. no upper bound. */
6323             if (!max && *maxpos != '0')
6324                 max = REG_INFTY;                /* meaning "infinity" */
6325             else if (max >= REG_INFTY)
6326                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6327             RExC_parse = next;
6328             nextchar(pRExC_state);
6329
                 /* Also reached by goto from the '*', '+' and '?' handling
                  * further down, with min (and for '?' also max) already set. */
6330         do_curly:
6331             if ((flags&SIMPLE)) {
                     /* Atom flagged SIMPLE: a CURLY node inserted in front of
                      * it is used. */
6332                 RExC_naughty += 2 + RExC_naughty / 2;
6333                 reginsert(pRExC_state, CURLY, ret, depth+1);
6334                 Set_Node_Offset(ret, parse_start+1); /* MJD */
6335                 Set_Node_Cur_Length(ret);
6336             }
6337             else {
                     /* Otherwise: append a WHILEM after the atom, insert a
                      * CURLYX in front of it, and append a NOTHING at the end. */
6338                 regnode * const w = reg_node(pRExC_state, WHILEM);
6339
6340                 w->flags = 0;
6341                 REGTAIL(pRExC_state, ret, w);
                     /* Emit pass only, and only when RExC_extralen is non-zero
                      * (it is increased by 3 per such loop in the SIZE_ONLY
                      * pass below): insert a NOTHING followed by a LONGJMP. */
6342                 if (!SIZE_ONLY && RExC_extralen) {
6343                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
6344                     reginsert(pRExC_state, NOTHING,ret, depth+1);
6345                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
6346                 }
6347                 reginsert(pRExC_state, CURLYX,ret, depth+1);
6348                                 /* MJD hk */
6349                 Set_Node_Offset(ret, parse_start+1);
6350                 Set_Node_Length(ret,
6351                                 op == '{' ? (RExC_parse - parse_start) : 1);
6352
6353                 if (!SIZE_ONLY && RExC_extralen)
6354                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
6355                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6356                 if (SIZE_ONLY)
6357                     RExC_whilem_seen++, RExC_extralen += 3;
6358                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
6359             }
6360             ret->flags = 0;
6361
                 /* Note: *flagp is only reset when min > 0; with min == 0 the
                  * HASWIDTH bit is OR-ed into whatever *flagp already holds. */
6362             if (min > 0)
6363                 *flagp = WORST;
6364             if (max > 0)
6365                 *flagp |= HASWIDTH;
                 /* max == 0 is exempt from the ordering check. */
6366             if (max && max < min)
6367                 vFAIL("Can't do {n,m} with n > m");
                 /* The bounds are stored as U16 values in the node's two
                  * arguments (emit pass only). */
6368             if (!SIZE_ONLY) {
6369                 ARG1_SET(ret, (U16)min);
6370                 ARG2_SET(ret, (U16)max);
6371             }
6372
6373             goto nest_check;
6374         }
6375     }
6376
         /* No quantifier follows: hand back the atom with its own flags. */
6377     if (!ISMULT1(op)) {
6378         *flagp = flags;
6379         return(ret);
6380     }
6381
6382 #if 0                           /* Now runtime fix should be reliable. */
6383
6384     /* if this is reinstated, don't forget to put this back into perldiag:
6385
6386             =item Regexp *+ operand could be empty at {#} in regex m/%s/
6387
6388            (F) The part of the regexp subject to either the * or + quantifier
6389            could match an empty string. The {#} shows in the regular
6390            expression about where the problem was discovered.
6391
6392     */
6393
6394     if (!(flags&HASWIDTH) && op != '?')
6395       vFAIL("Regexp *+ operand could be empty");
6396 #endif
6397
6398     parse_start = RExC_parse;
6399     nextchar(pRExC_state);
6400
         /* '*' and '?' get WORST|SPSTART|HASWIDTH; '+' gets WORST|HASWIDTH. */
6401     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6402
         /* SIMPLE atoms under '*' or '+' get dedicated STAR/PLUS nodes.  Every
          * other case is expressed as a curly by setting min (and, for '?',
          * max) and jumping to do_curly above. */
6403     if (op == '*' && (flags&SIMPLE)) {
6404         reginsert(pRExC_state, STAR, ret, depth+1);
6405         ret->flags = 0;
6406         RExC_naughty += 4;
6407     }
6408     else if (op == '*') {
6409         min = 0;
6410         goto do_curly;
6411     }
6412     else if (op == '+' && (flags&SIMPLE)) {
6413         reginsert(pRExC_state, PLUS, ret, depth+1);
6414         ret->flags = 0;
6415         RExC_naughty += 3;
6416     }
6417     else if (op == '+') {
6418         min = 1;
6419         goto do_curly;
6420     }
6421     else if (op == '?') {
6422         min = 0; max = 1;
6423         goto do_curly;
6424     }
         /* Emit pass only, with WARN_REGEXP enabled: warn when the atom has
          * neither HASWIDTH nor POSTPONED set and max exceeds REG_INFTY/3. */
6425   nest_check:
6426     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
6427         vWARN3(RExC_parse,
6428                "%.*s matches null string many times",
6429                (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6430                origparse);
6431     }
6432
         /* Trailing '?': insert a MINMOD node in front of the quantified piece
          * and link it to the node that immediately follows it. */
6433     if (RExC_parse < RExC_end && *RExC_parse == '?') {
6434         nextchar(pRExC_state);
6435         reginsert(pRExC_state, MINMOD, ret, depth+1);
6436         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6437     }
         /* Unless REG_ALLOW_MINMOD_SUSPEND is defined, the '+' modifier below
          * is only considered when no '?' modifier was consumed above. */
6438 #ifndef REG_ALLOW_MINMOD_SUSPEND
6439     else
6440 #endif
         /* Trailing '+': end the piece with SUCCEED, insert SUSPEND in front
          * of it, then append a TAIL node. */
6441     if (RExC_parse < RExC_end && *RExC_parse == '+') {
6442         regnode *ender;
6443         nextchar(pRExC_state);
6444         ender = reg_node(pRExC_state, SUCCEED);
6445         REGTAIL(pRExC_state, ret, ender);
6446         reginsert(pRExC_state, SUSPEND, ret, depth+1);
6447         ret->flags = 0;
6448         ender = reg_node(pRExC_state, TAIL);
6449         REGTAIL(pRExC_state, ret, ender);
6450         /*ret= ender;*/
6451     }
6452
         /* Anything that still looks like a quantifier at this point is an
          * error. */
6453     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6454         RExC_parse++;
6455         vFAIL("Nested quantifiers");
6456     }
6457
6458     return(ret);
6459 }
6460
6461
6462 /* reg_namedseq(pRExC_state,UVp)
6463
6464    This is expected to be called by a parser routine that has
6465    recognized'\N' and needs to handle the rest. RExC_parse is
6466    expected to point at the first char following the N at the time
6467    of the call.
6468
6469    If valuep is non-null then it is assumed that we are parsing inside
6470    of a charclass definition and the first codepoint in the resolved
6471    string is returned via *valuep and the routine will return NULL.
6472    In this mode if a multichar string is returned from the charnames
6473    handler a warning will be issued, and only the first char in the
6474    sequence will be examined. If the string returned is zero length
6475    then the value of *valuep is undefined and NON-NULL will
6476    be returned to indicate failure. (This will NOT be a valid pointer
6477    to a regnode.)
6478
6479    If value is null then it is assumed that we are parsing normal text
6480    and inserts a new EXACT node into the program containing the resolved
6481    string and returns a pointer to the new node. If the string is
6482    zerolength a NOTHING node is emitted.
6483
6484    On success RExC_parse is set to the char following the endbrace.
6485    Parsing failures will generate a fatal errorvia vFAIL(...)
6486
6487    NOTE: We cache all results from the charnames handler locally in
6488    the RExC_charnames hash (created on first use) to prevent a charnames
6489    handler from playing silly-buggers and returning a short string and
6490    then a long string for a given pattern. Since the regexp program
6491    size is calculated during an initial parse this would result
6492    in a buffer overrun so we cache to prevent the charname result from
6493    changing during the course of the parse.
6494
6495  */
6496 STATIC regnode *
6497 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6498 {
6499     char * name;        /* start of the content of the name */
6500     char * endbrace;    /* endbrace following the name */
6501     SV *sv_str = NULL;
6502     SV *sv_name = NULL;
6503     STRLEN len; /* this has various purposes throughout the code */
6504     bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6505     regnode *ret = NULL;
6506
6507     if (*RExC_parse != '{') {
6508         vFAIL("Missing braces on \\N{}");
6509     }
6510     name = RExC_parse+1;
6511     endbrace = strchr(RExC_parse, '}');
6512     if ( ! endbrace ) {
6513         RExC_parse++;
6514         vFAIL("Missing right brace on \\N{}");
6515     }
6516     RExC_parse = endbrace + 1;
6517
6518
6519     /* RExC_parse points at the beginning brace,
6520        endbrace points at the last */
6521     if ( name[0]=='U' && name[1]=='+' ) {
6522         /* its a "Unicode hex" notation {U+89AB} */
6523         I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6524             | PERL_SCAN_DISALLOW_PREFIX
6525             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6526         UV cp;
6527         char string;
6528         len = (STRLEN)(endbrace - name - 2);
6529         cp = grok_hex(name + 2, &len, &fl, NULL);
6530         if ( len != (STRLEN)(endbrace - name - 2) ) {
6531             cp = 0xFFFD;
6532         }
6533         if (cp > 0xff)
6534             RExC_utf8 = 1;
6535         if ( valuep ) {
6536             *valuep = cp;
6537             return NULL;
6538         }
6539         string = (char)cp;
6540         sv_str= newSVpvn(&string, 1);
6541     } else {
6542         /* fetch the charnames handler for this scope */
6543         HV * const table = GvHV(PL_hintgv);
6544         SV **cvp= table ?
6545             hv_fetchs(table, "charnames", FALSE) :
6546             NULL;
6547         SV *cv= cvp ? *cvp : NULL;
6548         HE *he_str;
6549         int count;
6550         /* create an SV with the name as argument */
6551         sv_name = newSVpvn(name, endbrace - name);
6552
6553         if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6554             vFAIL2("Constant(\\N{%s}) unknown: "
6555                   "(possibly a missing \"use charnames ...\")",
6556                   SvPVX(sv_name));
6557         }
6558         if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6559             vFAIL2("Constant(\\N{%s}): "
6560                   "$^H{charnames} is not defined",SvPVX(sv_name));
6561         }
6562
6563
6564
6565         if (!RExC_charnames) {
6566             /* make sure our cache is allocated */
6567             RExC_charnames = newHV();
6568             sv_2mortal((SV*)RExC_charnames);
6569         }
6570             /* see if we have looked this one up before */
6571         he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6572         if ( he_str ) {
6573             sv_str = HeVAL(he_str);
6574             cached = 1;
6575         } else {
6576             dSP ;
6577
6578             ENTER ;
6579             SAVETMPS ;
6580             PUSHMARK(SP) ;
6581
6582             XPUSHs(sv_name);
6583
6584             PUTBACK ;
6585
6586             count= call_sv(cv, G_SCALAR);
6587
6588             if (count == 1) { /* XXXX is this right? dmq */
6589                 sv_str = POPs;
6590                 SvREFCNT_inc_simple_void(sv_str);
6591             }
6592
6593             SPAGAIN ;
6594             PUTBACK ;
6595             FREETMPS ;
6596             LEAVE ;
6597
6598             if ( !sv_str || !SvOK(sv_str) ) {
6599                 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6600                       "did not return a defined value",SvPVX(sv_name));
6601             }
6602             if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6603                 cached = 1;
6604         }
6605     }
6606     if (valuep) {
6607         char *p = SvPV(sv_str, len);
6608         if (len) {
6609             STRLEN numlen = 1;
6610             if ( SvUTF8(sv_str) ) {
6611                 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6612                 if (*valuep > 0x7F)
6613                     RExC_utf8 = 1;
6614                 /* XXXX
6615                   We have to turn on utf8 for high bit chars otherwise
6616                   we get failures with
6617
6618                    "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6619                    "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6620
6621                   This is different from what \x{} would do with the same
6622                   codepoint, where the condition is > 0xFF.
6623                   - dmq
6624                 */
6625
6626
6627             } else {
6628                 *valuep = (UV)*p;
6629                 /* warn if we havent used the whole string? */
6630             }
6631             if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6632                 vWARN2(RExC_parse,
6633                     "Ignoring excess chars from \\N{%s} in character class",
6634                     SvPVX(sv_name)
6635                 );
6636             }
6637         } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6638             vWARN2(RExC_parse,
6639                     "Ignoring zero length \\N{%s} in character class",
6640                     SvPVX(sv_name)
6641                 );
6642         }
6643         if (sv_name)
6644             SvREFCNT_dec(sv_name);
6645         if (!cached)
6646             SvREFCNT_dec(sv_str);
6647         return len ? NULL : (regnode *)&len;
6648     } else if(SvCUR(sv_str)) {
6649
6650         char *s;
6651         char *p, *pend;
6652         STRLEN charlen = 1;
6653 #ifdef DEBUGGING
6654         char * parse_start = name-3; /* needed for the offsets */
6655 #endif
6656         GET_RE_DEBUG_FLAGS_DECL;     /* needed for the offsets */
6657
6658         ret = reg_node(pRExC_state,
6659             (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6660         s= STRING(ret);
6661
6662         if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6663             sv_utf8_upgrade(sv_str);
6664         } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6665             RExC_utf8= 1;
6666         }
6667
6668         p = SvPV(sv_str, len);
6669         pend = p + len;
6670         /* len is the length written, charlen is the size the char read */
6671         for ( len = 0; p < pend; p += charlen ) {
6672             if (UTF) {
6673                 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6674                 if (FOLD) {
6675                     STRLEN foldlen,numlen;
6676                     U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6677                     uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6678                     /* Emit all the Unicode characters. */
6679
6680                     for (foldbuf = tmpbuf;
6681                         foldlen;
6682                         foldlen -= numlen)
6683                     {
6684                         uvc = utf8_to_uvchr(foldbuf, &numlen);
6685                         if (numlen > 0) {
6686                             const STRLEN unilen = reguni(pRExC_state, uvc, s);
6687                             s       += unilen;
6688                             len     += unilen;
6689                             /* In EBCDIC the numlen
6690                             * and unilen can differ. */
6691                             foldbuf += numlen;
6692                             if (numlen >= foldlen)
6693                                 break;
6694                         }
6695                         else
6696                             break; /* "Can't happen." */
6697                     }
6698                 } else {
6699                     const STRLEN unilen = reguni(pRExC_state, uvc, s);
6700                     if (unilen > 0) {
6701                        s   += unilen;
6702                        len += unilen;
6703                     }
6704                 }
6705             } else {
6706                 len++;
6707                 REGC(*p, s++);
6708             }
6709         }
6710         if (SIZE_ONLY) {
6711             RExC_size += STR_SZ(len);
6712         } else {
6713             STR_LEN(ret) = len;
6714             RExC_emit += STR_SZ(len);
6715         }
6716         Set_Node_Cur_Length(ret); /* MJD */
6717         RExC_parse--;
6718         nextchar(pRExC_state);
6719     } else {
6720         ret = reg_node(pRExC_state,NOTHING);
6721     }
6722     if (!cached) {
6723         SvREFCNT_dec(sv_str);
6724     }
6725     if (sv_name) {
6726         SvREFCNT_dec(sv_name);
6727     }
6728     return ret;
6729
6730 }
6731
6732
6733 /*
6734  * reg_recode
6735  *
6736  * It returns the code point in utf8 for the value in *encp.
6737  *    value: a code value in the source encoding
6738  *    encp:  a pointer to an Encode object
6739  *
6740  * If the result from Encode is not a single character,
6741  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6742  */
6743 STATIC UV
6744 S_reg_recode(pTHX_ const char value, SV **encp)
6745 {
6746     STRLEN numlen = 1;
6747     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
6748     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
6749     const STRLEN newlen = SvCUR(sv);
6750     UV uv = UNICODE_REPLACEMENT;
6751
6752     if (newlen)
6753         uv = SvUTF8(sv)
6754              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6755              : *(U8*)s;
6756
6757     if (!newlen || numlen != newlen) {
6758         uv = UNICODE_REPLACEMENT;
6759         *encp = NULL;
6760     }
6761     return uv;
6762 }
6763
6764
6765 /*
6766  - regatom - the lowest level
6767
6768    Try to identify anything special at the start of the pattern. If there
6769    is, then handle it as required. This may involve generating a single regop,
6770    such as for an assertion; or it may involve recursing, such as to
6771    handle a () structure.
6772
6773    If the string doesn't start with something special then we gobble up
6774    as much literal text as we can.
6775
6776    Once we have been able to handle whatever type of thing started the
6777    sequence, we return.
6778
6779    Note: we have to be careful with escapes, as they can be both literal
6780    and special, and in the case of \10 and friends can be either, depending
6781    on context. Specifically there are two separate switches for handling
6782    escape sequences, with the one for handling literal escapes requiring
6783    a dummy entry for all of the special escapes that are actually handled
6784    by the other.
6785 */
6786
6787 STATIC regnode *
6788 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6789 {
6790     dVAR;
6791     register regnode *ret = NULL;
6792     I32 flags;
6793     char *parse_start = RExC_parse;
6794     GET_RE_DEBUG_FLAGS_DECL;
6795     DEBUG_PARSE("atom");
6796     *flagp = WORST;             /* Tentatively. */
6797
6798
6799 tryagain:
6800     switch ((U8)*RExC_parse) {
6801     case '^':
6802         RExC_seen_zerolen++;
6803         nextchar(pRExC_state);
6804         if (RExC_flags & RXf_PMf_MULTILINE)
6805             ret = reg_node(pRExC_state, MBOL);
6806         else if (RExC_flags & RXf_PMf_SINGLELINE)
6807             ret = reg_node(pRExC_state, SBOL);
6808         else
6809             ret = reg_node(pRExC_state, BOL);
6810         Set_Node_Length(ret, 1); /* MJD */
6811         break;
6812     case '$':
6813         nextchar(pRExC_state);
6814         if (*RExC_parse)
6815             RExC_seen_zerolen++;
6816         if (RExC_flags & RXf_PMf_MULTILINE)
6817             ret = reg_node(pRExC_state, MEOL);
6818         else if (RExC_flags & RXf_PMf_SINGLELINE)
6819             ret = reg_node(pRExC_state, SEOL);
6820         else
6821             ret = reg_node(pRExC_state, EOL);
6822         Set_Node_Length(ret, 1); /* MJD */
6823         break;
6824     case '.':
6825         nextchar(pRExC_state);
6826         if (RExC_flags & RXf_PMf_SINGLELINE)
6827             ret = reg_node(pRExC_state, SANY);
6828         else
6829             ret = reg_node(pRExC_state, REG_ANY);
6830         *flagp |= HASWIDTH|SIMPLE;
6831         RExC_naughty++;
6832         Set_Node_Length(ret, 1); /* MJD */
6833         break;
6834     case '[':
6835     {
6836         char * const oregcomp_parse = ++RExC_parse;
6837         ret = regclass(pRExC_state,depth+1);
6838         if (*RExC_parse != ']') {
6839             RExC_parse = oregcomp_parse;
6840             vFAIL("Unmatched [");
6841         }
6842         nextchar(pRExC_state);
6843         *flagp |= HASWIDTH|SIMPLE;
6844         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6845         break;
6846     }
6847     case '(':
6848         nextchar(pRExC_state);
6849         ret = reg(pRExC_state, 1, &flags,depth+1);
6850         if (ret == NULL) {
6851                 if (flags & TRYAGAIN) {
6852                     if (RExC_parse == RExC_end) {
6853                          /* Make parent create an empty node if needed. */
6854                         *flagp |= TRYAGAIN;
6855                         return(NULL);
6856                     }
6857                     goto tryagain;
6858                 }
6859                 return(NULL);
6860         }
6861         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
6862         break;
6863     case '|':
6864     case ')':
6865         if (flags & TRYAGAIN) {
6866             *flagp |= TRYAGAIN;
6867             return NULL;
6868         }
6869         vFAIL("Internal urp");
6870                                 /* Supposed to be caught earlier. */
6871         break;
6872     case '{':
6873         if (!regcurly(RExC_parse)) {
6874             RExC_parse++;
6875             goto defchar;
6876         }
6877         /* FALL THROUGH */
6878     case '?':
6879     case '+':
6880     case '*':
6881         RExC_parse++;
6882         vFAIL("Quantifier follows nothing");
6883         break;
6884     case 0xDF:
6885     case 0xC3:
6886     case 0xCE:
6887         do_foldchar:
6888         if (!LOC && FOLD) {
6889             U32 len,cp;
6890             len=0; /* silence a spurious compiler warning */
6891             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
6892                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
6893                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
6894                 ret = reganode(pRExC_state, FOLDCHAR, cp);
6895                 Set_Node_Length(ret, 1); /* MJD */
6896                 nextchar(pRExC_state); /* kill whitespace under /x */
6897                 return ret;
6898             }
6899         }
6900         goto outer_default;
6901     case '\\':
6902         /* Special Escapes
6903
6904            This switch handles escape sequences that resolve to some kind
6905            of special regop and not to literal text. Escape sequnces that
6906            resolve to literal text are handled below in the switch marked
6907            "Literal Escapes".
6908
6909            Every entry in this switch *must* have a corresponding entry
6910            in the literal escape switch. However, the opposite is not
6911            required, as the default for this switch is to jump to the
6912            literal text handling code.
6913         */
6914         switch ((U8)*++RExC_parse) {
6915         case 0xDF:
6916         case 0xC3:
6917         case 0xCE:
6918                    goto do_foldchar;
6919         /* Special Escapes */
6920         case 'A':
6921             RExC_seen_zerolen++;
6922             ret = reg_node(pRExC_state, SBOL);
6923             *flagp |= SIMPLE;
6924             goto finish_meta_pat;
6925         case 'G':
6926             ret = reg_node(pRExC_state, GPOS);
6927             RExC_seen |= REG_SEEN_GPOS;
6928             *flagp |= SIMPLE;
6929             goto finish_meta_pat;
6930         case 'K':
6931             RExC_seen_zerolen++;
6932             ret = reg_node(pRExC_state, KEEPS);
6933             *flagp |= SIMPLE;
6934             /* XXX:dmq : disabling in-place substitution seems to
6935              * be necessary here to avoid cases of memory corruption, as
6936              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
6937              */
6938             RExC_seen |= REG_SEEN_LOOKBEHIND;
6939             goto finish_meta_pat;
6940         case 'Z':
6941             ret = reg_node(pRExC_state, SEOL);
6942             *flagp |= SIMPLE;
6943             RExC_seen_zerolen++;                /* Do not optimize RE away */
6944             goto finish_meta_pat;
6945         case 'z':
6946             ret = reg_node(pRExC_state, EOS);
6947             *flagp |= SIMPLE;
6948             RExC_seen_zerolen++;                /* Do not optimize RE away */
6949             goto finish_meta_pat;
6950         case 'C':
6951             ret = reg_node(pRExC_state, CANY);
6952             RExC_seen |= REG_SEEN_CANY;
6953             *flagp |= HASWIDTH|SIMPLE;
6954             goto finish_meta_pat;
6955         case 'X':
6956             ret = reg_node(pRExC_state, CLUMP);
6957             *flagp |= HASWIDTH;
6958             goto finish_meta_pat;
6959         case 'w':
6960             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
6961             *flagp |= HASWIDTH|SIMPLE;
6962             goto finish_meta_pat;
6963         case 'W':
6964             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
6965             *flagp |= HASWIDTH|SIMPLE;
6966             goto finish_meta_pat;
6967         case 'b':
6968             RExC_seen_zerolen++;
6969             RExC_seen |= REG_SEEN_LOOKBEHIND;
6970             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
6971             *flagp |= SIMPLE;
6972             goto finish_meta_pat;
6973         case 'B':
6974             RExC_seen_zerolen++;
6975             RExC_seen |= REG_SEEN_LOOKBEHIND;
6976             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
6977             *flagp |= SIMPLE;
6978             goto finish_meta_pat;
6979         case 's':
6980             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
6981             *flagp |= HASWIDTH|SIMPLE;
6982             goto finish_meta_pat;
6983         case 'S':
6984             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
6985             *flagp |= HASWIDTH|SIMPLE;
6986             goto finish_meta_pat;
6987         case 'd':
6988             ret = reg_node(pRExC_state, DIGIT);
6989             *flagp |= HASWIDTH|SIMPLE;
6990             goto finish_meta_pat;
6991         case 'D':
6992             ret = reg_node(pRExC_state, NDIGIT);
6993             *flagp |= HASWIDTH|SIMPLE;
6994             goto finish_meta_pat;
6995         case 'R':
6996             ret = reg_node(pRExC_state, LNBREAK);
6997             *flagp |= HASWIDTH|SIMPLE;
6998             goto finish_meta_pat;
6999         case 'h':
7000             ret = reg_node(pRExC_state, HORIZWS);
7001             *flagp |= HASWIDTH|SIMPLE;
7002             goto finish_meta_pat;
7003         case 'H':
7004             ret = reg_node(pRExC_state, NHORIZWS);
7005             *flagp |= HASWIDTH|SIMPLE;
7006             goto finish_meta_pat;
7007         case 'v':
7008             ret = reg_node(pRExC_state, VERTWS);
7009             *flagp |= HASWIDTH|SIMPLE;
7010             goto finish_meta_pat;
7011         case 'V':
7012             ret = reg_node(pRExC_state, NVERTWS);
7013             *flagp |= HASWIDTH|SIMPLE;
7014          finish_meta_pat:
7015             nextchar(pRExC_state);
7016             Set_Node_Length(ret, 2); /* MJD */
7017             break;
7018         case 'p':
7019         case 'P':
7020             {
7021                 char* const oldregxend = RExC_end;
7022 #ifdef DEBUGGING
7023                 char* parse_start = RExC_parse - 2;
7024 #endif
7025
7026                 if (RExC_parse[1] == '{') {
7027                   /* a lovely hack--pretend we saw [\pX] instead */
7028                     RExC_end = strchr(RExC_parse, '}');
7029                     if (!RExC_end) {
7030                         const U8 c = (U8)*RExC_parse;
7031                         RExC_parse += 2;
7032                         RExC_end = oldregxend;
7033                         vFAIL2("Missing right brace on \\%c{}", c);
7034                     }
7035                     RExC_end++;
7036                 }
7037                 else {
7038                     RExC_end = RExC_parse + 2;
7039                     if (RExC_end > oldregxend)
7040                         RExC_end = oldregxend;
7041                 }
7042                 RExC_parse--;
7043
7044                 ret = regclass(pRExC_state,depth+1);
7045
7046                 RExC_end = oldregxend;
7047                 RExC_parse--;
7048
7049                 Set_Node_Offset(ret, parse_start + 2);
7050                 Set_Node_Cur_Length(ret);
7051                 nextchar(pRExC_state);
7052                 *flagp |= HASWIDTH|SIMPLE;
7053             }
7054             break;
7055         case 'N':
7056             /* Handle \N{NAME} here and not below because it can be
7057             multicharacter. join_exact() will join them up later on.
7058             Also this makes sure that things like /\N{BLAH}+/ and
7059             \N{BLAH} being multi char Just Happen. dmq*/
7060             ++RExC_parse;
7061             ret= reg_namedseq(pRExC_state, NULL);
7062             break;
7063         case 'k':    /* Handle \k<NAME> and \k'NAME' */
7064         parse_named_seq:
7065         {
7066             char ch= RExC_parse[1];
7067             if (ch != '<' && ch != '\'' && ch != '{') {
7068                 RExC_parse++;
7069                 vFAIL2("Sequence %.2s... not terminated",parse_start);
7070             } else {
7071                 /* this pretty much dupes the code for (?P=...) in reg(), if
7072                    you change this make sure you change that */
7073                 char* name_start = (RExC_parse += 2);
7074                 U32 num = 0;
7075                 SV *sv_dat = reg_scan_name(pRExC_state,
7076                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7077                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7078                 if (RExC_parse == name_start || *RExC_parse != ch)
7079                     vFAIL2("Sequence %.3s... not terminated",parse_start);
7080
7081                 if (!SIZE_ONLY) {
7082                     num = add_data( pRExC_state, 1, "S" );
7083                     RExC_rxi->data->data[num]=(void*)sv_dat;
7084                     SvREFCNT_inc_simple_void(sv_dat);
7085                 }
7086
7087                 RExC_sawback = 1;
7088                 ret = reganode(pRExC_state,
7089                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7090                            num);
7091                 *flagp |= HASWIDTH;
7092
7093                 /* override incorrect value set in reganode MJD */
7094                 Set_Node_Offset(ret, parse_start+1);
7095                 Set_Node_Cur_Length(ret); /* MJD */
7096                 nextchar(pRExC_state);
7097
7098             }
7099             break;
7100         }
7101         case 'g':
7102         case '1': case '2': case '3': case '4':
7103         case '5': case '6': case '7': case '8': case '9':
7104             {
7105                 I32 num;
7106                 bool isg = *RExC_parse == 'g';
7107                 bool isrel = 0;
7108                 bool hasbrace = 0;
7109                 if (isg) {
7110                     RExC_parse++;
7111                     if (*RExC_parse == '{') {
7112                         RExC_parse++;
7113                         hasbrace = 1;
7114                     }
7115                     if (*RExC_parse == '-') {
7116                         RExC_parse++;
7117                         isrel = 1;
7118                     }
7119                     if (hasbrace && !isDIGIT(*RExC_parse)) {
7120                         if (isrel) RExC_parse--;
7121                         RExC_parse -= 2;
7122                         goto parse_named_seq;
7123                 }   }
7124                 num = atoi(RExC_parse);
7125                 if (isg && num == 0)
7126                     vFAIL("Reference to invalid group 0");
7127                 if (isrel) {
7128                     num = RExC_npar - num;
7129                     if (num < 1)
7130                         vFAIL("Reference to nonexistent or unclosed group");
7131                 }
7132                 if (!isg && num > 9 && num >= RExC_npar)
7133                     goto defchar;
7134                 else {
7135                     char * const parse_start = RExC_parse - 1; /* MJD */
7136                     while (isDIGIT(*RExC_parse))
7137                         RExC_parse++;
7138                     if (parse_start == RExC_parse - 1)
7139                         vFAIL("Unterminated \\g... pattern");
7140                     if (hasbrace) {
7141                         if (*RExC_parse != '}')
7142                             vFAIL("Unterminated \\g{...} pattern");
7143                         RExC_parse++;
7144                     }
7145                     if (!SIZE_ONLY) {
7146                         if (num > (I32)RExC_rx->nparens)
7147                             vFAIL("Reference to nonexistent group");
7148                     }
7149                     RExC_sawback = 1;
7150                     ret = reganode(pRExC_state,
7151                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7152                                    num);
7153                     *flagp |= HASWIDTH;
7154
7155                     /* override incorrect value set in reganode MJD */
7156                     Set_Node_Offset(ret, parse_start+1);
7157                     Set_Node_Cur_Length(ret); /* MJD */
7158                     RExC_parse--;
7159                     nextchar(pRExC_state);
7160                 }
7161             }
7162             break;
7163         case '\0':
7164             if (RExC_parse >= RExC_end)
7165                 FAIL("Trailing \\");
7166             /* FALL THROUGH */
7167         default:
7168             /* Do not generate "unrecognized" warnings here, we fall
7169                back into the quick-grab loop below */
7170             parse_start--;
7171             goto defchar;
7172         }
7173         break;
7174
7175     case '#':
7176         if (RExC_flags & RXf_PMf_EXTENDED) {
7177             if ( reg_skipcomment( pRExC_state ) )
7178                 goto tryagain;
7179         }
7180         /* FALL THROUGH */
7181
7182     default:
7183         outer_default:{
7184             register STRLEN len;
7185             register UV ender;
7186             register char *p;
7187             char *s;
7188             STRLEN foldlen;
7189             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7190
7191             parse_start = RExC_parse - 1;
7192
7193             RExC_parse++;
7194
7195         defchar:
7196             ender = 0;
7197             ret = reg_node(pRExC_state,
7198                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
7199             s = STRING(ret);
7200             for (len = 0, p = RExC_parse - 1;
7201               len < 127 && p < RExC_end;
7202               len++)
7203             {
7204                 char * const oldp = p;
7205
7206                 if (RExC_flags & RXf_PMf_EXTENDED)
7207                     p = regwhite( pRExC_state, p );
7208                 switch ((U8)*p) {
7209                 case 0xDF:
7210                 case 0xC3:
7211                 case 0xCE:
7212                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7213                                 goto normal_default;
7214                 case '^':
7215                 case '$':
7216                 case '.':
7217                 case '[':
7218                 case '(':
7219                 case ')':
7220                 case '|':
7221                     goto loopdone;
7222                 case '\\':
7223                     /* Literal Escapes Switch
7224
7225                        This switch is meant to handle escape sequences that
7226                        resolve to a literal character.
7227
7228                        Every escape sequence that represents something
7229                        else, like an assertion or a char class, is handled
7230                        in the switch marked 'Special Escapes' above in this
7231                        routine, but also has an entry here as anything that
7232                        isn't explicitly mentioned here will be treated as
7233                        an unescaped equivalent literal.
7234                     */
7235
7236                     switch ((U8)*++p) {
7237                     /* These are all the special escapes. */
7238                     case 0xDF:
7239                     case 0xC3:
7240                     case 0xCE:
7241                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7242                                 goto normal_default;
7243                     case 'A':             /* Start assertion */
7244                     case 'b': case 'B':   /* Word-boundary assertion*/
7245                     case 'C':             /* Single char !DANGEROUS! */
7246                     case 'd': case 'D':   /* digit class */
7247                     case 'g': case 'G':   /* generic-backref, pos assertion */
7248                     case 'h': case 'H':   /* HORIZWS */
7249                     case 'k': case 'K':   /* named backref, keep marker */
7250                     case 'N':             /* named char sequence */
7251                     case 'p': case 'P':   /* Unicode property */
7252                               case 'R':   /* LNBREAK */
7253                     case 's': case 'S':   /* space class */
7254                     case 'v': case 'V':   /* VERTWS */
7255                     case 'w': case 'W':   /* word class */
7256                     case 'X':             /* eXtended Unicode "combining character sequence" */
7257                     case 'z': case 'Z':   /* End of line/string assertion */
7258                         --p;
7259                         goto loopdone;
7260
7261                     /* Anything after here is an escape that resolves to a
7262                        literal. (Except digits, which may or may not)
7263                      */
7264                     case 'n':
7265                         ender = '\n';
7266                         p++;
7267                         break;
7268                     case 'r':
7269                         ender = '\r';
7270                         p++;
7271                         break;
7272                     case 't':
7273                         ender = '\t';
7274                         p++;
7275                         break;
7276                     case 'f':
7277                         ender = '\f';
7278                         p++;
7279                         break;
7280                     case 'e':
7281                           ender = ASCII_TO_NATIVE('\033');
7282                         p++;
7283                         break;
7284                     case 'a':
7285                           ender = ASCII_TO_NATIVE('\007');
7286                         p++;
7287                         break;
7288                     case 'x':
7289                         if (*++p == '{') {
7290                             char* const e = strchr(p, '}');
7291
7292                             if (!e) {
7293                                 RExC_parse = p + 1;
7294                                 vFAIL("Missing right brace on \\x{}");
7295                             }
7296                             else {
7297                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7298                                     | PERL_SCAN_DISALLOW_PREFIX;
7299                                 STRLEN numlen = e - p - 1;
7300                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7301                                 if (ender > 0xff)
7302                                     RExC_utf8 = 1;
7303                                 p = e + 1;
7304                             }
7305                         }
7306                         else {
7307                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7308                             STRLEN numlen = 2;
7309                             ender = grok_hex(p, &numlen, &flags, NULL);
7310                             p += numlen;
7311                         }
7312                         if (PL_encoding && ender < 0x100)
7313                             goto recode_encoding;
7314                         break;
7315                     case 'c':
7316                         p++;
7317                         ender = UCHARAT(p++);
7318                         ender = toCTRL(ender);
7319                         break;
7320                     case '0': case '1': case '2': case '3':case '4':
7321                     case '5': case '6': case '7': case '8':case '9':
7322                         if (*p == '0' ||
7323                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
7324                             I32 flags = 0;
7325                             STRLEN numlen = 3;
7326                             ender = grok_oct(p, &numlen, &flags, NULL);
7327                             p += numlen;
7328                         }
7329                         else {
7330                             --p;
7331                             goto loopdone;
7332                         }
7333                         if (PL_encoding && ender < 0x100)
7334                             goto recode_encoding;
7335                         break;
7336                     recode_encoding:
7337                         {
7338                             SV* enc = PL_encoding;
7339                             ender = reg_recode((const char)(U8)ender, &enc);
7340                             if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7341                                 vWARN(p, "Invalid escape in the specified encoding");
7342                             RExC_utf8 = 1;
7343                         }
7344                         break;
7345                     case '\0':
7346                         if (p >= RExC_end)
7347                             FAIL("Trailing \\");
7348                         /* FALL THROUGH */
7349                     default:
7350                         if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
7351                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7352                         goto normal_default;
7353                     }
7354                     break;
7355                 default:
7356                   normal_default:
7357                     if (UTF8_IS_START(*p) && UTF) {
7358                         STRLEN numlen;
7359                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7360                                                &numlen, UTF8_ALLOW_DEFAULT);
7361                         p += numlen;
7362                     }
7363                     else
7364                         ender = *p++;
7365                     break;
7366                 }
7367                 if ( RExC_flags & RXf_PMf_EXTENDED)
7368                     p = regwhite( pRExC_state, p );
7369                 if (UTF && FOLD) {
7370                     /* Prime the casefolded buffer. */
7371                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7372                 }
7373                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7374                     if (len)
7375                         p = oldp;
7376                     else if (UTF) {
7377                          if (FOLD) {
7378                               /* Emit all the Unicode characters. */
7379                               STRLEN numlen;
7380                               for (foldbuf = tmpbuf;
7381                                    foldlen;
7382                                    foldlen -= numlen) {
7383                                    ender = utf8_to_uvchr(foldbuf, &numlen);
7384                                    if (numlen > 0) {
7385                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
7386                                         s       += unilen;
7387                                         len     += unilen;
7388                                         /* In EBCDIC the numlen
7389                                          * and unilen can differ. */
7390                                         foldbuf += numlen;
7391                                         if (numlen >= foldlen)
7392                                              break;
7393                                    }
7394                                    else
7395                                         break; /* "Can't happen." */
7396                               }
7397                          }
7398                          else {
7399                               const STRLEN unilen = reguni(pRExC_state, ender, s);
7400                               if (unilen > 0) {
7401                                    s   += unilen;
7402                                    len += unilen;
7403                               }
7404                          }
7405                     }
7406                     else {
7407                         len++;
7408                         REGC((char)ender, s++);
7409                     }
7410                     break;
7411                 }
7412                 if (UTF) {
7413                      if (FOLD) {
7414                           /* Emit all the Unicode characters. */
7415                           STRLEN numlen;
7416                           for (foldbuf = tmpbuf;
7417                                foldlen;
7418                                foldlen -= numlen) {
7419                                ender = utf8_to_uvchr(foldbuf, &numlen);
7420                                if (numlen > 0) {
7421                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
7422                                     len     += unilen;
7423                                     s       += unilen;
7424                                     /* In EBCDIC the numlen
7425                                      * and unilen can differ. */
7426                                     foldbuf += numlen;
7427                                     if (numlen >= foldlen)
7428                                          break;
7429                                }
7430                                else
7431                                     break;
7432                           }
7433                      }
7434                      else {
7435                           const STRLEN unilen = reguni(pRExC_state, ender, s);
7436                           if (unilen > 0) {
7437                                s   += unilen;
7438                                len += unilen;
7439                           }
7440                      }
7441                      len--;
7442                 }
7443                 else
7444                     REGC((char)ender, s++);
7445             }
7446         loopdone:
7447             RExC_parse = p - 1;
7448             Set_Node_Cur_Length(ret); /* MJD */
7449             nextchar(pRExC_state);
7450             {
7451                 /* len is STRLEN which is unsigned, need to copy to signed */
7452                 IV iv = len;
7453                 if (iv < 0)
7454                     vFAIL("Internal disaster");
7455             }
7456             if (len > 0)
7457                 *flagp |= HASWIDTH;
7458             if (len == 1 && UNI_IS_INVARIANT(ender))
7459                 *flagp |= SIMPLE;
7460
7461             if (SIZE_ONLY)
7462                 RExC_size += STR_SZ(len);
7463             else {
7464                 STR_LEN(ret) = len;
7465                 RExC_emit += STR_SZ(len);
7466             }
7467         }
7468         break;
7469     }
7470
7471     return(ret);
7472 }
7473
7474 /* Advance p over /x filler -- whitespace and '#' comments, each comment
7475  * running through its newline -- and return the first position that is
7476  * neither (or RExC_end).  A comment that reaches the end of the pattern
7477  * without a newline sets REG_SEEN_RUN_ON_COMMENT in RExC_seen. */
7478 STATIC char *
7479 S_regwhite( RExC_state_t *pRExC_state, char *p )
7480 {
7481     const char * const limit = RExC_end;
7482
7483     for (;;) {
7484         bool saw_newline = 0;
7485         if (p >= limit || (!isSPACE(*p) && *p != '#'))
7486             return p;                   /* end of pattern, or real content */
7487         if (*p != '#') {                /* ordinary whitespace */
7488             p++;
7489             continue;
7490         }
7491         while (p < limit && !saw_newline)   /* eat comment, newline included */
7492             saw_newline = (*p++ == '\n');
7493         if (!saw_newline)
7494             RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7495     }
7496 }
7497
7498 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7499    Character classes ([:foo:]) can also be negated ([:^foo:]).
7500    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7501    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7502    but trigger failures because they are currently unimplemented. */
7503
7504 #define POSIXCC_DONE(c)   ((c) == ':')   /* delimiter of the [:foo:] form */
7505 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')   /* delimiters of [=foo=] and [.foo.] */
7506 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))   /* any of the three; c may be evaluated up to three times */
7507 /* S_regpposixcc -- try to read a POSIX bracket construct ([:name:],
      * [:^name:], [=name=] or [.name.]) inside a character class.
      *
      *   value   character the caller has just consumed; nothing is attempted
      *           unless it is '['.  RExC_parse is then expected to point at the
      *           delimiter (':', '=' or '.') that follows it.
      *
      * Returns the ANYOF_* id of a recognised [:name:] (the negated id for
      * [:^name:]), or OOB_NAMEDCLASS when no class was recognised.
      *
      * RExC_parse ends up just past the closing ']' when the construct is
      * properly closed by "<delimiter>]"; otherwise it is put back on the
      * opening delimiter.  An unknown [:name:] is reported through
      * Simple_vFAIL3, as are the [= =] and [. .] forms (the latter only outside
      * the sizing pass). */
7508 STATIC I32
7509 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7510 {
7511     dVAR;
7512     I32 namedclass = OOB_NAMEDCLASS;
7513
7514     if (value == '[' && RExC_parse + 1 < RExC_end &&
7515         /* I smell either [: or [= or [. -- POSIX has been here, right? */
7516         POSIXCC(UCHARAT(RExC_parse))) {
7517         const char c = UCHARAT(RExC_parse);  /* the delimiter in use */
7518         char* const s = RExC_parse++;        /* s stays on the opening delimiter */
7519
          /* look for the matching closing delimiter */
7520         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
7521             RExC_parse++;
7522         if (RExC_parse == RExC_end)
7523             /* Grandfather lone [:, [=, [. */
7524             RExC_parse = s;
7525         else {
7526             const char* const t = RExC_parse++; /* skip over the c */
7527             assert(*t == c);
7528
7529             if (UCHARAT(RExC_parse) == ']') {
7530                 const char *posixcc = s + 1;
7531                 RExC_parse++; /* skip over the ending ] */
7532
7533                 if (*s == ':') {
                      /* a leading '^' negates the class and is not part of the name */
7534                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7535                     const I32 skip = t - posixcc;   /* length of the name */
7536
7537                     /* Initially switch on the length of the name.  */
7538                     switch (skip) {
7539                     case 4:
7540                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7541                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
7542                         break;
7543                     case 5:
7544                         /* Names all of length 5.  */
7545                         /* alnum alpha ascii blank cntrl digit graph lower
7546                            print punct space upper  */
7547                         /* Offset 4 gives the best switch position.  */
7548                         switch (posixcc[4]) {
7549                         case 'a':
7550                             if (memEQ(posixcc, "alph", 4)) /* alpha */
7551                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
7552                             break;
7553                         case 'e':
7554                             if (memEQ(posixcc, "spac", 4)) /* space */
7555                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
7556                             break;
7557                         case 'h':
7558                             if (memEQ(posixcc, "grap", 4)) /* graph */
7559                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
7560                             break;
7561                         case 'i':
7562                             if (memEQ(posixcc, "asci", 4)) /* ascii */
7563                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
7564                             break;
7565                         case 'k':
7566                             if (memEQ(posixcc, "blan", 4)) /* blank */
7567                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
7568                             break;
7569                         case 'l':
7570                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7571                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
7572                             break;
7573                         case 'm':
7574                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
7575                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7576                             break;
7577                         case 'r':
7578                             if (memEQ(posixcc, "lowe", 4)) /* lower */
7579                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7580                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
7581                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7582                             break;
7583                         case 't':
7584                             if (memEQ(posixcc, "digi", 4)) /* digit */
7585                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7586                             else if (memEQ(posixcc, "prin", 4)) /* print */
7587                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7588                             else if (memEQ(posixcc, "punc", 4)) /* punct */
7589                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7590                             break;
7591                         }
7592                         break;
7593                     case 6:
7594                         if (memEQ(posixcc, "xdigit", 6))
7595                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7596                         break;
7597                     }
7598
7599                     if (namedclass == OOB_NAMEDCLASS)
                          /* "%.*s" consumes an int precision, but t - s is a
                           * pointer difference: convert explicitly so the
                           * variadic argument has the type the format expects.
                           * The text printed includes any leading '^'. */
7600                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7601                                       (int)(t - s - 1), s + 1);
7602                     assert (posixcc[skip] == ':');
7603                     assert (posixcc[skip+1] == ']');
7604                 } else if (!SIZE_ONLY) {
7605                     /* [[=foo=]] and [[.foo.]] are still future. */
7606
7607                     /* adjust RExC_parse so the warning shows after
7608                        the class closes */
7609                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7610                         RExC_parse++;
7611                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7612                 }
7613             } else {
7614                 /* Maternal grandfather:
7615                  * "[:" ending in ":" but not in ":]" */
7616                 RExC_parse = s;
7617             }
7618         }
7619     }
7620
7621     return namedclass;
7622 }
7623
7624 /* If the text at RExC_parse has the shape <c>name<c>] (<c> being ':', '='
7625  * or '.'), warn that POSIX class syntax belongs inside a character class;
7626  * the reserved '=' and '.' forms additionally raise an error. */
7627 STATIC void
7628 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7629 {
7630     dVAR;
7631     const char * const first = RExC_parse;
7632     const char delim = *first;
7633     const char *q = first + 1;
7634
7635     if (!POSIXCC(UCHARAT(first)))
7636         return;
7637     while (isALNUM(*q))                 /* step over the would-be name */
7638         q++;
7639     if (!*q || *q != delim || q[1] != ']')
7640         return;                         /* not closed by <c>] */
7641     if (ckWARN(WARN_REGEXP))
7642         vWARN3(q+2, "POSIX syntax [%c %c] belongs inside character classes",
7643                delim, delim);
7644     if (!POSIXCC_NOTYET(delim))
7645         return;
7646     /* Put RExC_parse after the next ']' (or on the NUL) so the error points past the class. */
7647     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7648         NOOP;
7649     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", delim, delim);
7650 }
7651
7652 /* _C_C_T_(NAME, TEST, WORD) -- body of two switch arms: one for a named
      * class and one for its negation.  The expansion starts with the bare
      * text "ANYOF_<NAME>:" and ends with a "break" that has no semicolon, so
      * the use site presumably writes "case" in front and ";" behind (the uses
      * are not visible in this part of the file -- confirm there).
      *
      *   NAME  suffix pasted onto ANYOF_ and ANYOF_N to form the two labels
      *   TEST  expression over "value", tried for every value in 0..255; the
      *         negated arm writes it as "!TEST" without extra parentheses, so
      *         it should be a single parenthesised or primary expression
      *   WORD  stored into "what"
      *
      * When LOC is true the class bit is recorded with ANYOF_CLASS_SET and the
      * bitmap is left alone; otherwise a bitmap bit is set for each value that
      * passes TEST (positive arm) or fails it (negated arm).  "yesno" is set to
      * '+' or '!' respectively.  Relies on ret, value, yesno and what being in
      * scope wherever the macro is expanded. */
7653 #define _C_C_T_(NAME,TEST,WORD)                         \
7654 ANYOF_##NAME:                                           \
7655     if (LOC)                                            \
7656         ANYOF_CLASS_SET(ret, ANYOF_##NAME);             \
7657     else {                                              \
7658         for (value = 0; value < 256; value++)           \
7659             if (TEST)                                   \
7660                 ANYOF_BITMAP_SET(ret, value);           \
7661     }                                                   \
7662     yesno = '+';                                        \
7663     what = WORD;                                        \
7664     break;                                              \
7665 case ANYOF_N##NAME:                                     \
7666     if (LOC)                                            \
7667         ANYOF_CLASS_SET(ret, ANYOF_N##NAME);            \
7668     else {                                              \
7669         for (value = 0; value < 256; value++)           \
7670             if (!TEST)                                  \
7671                 ANYOF_BITMAP_SET(ret, value);           \
7672     }                                                   \
7673     yesno = '!';                                        \
7674     what = WORD;                                        \
7675     break
7676 /* _C_C_T_NOLOC_(NAME, TEST, WORD) -- body of two switch arms (a named class
      * and its negation) that never consults LOC: the bitmap is always filled
      * directly.  The expansion starts with the bare text "ANYOF_<NAME>:" and
      * ends with a "break" that has no semicolon, so the use site presumably
      * supplies the leading "case" and trailing ";" (uses are not visible in
      * this part of the file -- confirm there).
      *
      *   NAME  suffix pasted onto ANYOF_ and ANYOF_N to form the two labels
      *   TEST  expression over "value", tried for every value in 0..255; the
      *         negated arm writes it as "!TEST" without extra parentheses
      *   WORD  stored into "what"
      *
      * Sets a bitmap bit in ret for each value that passes TEST (positive arm)
      * or fails it (negated arm), and sets "yesno" to '+' or '!' respectively.
      * Relies on ret, value, yesno and what being in scope at the use site. */
7677 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                   \
7678 ANYOF_##NAME:                                           \
7679         for (value = 0; value < 256; value++)           \
7680             if (TEST)                                   \
7681                 ANYOF_BITMAP_SET(ret, value);           \
7682     yesno = '+';                                        \
7683     what = WORD;                                        \
7684     break;                                              \
7685 case ANYOF_N##NAME:                                     \
7686         for (value = 0; value < 256; value++)           \
7687             if (!TEST)                                  \
7688                 ANYOF_BITMAP_SET(ret, value);           \
7689     yesno = '!';                                        \
7690     what = WORD;                                        \
7691     break
7692
7693 /*
7694    parse a class specification and produce either an ANYOF node that
7695    matches the pattern or if the pattern matches a single char only and
7696    that char is < 256 and we are case insensitive then we produce an
7697    EXACT node instead.
7698 */
7699
7700 STATIC regnode *
7701 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7702 {
7703     dVAR;
7704     register UV nextvalue;
7705     register IV prevvalue = OOB_UNICODE;
7706     register IV range = 0;
7707     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
7708     register regnode *ret;
7709     STRLEN numlen;
7710     IV namedclass;
7711     char *rangebegin = NULL;
7712     bool need_class = 0;
7713     SV *listsv = NULL;
7714     UV n;
7715     bool optimize_invert   = TRUE;
7716     AV* unicode_alternate  = NULL;
7717 #ifdef EBCDIC
7718     UV literal_endpoint = 0;
7719 #endif
7720     UV stored = 0;  /* number of chars stored in the class */
7721
7722     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7723         case we need to change the emitted regop to an EXACT. */
7724     const char * orig_parse = RExC_parse;
7725     GET_RE_DEBUG_FLAGS_DECL;
7726 #ifndef DEBUGGING
7727     PERL_UNUSED_ARG(depth);
7728 #endif
7729
7730     DEBUG_PARSE("clas");
7731
7732     /* Assume we are going to generate an ANYOF node. */
7733     ret = reganode(pRExC_state, ANYOF, 0);
7734
7735     if (!SIZE_ONLY)
7736         ANYOF_FLAGS(ret) = 0;
7737
7738     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
7739         RExC_naughty++;
7740         RExC_parse++;
7741         if (!SIZE_ONLY)
7742             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7743     }
7744
7745     if (SIZE_ONLY) {
7746         RExC_size += ANYOF_SKIP;
7747         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7748     }
7749     else {
7750         RExC_emit += ANYOF_SKIP;
7751         if (FOLD)
7752             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7753         if (LOC)
7754             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7755         ANYOF_BITMAP_ZERO(ret);
7756         listsv = newSVpvs("# comment\n");
7757     }
7758
7759     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7760
7761     if (!SIZE_ONLY && POSIXCC(nextvalue))
7762         checkposixcc(pRExC_state);
7763
7764     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7765     if (UCHARAT(RExC_parse) == ']')
7766         goto charclassloop;
7767
7768 parseit:
7769     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7770
7771     charclassloop:
7772
7773         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7774
7775         if (!range)
7776             rangebegin = RExC_parse;
7777         if (UTF) {
7778             value = utf8n_to_uvchr((U8*)RExC_parse,
7779                                    RExC_end - RExC_parse,
7780                                    &numlen, UTF8_ALLOW_DEFAULT);
7781             RExC_parse += numlen;
7782         }
7783         else
7784             value = UCHARAT(RExC_parse++);
7785
7786         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7787         if (value == '[' && POSIXCC(nextvalue))
7788             namedclass = regpposixcc(pRExC_state, value);
7789         else if (value == '\\') {
7790             if (UTF) {
7791                 value = utf8n_to_uvchr((U8*)RExC_parse,
7792                                    RExC_end - RExC_parse,
7793                                    &numlen, UTF8_ALLOW_DEFAULT);
7794                 RExC_parse += numlen;
7795             }
7796             else
7797                 value = UCHARAT(RExC_parse++);
7798             /* Some compilers cannot handle switching on 64-bit integer
7799              * values, therefore value cannot be an UV.  Yes, this will
7800              * be a problem later if we want switch on Unicode.
7801              * A similar issue a little bit later when switching on
7802              * namedclass. --jhi */
7803             switch ((I32)value) {
7804             case 'w':   namedclass = ANYOF_ALNUM;       break;
7805             case 'W':   namedclass = ANYOF_NALNUM;      break;
7806             case 's':   namedclass = ANYOF_SPACE;       break;
7807             case 'S':   namedclass = ANYOF_NSPACE;      break;
7808             case 'd':   namedclass = ANYOF_DIGIT;       break;
7809             case 'D':   namedclass = ANYOF_NDIGIT;      break;
7810             case 'v':   namedclass = ANYOF_VERTWS;      break;
7811             case 'V':   namedclass = ANYOF_NVERTWS;     break;
7812             case 'h':   namedclass = ANYOF_HORIZWS;     break;
7813             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
7814             case 'N':  /* Handle \N{NAME} in class */
7815                 {
7816                     /* We only pay attention to the first char of
7817                     multichar strings being returned. I kinda wonder
7818                     if this makes sense as it does change the behaviour
7819                     from earlier versions, OTOH that behaviour was broken
7820                     as well. */
7821                     UV v; /* value is register so we cant & it /grrr */
7822                     if (reg_namedseq(pRExC_state, &v)) {
7823                         goto parseit;
7824                     }
7825                     value= v;
7826                 }
7827                 break;
7828             case 'p':
7829             case 'P':
7830                 {
7831                 char *e;
7832                 if (RExC_parse >= RExC_end)
7833                     vFAIL2("Empty \\%c{}", (U8)value);
7834                 if (*RExC_parse == '{') {
7835                     const U8 c = (U8)value;
7836                     e = strchr(RExC_parse++, '}');
7837                     if (!e)
7838                         vFAIL2("Missing right brace on \\%c{}", c);
7839                     while (isSPACE(UCHARAT(RExC_parse)))
7840                         RExC_parse++;
7841                     if (e == RExC_parse)
7842                         vFAIL2("Empty \\%c{}", c);
7843                     n = e - RExC_parse;
7844                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7845                         n--;
7846                 }
7847                 else {
7848                     e = RExC_parse;
7849                     n = 1;
7850                 }
7851                 if (!SIZE_ONLY) {
7852                     if (UCHARAT(RExC_parse) == '^') {
7853                          RExC_parse++;
7854                          n--;
7855                          value = value == 'p' ? 'P' : 'p'; /* toggle */
7856                          while (isSPACE(UCHARAT(RExC_parse))) {
7857                               RExC_parse++;
7858                               n--;
7859                          }
7860                     }
7861                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7862                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7863                 }
7864                 RExC_parse = e + 1;
7865                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7866                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
7867                 }
7868                 break;
7869             case 'n':   value = '\n';                   break;
7870             case 'r':   value = '\r';                   break;
7871             case 't':   value = '\t';                   break;
7872             case 'f':   value = '\f';                   break;
7873             case 'b':   value = '\b';                   break;
7874             case 'e':   value = ASCII_TO_NATIVE('\033');break;
7875             case 'a':   value = ASCII_TO_NATIVE('\007');break;
7876             case 'x':
7877                 if (*RExC_parse == '{') {
7878                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7879                         | PERL_SCAN_DISALLOW_PREFIX;
7880                     char * const e = strchr(RExC_parse++, '}');
7881                     if (!e)
7882                         vFAIL("Missing right brace on \\x{}");
7883
7884                     numlen = e - RExC_parse;
7885                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7886                     RExC_parse = e + 1;
7887                 }
7888                 else {
7889                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7890                     numlen = 2;
7891                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7892                     RExC_parse += numlen;
7893                 }
7894                 if (PL_encoding && value < 0x100)
7895                     goto recode_encoding;
7896                 break;
7897             case 'c':
7898                 value = UCHARAT(RExC_parse++);
7899                 value = toCTRL(value);
7900                 break;
7901             case '0': case '1': case '2': case '3': case '4':
7902             case '5': case '6': case '7': case '8': case '9':
7903                 {
7904                     I32 flags = 0;
7905                     numlen = 3;
7906                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7907                     RExC_parse += numlen;
7908                     if (PL_encoding && value < 0x100)
7909                         goto recode_encoding;
7910                     break;
7911                 }
7912             recode_encoding:
7913                 {
7914                     SV* enc = PL_encoding;
7915                     value = reg_recode((const char)(U8)value, &enc);
7916                     if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7917                         vWARN(RExC_parse,
7918                               "Invalid escape in the specified encoding");
7919                     break;
7920                 }
7921             default:
7922                 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7923                     vWARN2(RExC_parse,
7924                            "Unrecognized escape \\%c in character class passed through",
7925                            (int)value);
7926                 break;
7927             }
7928         } /* end of \blah */
7929 #ifdef EBCDIC
7930         else
7931             literal_endpoint++;
7932 #endif
7933
7934         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7935
7936             if (!SIZE_ONLY && !need_class)
7937                 ANYOF_CLASS_ZERO(ret);
7938
7939             need_class = 1;
7940
7941             /* a bad range like a-\d, a-[:digit:] ? */
7942             if (range) {
7943                 if (!SIZE_ONLY) {
7944                     if (ckWARN(WARN_REGEXP)) {
7945                         const int w =
7946                             RExC_parse >= rangebegin ?
7947                             RExC_parse - rangebegin : 0;
7948                         vWARN4(RExC_parse,
7949                                "False [] range \"%*.*s\"",
7950                                w, w, rangebegin);
7951                     }
7952                     if (prevvalue < 256) {
7953                         ANYOF_BITMAP_SET(ret, prevvalue);
7954                         ANYOF_BITMAP_SET(ret, '-');
7955                     }
7956                     else {
7957                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7958                         Perl_sv_catpvf(aTHX_ listsv,
7959                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7960                     }
7961                 }
7962
7963                 range = 0; /* this was not a true range */
7964             }
7965
7966
7967
7968             if (!SIZE_ONLY) {
7969                 const char *what = NULL;
7970                 char yesno = 0;
7971
7972                 if (namedclass > OOB_NAMEDCLASS)
7973                     optimize_invert = FALSE;
7974                 /* Possible truncation here but in some 64-bit environments
7975                  * the compiler gets heartburn about switch on 64-bit values.
7976                  * A similar issue a little earlier when switching on value.
7977                  * --jhi */
7978                 switch ((I32)namedclass) {
7979                 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7980                 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7981                 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7982                 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7983                 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7984                 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7985                 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7986                 case _C_C_T_(PRINT, isPRINT(value), "Print");
7987                 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7988                 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7989                 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7990                 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7991                 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
7992                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
7993                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
7994                 case ANYOF_ASCII:
7995                     if (LOC)
7996                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7997                     else {
7998 #ifndef EBCDIC
7999                         for (value = 0; value < 128; value++)
8000                             ANYOF_BITMAP_SET(ret, value);
8001 #else  /* EBCDIC */
8002                         for (value = 0; value < 256; value++) {
8003                             if (isASCII(value))
8004                                 ANYOF_BITMAP_SET(ret, value);
8005                         }
8006 #endif /* EBCDIC */
8007                     }
8008                     yesno = '+';
8009                     what = "ASCII";
8010                     break;
8011                 case ANYOF_NASCII:
8012                     if (LOC)
8013                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8014                     else {
8015 #ifndef EBCDIC
8016                         for (value = 128; value < 256; value++)
8017                             ANYOF_BITMAP_SET(ret, value);
8018 #else  /* EBCDIC */
8019                         for (value = 0; value < 256; value++) {
8020                             if (!isASCII(value))
8021                                 ANYOF_BITMAP_SET(ret, value);
8022                         }
8023 #endif /* EBCDIC */
8024                     }
8025                     yesno = '!';
8026                     what = "ASCII";
8027                     break;
8028                 case ANYOF_DIGIT:
8029                     if (LOC)
8030                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8031                     else {
8032                         /* consecutive digits assumed */
8033                         for (value = '0'; value <= '9'; value++)
8034                             ANYOF_BITMAP_SET(ret, value);
8035                     }
8036                     yesno = '+';
8037                     what = "Digit";
8038                     break;
8039                 case ANYOF_NDIGIT:
8040                     if (LOC)
8041                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8042                     else {
8043                         /* consecutive digits assumed */
8044                         for (value = 0; value < '0'; value++)
8045                             ANYOF_BITMAP_SET(ret, value);
8046                         for (value = '9' + 1; value < 256; value++)
8047                             ANYOF_BITMAP_SET(ret, value);
8048                     }
8049                     yesno = '!';
8050                     what = "Digit";
8051                     break;
8052                 case ANYOF_MAX:
8053                     /* this is to handle \p and \P */
8054                     break;
8055                 default:
8056                     vFAIL("Invalid [::] class");
8057                     break;
8058                 }
8059                 if (what) {
8060                     /* Strings such as "+utf8::isWord\n" */
8061                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8062                 }
8063                 if (LOC)
8064                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8065                 continue;
8066             }
8067         } /* end of namedclass \blah */
8068
8069         if (range) {
8070             if (prevvalue > (IV)value) /* b-a */ {
8071                 const int w = RExC_parse - rangebegin;
8072                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8073                 range = 0; /* not a valid range */
8074             }
8075         }
8076         else {
8077             prevvalue = value; /* save the beginning of the range */
8078             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8079                 RExC_parse[1] != ']') {
8080                 RExC_parse++;
8081
8082                 /* a bad range like \w-, [:word:]- ? */
8083                 if (namedclass > OOB_NAMEDCLASS) {
8084                     if (ckWARN(WARN_REGEXP)) {
8085                         const int w =
8086                             RExC_parse >= rangebegin ?
8087                             RExC_parse - rangebegin : 0;
8088                         vWARN4(RExC_parse,
8089                                "False [] range \"%*.*s\"",
8090                                w, w, rangebegin);
8091                     }
8092                     if (!SIZE_ONLY)
8093                         ANYOF_BITMAP_SET(ret, '-');
8094                 } else
8095                     range = 1;  /* yeah, it's a range! */
8096                 continue;       /* but do it the next time */
8097             }
8098         }
8099
8100         /* now is the next time */
8101         /*stored += (value - prevvalue + 1);*/
8102         if (!SIZE_ONLY) {
8103             if (prevvalue < 256) {
8104                 const IV ceilvalue = value < 256 ? value : 255;
8105                 IV i;
8106 #ifdef EBCDIC
8107                 /* In EBCDIC [\x89-\x91] should include
8108                  * the \x8e but [i-j] should not. */
8109                 if (literal_endpoint == 2 &&
8110                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8111                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8112                 {
8113                     if (isLOWER(prevvalue)) {
8114                         for (i = prevvalue; i <= ceilvalue; i++)
8115                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8116                                 stored++;
8117                                 ANYOF_BITMAP_SET(ret, i);
8118                             }
8119                     } else {
8120                         for (i = prevvalue; i <= ceilvalue; i++)
8121                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8122                                 stored++;
8123                                 ANYOF_BITMAP_SET(ret, i);
8124                             }
8125                     }
8126                 }
8127                 else
8128 #endif
8129                       for (i = prevvalue; i <= ceilvalue; i++) {
8130                         if (!ANYOF_BITMAP_TEST(ret,i)) {
8131                             stored++;
8132                             ANYOF_BITMAP_SET(ret, i);
8133                         }
8134                       }
8135           }
8136           if (value > 255 || UTF) {
8137                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
8138                 const UV natvalue      = NATIVE_TO_UNI(value);
8139                 stored+=2; /* can't optimize this class */
8140                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8141                 if (prevnatvalue < natvalue) { /* what about > ? */
8142                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8143                                    prevnatvalue, natvalue);
8144                 }
8145                 else if (prevnatvalue == natvalue) {
8146                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8147                     if (FOLD) {
8148                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8149                          STRLEN foldlen;
8150                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8151
8152 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8153                          if (RExC_precomp[0] == ':' &&
8154                              RExC_precomp[1] == '[' &&
8155                              (f == 0xDF || f == 0x92)) {
8156                              f = NATIVE_TO_UNI(f);
8157                         }
8158 #endif
8159                          /* If folding and foldable and a single
8160                           * character, insert also the folded version
8161                           * to the charclass. */
8162                          if (f != value) {
8163 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8164                              if ((RExC_precomp[0] == ':' &&
8165                                   RExC_precomp[1] == '[' &&
8166                                   (f == 0xA2 &&
8167                                    (value == 0xFB05 || value == 0xFB06))) ?
8168                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
8169                                  foldlen == (STRLEN)UNISKIP(f) )
8170 #else
8171                               if (foldlen == (STRLEN)UNISKIP(f))
8172 #endif
8173                                   Perl_sv_catpvf(aTHX_ listsv,
8174                                                  "%04"UVxf"\n", f);
8175                               else {
8176                                   /* Any multicharacter foldings
8177                                    * require the following transform:
8178                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8179                                    * where E folds into "pq" and F folds
8180                                    * into "rst", all other characters
8181                                    * fold to single characters.  We save
8182                                    * away these multicharacter foldings,
8183                                    * to be later saved as part of the
8184                                    * additional "s" data. */
8185                                   SV *sv;
8186
8187                                   if (!unicode_alternate)
8188                                       unicode_alternate = newAV();
8189                                   sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8190                                                      TRUE);
8191                                   av_push(unicode_alternate, sv);
8192                               }
8193                          }
8194
8195                          /* If folding and the value is one of the Greek
8196                           * sigmas insert a few more sigmas to make the
8197                           * folding rules of the sigmas to work right.
8198                           * Note that not all the possible combinations
8199                           * are handled here: some of them are handled
8200                           * by the standard folding rules, and some of
8201                           * them (literal or EXACTF cases) are handled
8202                           * during runtime in regexec.c:S_find_byclass(). */
8203                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8204                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8205                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8206                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8207                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8208                          }
8209                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8210                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8211                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8212                     }
8213                 }
8214             }
8215 #ifdef EBCDIC
8216             literal_endpoint = 0;
8217 #endif
8218         }
8219
8220         range = 0; /* this range (if it was one) is done now */
8221     }
8222
8223     if (need_class) {
8224         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
8225         if (SIZE_ONLY)
8226             RExC_size += ANYOF_CLASS_ADD_SKIP;
8227         else
8228             RExC_emit += ANYOF_CLASS_ADD_SKIP;
8229     }
8230
8231
8232     if (SIZE_ONLY)
8233         return ret;
8234     /****** !SIZE_ONLY AFTER HERE *********/
8235
8236     if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8237         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8238     ) {
8239         /* optimize single char class to an EXACT node
8240            but *only* when its not a UTF/high char  */
8241         const char * cur_parse= RExC_parse;
8242         RExC_emit = (regnode *)orig_emit;
8243         RExC_parse = (char *)orig_parse;
8244         ret = reg_node(pRExC_state,
8245                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8246         RExC_parse = (char *)cur_parse;
8247         *STRING(ret)= (char)value;
8248         STR_LEN(ret)= 1;
8249         RExC_emit += STR_SZ(1);
8250         return ret;
8251     }
8252     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8253     if ( /* If the only flag is folding (plus possibly inversion). */
8254         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8255        ) {
8256         for (value = 0; value < 256; ++value) {
8257             if (ANYOF_BITMAP_TEST(ret, value)) {
8258                 UV fold = PL_fold[value];
8259
8260                 if (fold != value)
8261                     ANYOF_BITMAP_SET(ret, fold);
8262             }
8263         }
8264         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8265     }
8266
8267     /* optimize inverted simple patterns (e.g. [^a-z]) */
8268     if (optimize_invert &&
8269         /* If the only flag is inversion. */
8270         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8271         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8272             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8273         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8274     }
8275     {
8276         AV * const av = newAV();
8277         SV *rv;
8278         /* The 0th element stores the character class description
8279          * in its textual form: used later (regexec.c:Perl_regclass_swash())
8280          * to initialize the appropriate swash (which gets stored in
8281          * the 1st element), and also useful for dumping the regnode.
8282          * The 2nd element stores the multicharacter foldings,
8283          * used later (regexec.c:S_reginclass()). */
8284         av_store(av, 0, listsv);
8285         av_store(av, 1, NULL);
8286         av_store(av, 2, (SV*)unicode_alternate);
8287         rv = newRV_noinc((SV*)av);
8288         n = add_data(pRExC_state, 1, "s");
8289         RExC_rxi->data->data[n] = (void*)rv;
8290         ARG_SET(ret, n);
8291     }
8292     return ret;
8293 }
8294 #undef _C_C_T_
8295
8296
8297 /* reg_skipcomment()
8298
   Absorbs an /x style # comment from the input stream.
8300    Returns true if there is more text remaining in the stream.
8301    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8302    terminates the pattern without including a newline.
8303
   Note it's the caller's responsibility to ensure that we are
   actually in /x mode
8306
8307 */
8308
8309 STATIC bool
8310 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8311 {
8312     bool ended = 0;
8313     while (RExC_parse < RExC_end)
8314         if (*RExC_parse++ == '\n') {
8315             ended = 1;
8316             break;
8317         }
8318     if (!ended) {
8319         /* we ran off the end of the pattern without ending
8320            the comment, so we have to add an \n when wrapping */
8321         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8322         return 0;
8323     } else
8324         return 1;
8325 }
8326
8327 /* nextchar()
8328
   Advances the parse position, and optionally absorbs
   "whitespace" from the input stream.
8331
8332    Without /x "whitespace" means (?#...) style comments only,
8333    with /x this means (?#...) and # comments and whitespace proper.
8334
8335    Returns the RExC_parse point from BEFORE the scan occurs.
8336
8337    This is the /x friendly way of saying RExC_parse++.
8338 */
8339
8340 STATIC char*
8341 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
8342 {
8343     char* const retval = RExC_parse++;
8344
8345     for (;;) {
8346         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8347                 RExC_parse[2] == '#') {
8348             while (*RExC_parse != ')') {
8349                 if (RExC_parse == RExC_end)
8350                     FAIL("Sequence (?#... not terminated");
8351                 RExC_parse++;
8352             }
8353             RExC_parse++;
8354             continue;
8355         }
8356         if (RExC_flags & RXf_PMf_EXTENDED) {
8357             if (isSPACE(*RExC_parse)) {
8358                 RExC_parse++;
8359                 continue;
8360             }
8361             else if (*RExC_parse == '#') {
8362                 if ( reg_skipcomment( pRExC_state ) )
8363                     continue;
8364             }
8365         }
8366         return retval;
8367     }
8368 }
8369
8370 /*
8371 - reg_node - emit a node
8372 */
8373 STATIC regnode *                        /* Location. */
8374 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
8375 {
8376     dVAR;
8377     register regnode *ptr;
8378     regnode * const ret = RExC_emit;
8379     GET_RE_DEBUG_FLAGS_DECL;
8380
8381     if (SIZE_ONLY) {
8382         SIZE_ALIGN(RExC_size);
8383         RExC_size += 1;
8384         return(ret);
8385     }
8386     if (RExC_emit >= RExC_emit_bound)
8387         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8388
8389     NODE_ALIGN_FILL(ret);
8390     ptr = ret;
8391     FILL_ADVANCE_NODE(ptr, op);
8392 #ifdef RE_TRACK_PATTERN_OFFSETS
8393     if (RExC_offsets) {         /* MJD */
8394         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
8395               "reg_node", __LINE__,
8396               PL_reg_name[op],
8397               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8398                 ? "Overwriting end of array!\n" : "OK",
8399               (UV)(RExC_emit - RExC_emit_start),
8400               (UV)(RExC_parse - RExC_start),
8401               (UV)RExC_offsets[0]));
8402         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
8403     }
8404 #endif
8405     RExC_emit = ptr;
8406     return(ret);
8407 }
8408
8409 /*
8410 - reganode - emit a node with an argument
8411 */
8412 STATIC regnode *                        /* Location. */
8413 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
8414 {
8415     dVAR;
8416     register regnode *ptr;
8417     regnode * const ret = RExC_emit;
8418     GET_RE_DEBUG_FLAGS_DECL;
8419
8420     if (SIZE_ONLY) {
8421         SIZE_ALIGN(RExC_size);
8422         RExC_size += 2;
8423         /*
8424            We can't do this:
8425
8426            assert(2==regarglen[op]+1);
8427
8428            Anything larger than this has to allocate the extra amount.
8429            If we changed this to be:
8430
8431            RExC_size += (1 + regarglen[op]);
8432
8433            then it wouldn't matter. Its not clear what side effect
8434            might come from that so its not done so far.
8435            -- dmq
8436         */
8437         return(ret);
8438     }
8439     if (RExC_emit >= RExC_emit_bound)
8440         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8441
8442     NODE_ALIGN_FILL(ret);
8443     ptr = ret;
8444     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
8445 #ifdef RE_TRACK_PATTERN_OFFSETS
8446     if (RExC_offsets) {         /* MJD */
8447         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8448               "reganode",
8449               __LINE__,
8450               PL_reg_name[op],
8451               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
8452               "Overwriting end of array!\n" : "OK",
8453               (UV)(RExC_emit - RExC_emit_start),
8454               (UV)(RExC_parse - RExC_start),
8455               (UV)RExC_offsets[0]));
8456         Set_Cur_Node_Offset;
8457     }
8458 #endif
8459     RExC_emit = ptr;
8460     return(ret);
8461 }
8462
8463 /*
8464 - reguni - emit (if appropriate) a Unicode character
8465 */
8466 STATIC STRLEN
8467 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
8468 {
8469     dVAR;
8470     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
8471 }
8472
8473 /*
8474 - reginsert - insert an operator in front of already-emitted operand
8475 *
8476 * Means relocating the operand.
8477 */
8478 STATIC void
8479 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
8480 {
8481     dVAR;
8482     register regnode *src;
8483     register regnode *dst;
8484     register regnode *place;
8485     const int offset = regarglen[(U8)op];
8486     const int size = NODE_STEP_REGNODE + offset;
8487     GET_RE_DEBUG_FLAGS_DECL;
8488     PERL_UNUSED_ARG(depth);
8489 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
8490     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
8491     if (SIZE_ONLY) {
8492         RExC_size += size;
8493         return;
8494     }
8495
8496     src = RExC_emit;
8497     RExC_emit += size;
8498     dst = RExC_emit;
8499     if (RExC_open_parens) {
8500         int paren;
8501         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
8502         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
8503             if ( RExC_open_parens[paren] >= opnd ) {
8504                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
8505                 RExC_open_parens[paren] += size;
8506             } else {
8507                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
8508             }
8509             if ( RExC_close_parens[paren] >= opnd ) {
8510                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
8511                 RExC_close_parens[paren] += size;
8512             } else {
8513                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
8514             }
8515         }
8516     }
8517
8518     while (src > opnd) {
8519         StructCopy(--src, --dst, regnode);
8520 #ifdef RE_TRACK_PATTERN_OFFSETS
8521         if (RExC_offsets) {     /* MJD 20010112 */
8522             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
8523                   "reg_insert",
8524                   __LINE__,
8525                   PL_reg_name[op],
8526                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8527                     ? "Overwriting end of array!\n" : "OK",
8528                   (UV)(src - RExC_emit_start),
8529                   (UV)(dst - RExC_emit_start),
8530                   (UV)RExC_offsets[0]));
8531             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8532             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
8533         }
8534 #endif
8535     }
8536
8537
8538     place = opnd;               /* Op node, where operand used to be. */
8539 #ifdef RE_TRACK_PATTERN_OFFSETS
8540     if (RExC_offsets) {         /* MJD */
8541         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
8542               "reginsert",
8543               __LINE__,
8544               PL_reg_name[op],
8545               (UV)(place - RExC_emit_start) > RExC_offsets[0]
8546               ? "Overwriting end of array!\n" : "OK",
8547               (UV)(place - RExC_emit_start),
8548               (UV)(RExC_parse - RExC_start),
8549               (UV)RExC_offsets[0]));
8550         Set_Node_Offset(place, RExC_parse);
8551         Set_Node_Length(place, 1);
8552     }
8553 #endif
8554     src = NEXTOPER(place);
8555     FILL_ADVANCE_NODE(place, op);
8556     Zero(src, offset, regnode);
8557 }
8558
8559 /*
8560 - regtail - set the next-pointer at the end of a node chain of p to val.
8561 - SEE ALSO: regtail_study
8562 */
8563 /* TODO: All three parms should be const */
8564 STATIC void
8565 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8566 {
8567     dVAR;
8568     register regnode *scan;
8569     GET_RE_DEBUG_FLAGS_DECL;
8570 #ifndef DEBUGGING
8571     PERL_UNUSED_ARG(depth);
8572 #endif
8573
8574     if (SIZE_ONLY)
8575         return;
8576
8577     /* Find last node. */
8578     scan = p;
8579     for (;;) {
8580         regnode * const temp = regnext(scan);
8581         DEBUG_PARSE_r({
8582             SV * const mysv=sv_newmortal();
8583             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8584             regprop(RExC_rx, mysv, scan);
8585             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8586                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8587                     (temp == NULL ? "->" : ""),
8588                     (temp == NULL ? PL_reg_name[OP(val)] : "")
8589             );
8590         });
8591         if (temp == NULL)
8592             break;
8593         scan = temp;
8594     }
8595
8596     if (reg_off_by_arg[OP(scan)]) {
8597         ARG_SET(scan, val - scan);
8598     }
8599     else {
8600         NEXT_OFF(scan) = val - scan;
8601     }
8602 }
8603
8604 #ifdef DEBUGGING
8605 /*
8606 - regtail_study - set the next-pointer at the end of a node chain of p to val.
8607 - Look for optimizable sequences at the same time.
8608 - currently only looks for EXACT chains.
8609
This is experimental code. The idea is to use this routine to perform
8611 in place optimizations on branches and groups as they are constructed,
8612 with the long term intention of removing optimization from study_chunk so
8613 that it is purely analytical.
8614
8615 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8616 to control which is which.
8617
8618 */
8619 /* TODO: All four parms should be const */
8620
8621 STATIC U8
8622 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8623 {
8624     dVAR;
8625     register regnode *scan;
8626     U8 exact = PSEUDO;
8627 #ifdef EXPERIMENTAL_INPLACESCAN
8628     I32 min = 0;
8629 #endif
8630
8631     GET_RE_DEBUG_FLAGS_DECL;
8632
8633
8634     if (SIZE_ONLY)
8635         return exact;
8636
8637     /* Find last node. */
8638
8639     scan = p;
8640     for (;;) {
8641         regnode * const temp = regnext(scan);
8642 #ifdef EXPERIMENTAL_INPLACESCAN
8643         if (PL_regkind[OP(scan)] == EXACT)
8644             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8645                 return EXACT;
8646 #endif
8647         if ( exact ) {
8648             switch (OP(scan)) {
8649                 case EXACT:
8650                 case EXACTF:
8651                 case EXACTFL:
8652                         if( exact == PSEUDO )
8653                             exact= OP(scan);
8654                         else if ( exact != OP(scan) )
8655                             exact= 0;
8656                 case NOTHING:
8657                     break;
8658                 default:
8659                     exact= 0;
8660             }
8661         }
8662         DEBUG_PARSE_r({
8663             SV * const mysv=sv_newmortal();
8664             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8665             regprop(RExC_rx, mysv, scan);
8666             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8667                 SvPV_nolen_const(mysv),
8668                 REG_NODE_NUM(scan),
8669                 PL_reg_name[exact]);
8670         });
8671         if (temp == NULL)
8672             break;
8673         scan = temp;
8674     }
8675     DEBUG_PARSE_r({
8676         SV * const mysv_val=sv_newmortal();
8677         DEBUG_PARSE_MSG("");
8678         regprop(RExC_rx, mysv_val, val);
8679         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8680                       SvPV_nolen_const(mysv_val),
8681                       (IV)REG_NODE_NUM(val),
8682                       (IV)(val - scan)
8683         );
8684     });
8685     if (reg_off_by_arg[OP(scan)]) {
8686         ARG_SET(scan, val - scan);
8687     }
8688     else {
8689         NEXT_OFF(scan) = val - scan;
8690     }
8691
8692     return exact;
8693 }
8694 #endif
8695
8696 /*
8697  - regcurly - a little FSA that accepts {\d+,?\d*}
8698  */
STATIC I32
S_regcurly(register const char *s)
{
    /* The text must open with '{' followed directly by a digit. */
    if (s[0] != '{' || !isDIGIT(s[1]))
        return FALSE;

    /* Skip the brace and the run of digits forming the first number. */
    for (s += 2; isDIGIT(*s); s++)
        ;

    /* An optional comma may be followed by a second, possibly empty,
     * run of digits. */
    if (*s == ',') {
        do {
            s++;
        } while (isDIGIT(*s));
    }

    /* Whatever was consumed must be closed by '}'. */
    return (*s == '}') ? TRUE : FALSE;
}
8716
8717
8718 /*
8719  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8720  */
8721 #ifdef DEBUGGING
8722 void
8723 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) {
8724     int bit;
8725     int set=0;
8726     for (bit=0; bit<32; bit++) {
8727         if (flags & (1<<bit)) {
8728             if (!set++ && lead)
8729                 PerlIO_printf(Perl_debug_log, "%s",lead);
8730             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8731         }
8732     }
8733     if (lead)  {
8734         if (set)
8735             PerlIO_printf(Perl_debug_log, "\n");
8736         else
8737             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8738     }
8739 }
8740 #endif
8741
8742 void
8743 Perl_regdump(pTHX_ const regexp *r)
8744 {
8745 #ifdef DEBUGGING
8746     dVAR;
8747     SV * const sv = sv_newmortal();
8748     SV *dsv= sv_newmortal();
8749     RXi_GET_DECL(r,ri);
8750     GET_RE_DEBUG_FLAGS_DECL;
8751
8752     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8753
8754     /* Header fields of interest. */
8755     if (r->anchored_substr) {
8756         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8757             RE_SV_DUMPLEN(r->anchored_substr), 30);
8758         PerlIO_printf(Perl_debug_log,
8759                       "anchored %s%s at %"IVdf" ",
8760                       s, RE_SV_TAIL(r->anchored_substr),
8761                       (IV)r->anchored_offset);
8762     } else if (r->anchored_utf8) {
8763         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8764             RE_SV_DUMPLEN(r->anchored_utf8), 30);
8765         PerlIO_printf(Perl_debug_log,
8766                       "anchored utf8 %s%s at %"IVdf" ",
8767                       s, RE_SV_TAIL(r->anchored_utf8),
8768                       (IV)r->anchored_offset);
8769     }
8770     if (r->float_substr) {
8771         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8772             RE_SV_DUMPLEN(r->float_substr), 30);
8773         PerlIO_printf(Perl_debug_log,
8774                       "floating %s%s at %"IVdf"..%"UVuf" ",
8775                       s, RE_SV_TAIL(r->float_substr),
8776                       (IV)r->float_min_offset, (UV)r->float_max_offset);
8777     } else if (r->float_utf8) {
8778         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8779             RE_SV_DUMPLEN(r->float_utf8), 30);
8780         PerlIO_printf(Perl_debug_log,
8781                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8782                       s, RE_SV_TAIL(r->float_utf8),
8783                       (IV)r->float_min_offset, (UV)r->float_max_offset);
8784     }
8785     if (r->check_substr || r->check_utf8)
8786         PerlIO_printf(Perl_debug_log,
8787                       (const char *)
8788                       (r->check_substr == r->float_substr
8789                        && r->check_utf8 == r->float_utf8
8790                        ? "(checking floating" : "(checking anchored"));
8791     if (r->extflags & RXf_NOSCAN)
8792         PerlIO_printf(Perl_debug_log, " noscan");
8793     if (r->extflags & RXf_CHECK_ALL)
8794         PerlIO_printf(Perl_debug_log, " isall");
8795     if (r->check_substr || r->check_utf8)
8796         PerlIO_printf(Perl_debug_log, ") ");
8797
8798     if (ri->regstclass) {
8799         regprop(r, sv, ri->regstclass);
8800         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8801     }
8802     if (r->extflags & RXf_ANCH) {
8803         PerlIO_printf(Perl_debug_log, "anchored");
8804         if (r->extflags & RXf_ANCH_BOL)
8805             PerlIO_printf(Perl_debug_log, "(BOL)");
8806         if (r->extflags & RXf_ANCH_MBOL)
8807             PerlIO_printf(Perl_debug_log, "(MBOL)");
8808         if (r->extflags & RXf_ANCH_SBOL)
8809             PerlIO_printf(Perl_debug_log, "(SBOL)");
8810         if (r->extflags & RXf_ANCH_GPOS)
8811             PerlIO_printf(Perl_debug_log, "(GPOS)");
8812         PerlIO_putc(Perl_debug_log, ' ');
8813     }
8814     if (r->extflags & RXf_GPOS_SEEN)
8815         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8816     if (r->intflags & PREGf_SKIP)
8817         PerlIO_printf(Perl_debug_log, "plus ");
8818     if (r->intflags & PREGf_IMPLICIT)
8819         PerlIO_printf(Perl_debug_log, "implicit ");
8820     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8821     if (r->extflags & RXf_EVAL_SEEN)
8822         PerlIO_printf(Perl_debug_log, "with eval ");
8823     PerlIO_printf(Perl_debug_log, "\n");
8824     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
8825 #else
8826     PERL_UNUSED_CONTEXT;
8827     PERL_UNUSED_ARG(r);
8828 #endif  /* DEBUGGING */
8829 }
8830
8831 /*
8832 - regprop - printable representation of opcode
8833 */
8834 void
8835 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8836 {
8837 #ifdef DEBUGGING
8838     dVAR;
8839     register int k;
8840     RXi_GET_DECL(prog,progi);
8841     GET_RE_DEBUG_FLAGS_DECL;
8842
8843
8844     sv_setpvn(sv, "", 0);
8845
8846     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
8847         /* It would be nice to FAIL() here, but this may be called from
8848            regexec.c, and it would be hard to supply pRExC_state. */
8849         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8850     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
8851
8852     k = PL_regkind[OP(o)];
8853
8854     if (k == EXACT) {
8855         sv_catpvs(sv, " ");
8856         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8857          * is a crude hack but it may be the best for now since
8858          * we have no flag "this EXACTish node was UTF-8"
8859          * --jhi */
8860         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
8861                   PERL_PV_ESCAPE_UNI_DETECT |
8862                   PERL_PV_PRETTY_ELLIPSES   |
8863                   PERL_PV_PRETTY_LTGT       |
8864                   PERL_PV_PRETTY_NOCLEAR
8865                   );
8866     } else if (k == TRIE) {
8867         /* print the details of the trie in dumpuntil instead, as
8868          * progi->data isn't available here */
8869         const char op = OP(o);
8870         const U32 n = ARG(o);
8871         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8872                (reg_ac_data *)progi->data->data[n] :
8873                NULL;
8874         const reg_trie_data * const trie
8875             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8876
8877         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
8878         DEBUG_TRIE_COMPILE_r(
8879             Perl_sv_catpvf(aTHX_ sv,
8880                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8881                 (UV)trie->startstate,
8882                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8883                 (UV)trie->wordcount,
8884                 (UV)trie->minlen,
8885                 (UV)trie->maxlen,
8886                 (UV)TRIE_CHARCOUNT(trie),
8887                 (UV)trie->uniquecharcount
8888             )
8889         );
8890         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8891             int i;
8892             int rangestart = -1;
8893             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8894             sv_catpvs(sv, "[");
8895             for (i = 0; i <= 256; i++) {
8896                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8897                     if (rangestart == -1)
8898                         rangestart = i;
8899                 } else if (rangestart != -1) {
8900                     if (i <= rangestart + 3)
8901                         for (; rangestart < i; rangestart++)
8902                             put_byte(sv, rangestart);
8903                     else {
8904                         put_byte(sv, rangestart);
8905                         sv_catpvs(sv, "-");
8906                         put_byte(sv, i - 1);
8907                     }
8908                     rangestart = -1;
8909                 }
8910             }
8911             sv_catpvs(sv, "]");
8912         }
8913
8914     } else if (k == CURLY) {
8915         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8916             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8917         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8918     }
8919     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
8920         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8921     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
8922         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
8923         if ( prog->paren_names ) {
8924             if ( k != REF || OP(o) < NREF) {
8925                 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8926                 SV **name= av_fetch(list, ARG(o), 0 );
8927                 if (name)
8928                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8929             }
8930             else {
8931                 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8932                 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8933                 I32 *nums=(I32*)SvPVX(sv_dat);
8934                 SV **name= av_fetch(list, nums[0], 0 );
8935                 I32 n;
8936                 if (name) {
8937                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
8938                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8939                                     (n ? "," : ""), (IV)nums[n]);
8940                     }
8941                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8942                 }
8943             }
8944         }
8945     } else if (k == GOSUB)
8946         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8947     else if (k == VERB) {
8948         if (!o->flags)
8949             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8950                 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
8951     } else if (k == LOGICAL)
8952         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
8953     else if (k == FOLDCHAR)
8954         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
8955     else if (k == ANYOF) {
8956         int i, rangestart = -1;
8957         const U8 flags = ANYOF_FLAGS(o);
8958
8959         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8960         static const char * const anyofs[] = {
8961             "\\w",
8962             "\\W",
8963             "\\s",
8964             "\\S",
8965             "\\d",
8966             "\\D",
8967             "[:alnum:]",
8968             "[:^alnum:]",
8969             "[:alpha:]",
8970             "[:^alpha:]",
8971             "[:ascii:]",
8972             "[:^ascii:]",
8973             "[:ctrl:]",
8974             "[:^ctrl:]",
8975             "[:graph:]",
8976             "[:^graph:]",
8977             "[:lower:]",
8978             "[:^lower:]",
8979             "[:print:]",
8980             "[:^print:]",
8981             "[:punct:]",
8982             "[:^punct:]",
8983             "[:upper:]",
8984             "[:^upper:]",
8985             "[:xdigit:]",
8986             "[:^xdigit:]",
8987             "[:space:]",
8988             "[:^space:]",
8989             "[:blank:]",
8990             "[:^blank:]"
8991         };
8992
8993         if (flags & ANYOF_LOCALE)
8994             sv_catpvs(sv, "{loc}");
8995         if (flags & ANYOF_FOLD)
8996             sv_catpvs(sv, "{i}");
8997         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8998         if (flags & ANYOF_INVERT)
8999             sv_catpvs(sv, "^");
9000         for (i = 0; i <= 256; i++) {
9001             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9002                 if (rangestart == -1)
9003                     rangestart = i;
9004             } else if (rangestart != -1) {
9005                 if (i <= rangestart + 3)
9006                     for (; rangestart < i; rangestart++)
9007                         put_byte(sv, rangestart);
9008                 else {
9009                     put_byte(sv, rangestart);
9010                     sv_catpvs(sv, "-");
9011                     put_byte(sv, i - 1);
9012                 }
9013                 rangestart = -1;
9014             }
9015         }
9016
9017         if (o->flags & ANYOF_CLASS)
9018             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9019                 if (ANYOF_CLASS_TEST(o,i))
9020                     sv_catpv(sv, anyofs[i]);
9021
9022         if (flags & ANYOF_UNICODE)
9023             sv_catpvs(sv, "{unicode}");
9024         else if (flags & ANYOF_UNICODE_ALL)
9025             sv_catpvs(sv, "{unicode_all}");
9026
9027         {
9028             SV *lv;
9029             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9030
9031             if (lv) {
9032                 if (sw) {
9033                     U8 s[UTF8_MAXBYTES_CASE+1];
9034
9035                     for (i = 0; i <= 256; i++) { /* just the first 256 */
9036                         uvchr_to_utf8(s, i);
9037
9038                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
9039                             if (rangestart == -1)
9040                                 rangestart = i;
9041                         } else if (rangestart != -1) {
9042                             if (i <= rangestart + 3)
9043                                 for (; rangestart < i; rangestart++) {
9044                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
9045                                     U8 *p;
9046                                     for(p = s; p < e; p++)
9047                                         put_byte(sv, *p);
9048                                 }
9049                             else {
9050                                 const U8 *e = uvchr_to_utf8(s,rangestart);
9051                                 U8 *p;
9052                                 for (p = s; p < e; p++)
9053                                     put_byte(sv, *p);
9054                                 sv_catpvs(sv, "-");
9055                                 e = uvchr_to_utf8(s, i-1);
9056                                 for (p = s; p < e; p++)
9057                                     put_byte(sv, *p);
9058                                 }
9059                                 rangestart = -1;
9060                             }
9061                         }
9062
9063                     sv_catpvs(sv, "..."); /* et cetera */
9064                 }
9065
9066                 {
9067                     char *s = savesvpv(lv);
9068                     char * const origs = s;
9069
9070                     while (*s && *s != '\n')
9071                         s++;
9072
9073                     if (*s == '\n') {
9074                         const char * const t = ++s;
9075
9076                         while (*s) {
9077                             if (*s == '\n')
9078                                 *s = ' ';
9079                             s++;
9080                         }
9081                         if (s[-1] == ' ')
9082                             s[-1] = 0;
9083
9084                         sv_catpv(sv, t);
9085                     }
9086
9087                     Safefree(origs);
9088                 }
9089             }
9090         }
9091
9092         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9093     }
9094     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9095         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9096 #else
9097     PERL_UNUSED_CONTEXT;
9098     PERL_UNUSED_ARG(sv);
9099     PERL_UNUSED_ARG(o);
9100     PERL_UNUSED_ARG(prog);
9101 #endif  /* DEBUGGING */
9102 }
9103
9104 SV *
9105 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9106 {                               /* Assume that RE_INTUIT is set */
9107     dVAR;
9108     struct regexp *const prog = (struct regexp *)SvANY(r);
9109     GET_RE_DEBUG_FLAGS_DECL;
9110     PERL_UNUSED_CONTEXT;
9111
9112     DEBUG_COMPILE_r(
9113         {
9114             const char * const s = SvPV_nolen_const(prog->check_substr
9115                       ? prog->check_substr : prog->check_utf8);
9116
9117             if (!PL_colorset) reginitcolors();
9118             PerlIO_printf(Perl_debug_log,
9119                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9120                       PL_colors[4],
9121                       prog->check_substr ? "" : "utf8 ",
9122                       PL_colors[5],PL_colors[0],
9123                       s,
9124                       PL_colors[1],
9125                       (strlen(s) > 60 ? "..." : ""));
9126         } );
9127
9128     return prog->check_substr ? prog->check_substr : prog->check_utf8;
9129 }
9130
9131 /*
9132    pregfree()
9133
9134    handles refcounting and freeing the perl core regexp structure. When
9135    it is necessary to actually free the structure the first thing it
9136    does is call the 'free' method of the regexp_engine associated with
9137    the regexp, allowing the handling of the void *pprivate; member
9138    first. (This routine is not overridable by extensions, which is why
9139    the extensions free is called first.)
9140
9141    See regdupe and regdupe_internal if you change anything here.
9142 */
9143 #ifndef PERL_IN_XSUB_RE
9144 void
9145 Perl_pregfree(pTHX_ REGEXP *r)
9146 {
9147     SvREFCNT_dec(r);
9148 }
9149
9150 void
9151 Perl_pregfree2(pTHX_ REGEXP *rx)
9152 {
9153     dVAR;
9154     struct regexp *const r = (struct regexp *)SvANY(rx);
9155     GET_RE_DEBUG_FLAGS_DECL;
9156
9157     if (r->mother_re) {
9158         ReREFCNT_dec(r->mother_re);
9159     } else {
9160         CALLREGFREE_PVT(rx); /* free the private data */
9161         if (r->paren_names)
9162             SvREFCNT_dec(r->paren_names);
9163     }
9164     if (r->substrs) {
9165         if (r->anchored_substr)
9166             SvREFCNT_dec(r->anchored_substr);
9167         if (r->anchored_utf8)
9168             SvREFCNT_dec(r->anchored_utf8);
9169         if (r->float_substr)
9170             SvREFCNT_dec(r->float_substr);
9171         if (r->float_utf8)
9172             SvREFCNT_dec(r->float_utf8);
9173         Safefree(r->substrs);
9174     }
9175     RX_MATCH_COPY_FREE(rx);
9176 #ifdef PERL_OLD_COPY_ON_WRITE
9177     if (r->saved_copy)
9178         SvREFCNT_dec(r->saved_copy);
9179 #endif
9180     Safefree(r->swap);
9181     Safefree(r->offs);
9182 }
9183
9184 /*  reg_temp_copy()
9185
9186     This is a hacky workaround to the structural issue of match results
9187     being stored in the regexp structure which is in turn stored in
9188     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9189     could be PL_curpm in multiple contexts, and could require multiple
9190     result sets being associated with the pattern simultaneously, such
9191     as when doing a recursive match with (??{$qr})
9192
9193     The solution is to make a lightweight copy of the regexp structure
9194     when a qr// is returned from the code executed by (??{$qr}) this
9195     lightweight copy doesn't actually own any of its data except for
9196     the start/end offsets and the actual regexp structure itself.
9197
9198 */
9199
9200
9201 REGEXP *
9202 Perl_reg_temp_copy (pTHX_ REGEXP *rx) {
9203     REGEXP *ret_x = newSV_type(SVt_REGEXP);
9204     struct regexp *ret = (struct regexp *)SvANY(ret_x);
9205     struct regexp *const r = (struct regexp *)SvANY(rx);
9206     register const I32 npar = r->nparens+1;
9207     (void)ReREFCNT_inc(rx);
9208     /* FIXME ORANGE (once we start actually using the regular SV fields.) */
9209     /* We can take advantage of the existing "copied buffer" mechanism in SVs
9210        by pointing directly at the buffer, but flagging that the allocated
9211        space in the copy is zero. As we've just done a struct copy, it's now
9212        a case of zero-ing that, rather than copying the current length.  */
9213     SvPV_set(ret_x, RX_WRAPPED(rx));
9214     StructCopy(r, ret, regexp);
9215     SvLEN_set(ret_x, 0);
9216     Newx(ret->offs, npar, regexp_paren_pair);
9217     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9218     if (r->substrs) {
9219         Newx(ret->substrs, 1, struct reg_substr_data);
9220         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9221
9222         SvREFCNT_inc_void(ret->anchored_substr);
9223         SvREFCNT_inc_void(ret->anchored_utf8);
9224         SvREFCNT_inc_void(ret->float_substr);
9225         SvREFCNT_inc_void(ret->float_utf8);
9226
9227         /* check_substr and check_utf8, if non-NULL, point to either their
9228            anchored or float namesakes, and don't hold a second reference.  */
9229     }
9230     RX_MATCH_COPIED_off(ret_x);
9231 #ifdef PERL_OLD_COPY_ON_WRITE
9232     ret->saved_copy = NULL;
9233 #endif
9234     ret->mother_re = rx;
9235     ret->swap = NULL;
9236
9237     return ret_x;
9238 }
9239 #endif
9240
9241 /* regfree_internal()
9242
9243    Free the private data in a regexp. This is overloadable by
9244    extensions. Perl takes care of the regexp structure in pregfree(),
9245    this covers the *pprivate pointer which technically perl doesn't
9246    know about, however of course we have to handle the
9247    regexp_internal structure when no extension is in use.
9248
9249    Note this is called before freeing anything in the regexp
9250    structure.
9251  */
9252
9253 void
9254 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9255 {
9256     dVAR;
9257     struct regexp *const r = (struct regexp *)SvANY(rx);
9258     RXi_GET_DECL(r,ri);
9259     GET_RE_DEBUG_FLAGS_DECL;
9260
9261     DEBUG_COMPILE_r({
9262         if (!PL_colorset)
9263             reginitcolors();
9264         {
9265             SV *dsv= sv_newmortal();
9266             RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
9267                 dsv, RX_PRECOMP(rx), RXp_PRELEN(r), 60);
9268             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9269                 PL_colors[4],PL_colors[5],s);
9270         }
9271     });
9272 #ifdef RE_TRACK_PATTERN_OFFSETS
9273     if (ri->u.offsets)
9274         Safefree(ri->u.offsets);             /* 20010421 MJD */
9275 #endif
9276     if (ri->data) {
9277         int n = ri->data->count;
9278         PAD* new_comppad = NULL;
9279         PAD* old_comppad;
9280         PADOFFSET refcnt;
9281
9282         while (--n >= 0) {
9283           /* If you add a ->what type here, update the comment in regcomp.h */
9284             switch (ri->data->what[n]) {
9285             case 's':
9286             case 'S':
9287             case 'u':
9288                 SvREFCNT_dec((SV*)ri->data->data[n]);
9289                 break;
9290             case 'f':
9291                 Safefree(ri->data->data[n]);
9292                 break;
9293             case 'p':
9294                 new_comppad = (AV*)ri->data->data[n];
9295                 break;
9296             case 'o':
9297                 if (new_comppad == NULL)
9298                     Perl_croak(aTHX_ "panic: pregfree comppad");
9299                 PAD_SAVE_LOCAL(old_comppad,
9300                     /* Watch out for global destruction's random ordering. */
9301                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9302                 );
9303                 OP_REFCNT_LOCK;
9304                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
9305                 OP_REFCNT_UNLOCK;
9306                 if (!refcnt)
9307                     op_free((OP_4tree*)ri->data->data[n]);
9308
9309                 PAD_RESTORE_LOCAL(old_comppad);
9310                 SvREFCNT_dec((SV*)new_comppad);
9311                 new_comppad = NULL;
9312                 break;
9313             case 'n':
9314                 break;
9315             case 'T':
9316                 { /* Aho Corasick add-on structure for a trie node.
9317                      Used in stclass optimization only */
9318                     U32 refcount;
9319                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
9320                     OP_REFCNT_LOCK;
9321                     refcount = --aho->refcount;
9322                     OP_REFCNT_UNLOCK;
9323                     if ( !refcount ) {
9324                         PerlMemShared_free(aho->states);
9325                         PerlMemShared_free(aho->fail);
9326                          /* do this last!!!! */
9327                         PerlMemShared_free(ri->data->data[n]);
9328                         PerlMemShared_free(ri->regstclass);
9329                     }
9330                 }
9331                 break;
9332             case 't':
9333                 {
9334                     /* trie structure. */
9335                     U32 refcount;
9336                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
9337                     OP_REFCNT_LOCK;
9338                     refcount = --trie->refcount;
9339                     OP_REFCNT_UNLOCK;
9340                     if ( !refcount ) {
9341                         PerlMemShared_free(trie->charmap);
9342                         PerlMemShared_free(trie->states);
9343                         PerlMemShared_free(trie->trans);
9344                         if (trie->bitmap)
9345                             PerlMemShared_free(trie->bitmap);
9346                         if (trie->wordlen)
9347                             PerlMemShared_free(trie->wordlen);
9348                         if (trie->jump)
9349                             PerlMemShared_free(trie->jump);
9350                         if (trie->nextword)
9351                             PerlMemShared_free(trie->nextword);
9352                         /* do this last!!!! */
9353                         PerlMemShared_free(ri->data->data[n]);
9354                     }
9355                 }
9356                 break;
9357             default:
9358                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
9359             }
9360         }
9361         Safefree(ri->data->what);
9362         Safefree(ri->data);
9363     }
9364
9365     Safefree(ri);
9366 }
9367
/* Helpers for the duplication code below.  The *_dup_inc() macros
 * duplicate an SV (or an AV/HV treated as an SV) with sv_dup() and take a
 * reference on the result; SAVEPVN() copies n bytes of p with savepvn(),
 * passing a NULL p through unchanged.  Arguments and cast results are
 * parenthesized so that the macros expand safely whatever expression they
 * are given or embedded in. */
#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup((s),(t)))
#define av_dup_inc(s,t) ((AV*)SvREFCNT_inc(sv_dup((SV*)(s),(t))))
#define hv_dup_inc(s,t) ((HV*)SvREFCNT_inc(sv_dup((SV*)(s),(t))))
#define SAVEPVN(p,n)    ((p) ? savepvn((p),(n)) : NULL)
9372
9373 /*
9374    re_dup - duplicate a regexp.
9375
9376    This routine is expected to clone a given regexp structure. It is only
9377    compiled under USE_ITHREADS.
9378
9379    After all of the core data stored in struct regexp is duplicated
9380    the regexp_engine.dupe method is used to copy any private data
9381    stored in the *pprivate pointer. This allows extensions to handle
9382    any duplication it needs to do.
9383
9384    See pregfree() and regfree_internal() if you change anything here.
9385 */
9386 #if defined(USE_ITHREADS)
9387 #ifndef PERL_IN_XSUB_RE
9388 void
9389 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
9390 {
9391     dVAR;
9392     I32 npar;
9393     const struct regexp *r = (const struct regexp *)SvANY(sstr);
9394     struct regexp *ret = (struct regexp *)SvANY(dstr);
9395
9396     npar = r->nparens+1;
9397     Newx(ret->offs, npar, regexp_paren_pair);
9398     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9399     if(ret->swap) {
9400         /* no need to copy these */
9401         Newx(ret->swap, npar, regexp_paren_pair);
9402     }
9403
9404     if (ret->substrs) {
9405         /* Do it this way to avoid reading from *r after the StructCopy().
9406            That way, if any of the sv_dup_inc()s dislodge *r from the L1
9407            cache, it doesn't matter.  */
9408         const bool anchored = r->check_substr == r->anchored_substr;
9409         Newx(ret->substrs, 1, struct reg_substr_data);
9410         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9411
9412         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9413         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9414         ret->float_substr = sv_dup_inc(ret->float_substr, param);
9415         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
9416
9417         /* check_substr and check_utf8, if non-NULL, point to either their
9418            anchored or float namesakes, and don't hold a second reference.  */
9419
9420         if (ret->check_substr) {
9421             if (anchored) {
9422                 assert(r->check_utf8 == r->anchored_utf8);
9423                 ret->check_substr = ret->anchored_substr;
9424                 ret->check_utf8 = ret->anchored_utf8;
9425             } else {
9426                 assert(r->check_substr == r->float_substr);
9427                 assert(r->check_utf8 == r->float_utf8);
9428                 ret->check_substr = ret->float_substr;
9429                 ret->check_utf8 = ret->float_utf8;
9430             }
9431         }
9432     }
9433
9434     ret->paren_names    = hv_dup_inc(ret->paren_names, param);
9435
9436     if (ret->pprivate)
9437         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
9438
9439     if (RX_MATCH_COPIED(dstr))
9440         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
9441     else
9442         ret->subbeg = NULL;
9443 #ifdef PERL_OLD_COPY_ON_WRITE
9444     ret->saved_copy = NULL;
9445 #endif
9446
9447     ret->mother_re      = NULL;
9448     ret->gofs = 0;
9449     ret->seen_evals = 0;
9450 }
9451 #endif /* PERL_IN_XSUB_RE */
9452
9453 /*
9454    regdupe_internal()
9455
9456    This is the internal complement to regdupe() which is used to copy
9457    the structure pointed to by the *pprivate pointer in the regexp.
9458    This is the core version of the extension overridable cloning hook.
9459    The regexp structure being duplicated will be copied by perl prior
9460    to this and will be provided as the regexp *r argument, however
9461    with the /old/ structures pprivate pointer value. Thus this routine
9462    may override any copying normally done by perl.
9463
9464    It returns a pointer to the new regexp_internal structure.
9465 */
9466
9467 void *
9468 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
9469 {
9470     dVAR;
9471     struct regexp *const r = (struct regexp *)SvANY(rx);
9472     regexp_internal *reti;
9473     int len, npar;
9474     RXi_GET_DECL(r,ri);
9475
9476     npar = r->nparens+1;
9477     len = ProgLen(ri);
9478
9479     Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
9480     Copy(ri->program, reti->program, len+1, regnode);
9481
9482
9483     reti->regstclass = NULL;
9484
9485     if (ri->data) {
9486         struct reg_data *d;
9487         const int count = ri->data->count;
9488         int i;
9489
9490         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9491                 char, struct reg_data);
9492         Newx(d->what, count, U8);
9493
9494         d->count = count;
9495         for (i = 0; i < count; i++) {
9496             d->what[i] = ri->data->what[i];
9497             switch (d->what[i]) {
9498                 /* legal options are one of: sSfpontTu
9499                    see also regcomp.h and pregfree() */
9500             case 's':
9501             case 'S':
9502             case 'p': /* actually an AV, but the dup function is identical.  */
9503             case 'u': /* actually an HV, but the dup function is identical.  */
9504                 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
9505                 break;
9506             case 'f':
9507                 /* This is cheating. */
9508                 Newx(d->data[i], 1, struct regnode_charclass_class);
9509                 StructCopy(ri->data->data[i], d->data[i],
9510                             struct regnode_charclass_class);
9511                 reti->regstclass = (regnode*)d->data[i];
9512                 break;
9513             case 'o':
9514                 /* Compiled op trees are readonly and in shared memory,
9515                    and can thus be shared without duplication. */
9516                 OP_REFCNT_LOCK;
9517                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
9518                 OP_REFCNT_UNLOCK;
9519                 break;
9520             case 'T':
9521                 /* Trie stclasses are readonly and can thus be shared
9522                  * without duplication. We free the stclass in pregfree
9523                  * when the corresponding reg_ac_data struct is freed.
9524                  */
9525                 reti->regstclass= ri->regstclass;
9526                 /* Fall through */
9527             case 't':
9528                 OP_REFCNT_LOCK;
9529                 ((reg_trie_data*)ri->data->data[i])->refcount++;
9530                 OP_REFCNT_UNLOCK;
9531                 /* Fall through */
9532             case 'n':
9533                 d->data[i] = ri->data->data[i];
9534                 break;
9535             default:
9536                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
9537             }
9538         }
9539
9540         reti->data = d;
9541     }
9542     else
9543         reti->data = NULL;
9544
9545     reti->name_list_idx = ri->name_list_idx;
9546
9547 #ifdef RE_TRACK_PATTERN_OFFSETS
9548     if (ri->u.offsets) {
9549         Newx(reti->u.offsets, 2*len+1, U32);
9550         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9551     }
9552 #else
9553     SetProgLen(reti,len);
9554 #endif
9555
9556     return (void*)reti;
9557 }
9558
9559 #endif    /* USE_ITHREADS */
9560
/*
   reg_stringify()

   Returns the stringified form of a regexp embedded in a MAGIC struct.
   No conversion is performed here: the function returns the wrapped
   pattern string that is already stored with the regexp (RX_WRAPPED).

   If lp is non-NULL then it is used to return the length of the
   resulting string.

   If flags is non-NULL and the returned string contains UTF8 then
   (*flags & 1) will be true.

   If haseval is non-NULL then it is used to return whether the pattern
   contains evals.

   Normally called via macro:

        CALLREG_STRINGIFY(mg,&len,&utf8);

   And internally with

        CALLREG_AS_STR(mg,&lp,&flags,&haseval)

   See sv_2pv_flags() in sv.c for an example of internal usage.

 */
9588 #ifndef PERL_IN_XSUB_RE
9589
9590 char *
9591 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9592     dVAR;
9593     const REGEXP * const re = (REGEXP *)mg->mg_obj;
9594     if (haseval)
9595         *haseval = RX_SEEN_EVALS(re);
9596     if (flags)
9597         *flags = ((RX_EXTFLAGS(re) & RXf_UTF8) ? 1 : 0);
9598     if (lp)
9599         *lp = RX_WRAPLEN(re);
9600     return RX_WRAPPED(re);
9601 }
9602
9603 /*
9604  - regnext - dig the "next" pointer out of a node
9605  */
9606 regnode *
9607 Perl_regnext(pTHX_ register regnode *p)
9608 {
9609     dVAR;
9610     register I32 offset;
9611
9612     if (!p)
9613         return(NULL);
9614
9615     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9616     if (offset == 0)
9617         return(NULL);
9618
9619     return(p+offset);
9620 }
9621 #endif
9622
9623 STATIC void
9624 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
9625 {
9626     va_list args;
9627     STRLEN l1 = strlen(pat1);
9628     STRLEN l2 = strlen(pat2);
9629     char buf[512];
9630     SV *msv;
9631     const char *message;
9632
9633     if (l1 > 510)
9634         l1 = 510;
9635     if (l1 + l2 > 510)
9636         l2 = 510 - l1;
9637     Copy(pat1, buf, l1 , char);
9638     Copy(pat2, buf + l1, l2 , char);
9639     buf[l1 + l2] = '\n';
9640     buf[l1 + l2 + 1] = '\0';
9641 #ifdef I_STDARG
9642     /* ANSI variant takes additional second argument */
9643     va_start(args, pat2);
9644 #else
9645     va_start(args);
9646 #endif
9647     msv = vmess(buf, &args);
9648     va_end(args);
9649     message = SvPV_const(msv,l1);
9650     if (l1 > 512)
9651         l1 = 512;
9652     Copy(message, buf, l1 , char);
9653     buf[l1-1] = '\0';                   /* Overwrite \n */
9654     Perl_croak(aTHX_ "%s", buf);
9655 }
9656
9657 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
9658
9659 #ifndef PERL_IN_XSUB_RE
9660 void
9661 Perl_save_re_context(pTHX)
9662 {
9663     dVAR;
9664
9665     struct re_save_state *state;
9666
9667     SAVEVPTR(PL_curcop);
9668     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9669
9670     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9671     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9672     SSPUSHINT(SAVEt_RE_STATE);
9673
9674     Copy(&PL_reg_state, state, 1, struct re_save_state);
9675
9676     PL_reg_start_tmp = 0;
9677     PL_reg_start_tmpl = 0;
9678     PL_reg_oldsaved = NULL;
9679     PL_reg_oldsavedlen = 0;
9680     PL_reg_maxiter = 0;
9681     PL_reg_leftiter = 0;
9682     PL_reg_poscache = NULL;
9683     PL_reg_poscache_size = 0;
9684 #ifdef PERL_OLD_COPY_ON_WRITE
9685     PL_nrs = NULL;
9686 #endif
9687
9688     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9689     if (PL_curpm) {
9690         const REGEXP * const rx = PM_GETRE(PL_curpm);
9691         if (rx) {
9692             U32 i;
9693             for (i = 1; i <= RX_NPARENS(rx); i++) {
9694                 char digits[TYPE_CHARS(long)];
9695                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9696                 GV *const *const gvp
9697                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9698
9699                 if (gvp) {
9700                     GV * const gv = *gvp;
9701                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9702                         save_scalar(gv);
9703                 }
9704             }
9705         }
9706     }
9707 }
9708 #endif
9709
9710 static void
9711 clear_re(pTHX_ void *r)
9712 {
9713     dVAR;
9714     ReREFCNT_dec((REGEXP *)r);
9715 }
9716
9717 #ifdef DEBUGGING
9718
9719 STATIC void
9720 S_put_byte(pTHX_ SV *sv, int c)
9721 {
9722     /* Our definition of isPRINT() ignores locales, so only bytes that are
9723        not part of UTF-8 are considered printable. I assume that the same
9724        holds for UTF-EBCDIC.
9725        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9726        which Wikipedia says:
9727
9728        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9729        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9730        identical, to the ASCII delete (DEL) or rubout control character.
9731        ) So the old condition can be simplified to !isPRINT(c)  */
9732     if (!isPRINT(c))
9733         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9734     else {
9735         const char string = c;
9736         if (c == '-' || c == ']' || c == '\\' || c == '^')
9737             sv_catpvs(sv, "\\");
9738         sv_catpvn(sv, &string, 1);
9739     }
9740 }
9741
9742
/* Helper macros for the node-dumping loop.  Both expect the locals `node'
 * and `optstart' to be in scope; DUMPUNTIL additionally expects r, start,
 * last, sv, indent and depth.
 *
 * Each macro expands to exactly one statement (terminated by the caller's
 * semicolon), so it is safe as the sole body of an unbraced if/else.
 */

/* If a run of optimised-away nodes is being tracked, report its length
 * (when optimise debugging is enabled) and stop tracking it. */
#define CLEAR_OPTSTART \
    STMT_START { \
        if (optstart) { \
            DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
            optstart=NULL; \
        } \
    } STMT_END

/* Close any pending optimised run, then dump the nodes in [b, e) one
 * level deeper and continue from wherever that dump stopped. */
#define DUMPUNTIL(b,e) \
    STMT_START { \
        CLEAR_OPTSTART; \
        node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); \
    } STMT_END
9750
9751 STATIC const regnode *
9752 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9753             const regnode *last, const regnode *plast,
9754             SV* sv, I32 indent, U32 depth)
9755 {
9756     dVAR;
9757     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
9758     register const regnode *next;
9759     const regnode *optstart= NULL;
9760
9761     RXi_GET_DECL(r,ri);
9762     GET_RE_DEBUG_FLAGS_DECL;
9763
9764 #ifdef DEBUG_DUMPUNTIL
9765     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9766         last ? last-start : 0,plast ? plast-start : 0);
9767 #endif
9768
9769     if (plast && plast < last)
9770         last= plast;
9771
9772     while (PL_regkind[op] != END && (!last || node < last)) {
9773         /* While that wasn't END last time... */
9774         NODE_ALIGN(node);
9775         op = OP(node);
9776         if (op == CLOSE || op == WHILEM)
9777             indent--;
9778         next = regnext((regnode *)node);
9779
9780         /* Where, what. */
9781         if (OP(node) == OPTIMIZED) {
9782             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9783                 optstart = node;
9784             else
9785                 goto after_print;
9786         } else
9787             CLEAR_OPTSTART;
9788
9789         regprop(r, sv, node);
9790         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9791                       (int)(2*indent + 1), "", SvPVX_const(sv));
9792
9793         if (OP(node) != OPTIMIZED) {
9794             if (next == NULL)           /* Next ptr. */
9795                 PerlIO_printf(Perl_debug_log, " (0)");
9796             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9797                 PerlIO_printf(Perl_debug_log, " (FAIL)");
9798             else
9799                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9800             (void)PerlIO_putc(Perl_debug_log, '\n');
9801         }
9802
9803       after_print:
9804         if (PL_regkind[(U8)op] == BRANCHJ) {
9805             assert(next);
9806             {
9807                 register const regnode *nnode = (OP(next) == LONGJMP
9808                                              ? regnext((regnode *)next)
9809                                              : next);
9810                 if (last && nnode > last)
9811                     nnode = last;
9812                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9813             }
9814         }
9815         else if (PL_regkind[(U8)op] == BRANCH) {
9816             assert(next);
9817             DUMPUNTIL(NEXTOPER(node), next);
9818         }
9819         else if ( PL_regkind[(U8)op]  == TRIE ) {
9820             const regnode *this_trie = node;
9821             const char op = OP(node);
9822             const U32 n = ARG(node);
9823             const reg_ac_data * const ac = op>=AHOCORASICK ?
9824                (reg_ac_data *)ri->data->data[n] :
9825                NULL;
9826             const reg_trie_data * const trie =
9827                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9828 #ifdef DEBUGGING
9829             AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9830 #endif
9831             const regnode *nextbranch= NULL;
9832             I32 word_idx;
9833             sv_setpvn(sv, "", 0);
9834             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9835                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9836
9837                 PerlIO_printf(Perl_debug_log, "%*s%s ",
9838                    (int)(2*(indent+3)), "",
9839                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9840                             PL_colors[0], PL_colors[1],
9841                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9842                             PERL_PV_PRETTY_ELLIPSES    |
9843                             PERL_PV_PRETTY_LTGT
9844                             )
9845                             : "???"
9846                 );
9847                 if (trie->jump) {
9848                     U16 dist= trie->jump[word_idx+1];
9849                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9850                                   (UV)((dist ? this_trie + dist : next) - start));
9851                     if (dist) {
9852                         if (!nextbranch)
9853                             nextbranch= this_trie + trie->jump[0];
9854                         DUMPUNTIL(this_trie + dist, nextbranch);
9855                     }
9856                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9857                         nextbranch= regnext((regnode *)nextbranch);
9858                 } else {
9859                     PerlIO_printf(Perl_debug_log, "\n");
9860                 }
9861             }
9862             if (last && next > last)
9863                 node= last;
9864             else
9865                 node= next;
9866         }
9867         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
9868             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9869                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9870         }
9871         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9872             assert(next);
9873             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9874         }
9875         else if ( op == PLUS || op == STAR) {
9876             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9877         }
9878         else if (op == ANYOF) {
9879             /* arglen 1 + class block */
9880             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9881                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9882             node = NEXTOPER(node);
9883         }
9884         else if (PL_regkind[(U8)op] == EXACT) {
9885             /* Literal string, where present. */
9886             node += NODE_SZ_STR(node) - 1;
9887             node = NEXTOPER(node);
9888         }
9889         else {
9890             node = NEXTOPER(node);
9891             node += regarglen[(U8)op];
9892         }
9893         if (op == CURLYX || op == OPEN)
9894             indent++;
9895     }
9896     CLEAR_OPTSTART;
9897 #ifdef DEBUG_DUMPUNTIL
9898     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9899 #endif
9900     return node;
9901 }
9902
9903 #endif  /* DEBUGGING */
9904
9905 /*
9906  * Local variables:
9907  * c-indentation-style: bsd
9908  * c-basic-offset: 4
9909  * indent-tabs-mode: t
9910  * End:
9911  *
9912  * ex: set ts=8 sts=4 sw=4 noet:
9913  */