regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
   6  */
   7
   8 /* This file contains functions for compiling a regular expression.  See
   9  * also regexec.c which funnily enough, contains functions for executing
  10  * a regular expression.
  11  *
  12  * This file is also copied at build time to ext/re/re_comp.c, where
  13  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  14  * This causes the main functions to be compiled under new names and with
  15  * debugging support added, which makes "use re 'debug'" work.
  16  */
  17
  18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  19  * confused with the original package (see point 3 below).  Thanks, Henry!
  20  */
  21
  22 /* Additional note: this code is very heavily munged from Henry's version
  23  * in places.  In some spots I've traded clarity for efficiency, so don't
  24  * blame Henry for some of the lack of readability.
  25  */
  26
  27 /* The names of the functions have been changed from regcomp and
  28  * regexec to  pregcomp and pregexec in order to avoid conflicts
  29  * with the POSIX routines of the same names.
  30 */
  31
  32 #ifdef PERL_EXT_RE_BUILD
  33 #include "re_top.h"
  34 #endif
  35
  36 /*
  37  * pregcomp and pregexec -- regsub and regerror are not used in perl
  38  *
  39  *      Copyright (c) 1986 by University of Toronto.
  40  *      Written by Henry Spencer.  Not derived from licensed software.
  41  *
  42  *      Permission is granted to anyone to use this software for any
  43  *      purpose on any computer system, and to redistribute it freely,
  44  *      subject to the following restrictions:
  45  *
  46  *      1. The author is not responsible for the consequences of use of
  47  *              this software, no matter how awful, even if they arise
  48  *              from defects in it.
  49  *
  50  *      2. The origin of this software must not be misrepresented, either
  51  *              by explicit claim or by omission.
  52  *
  53  *      3. Altered versions must be plainly marked as such, and must not
  54  *              be misrepresented as being the original software.
  55  *
  56  *
  57  ****    Alterations to Henry's code are...
  58  ****
  59  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  60  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
  61  ****
  62  ****    You may distribute under the terms of either the GNU General Public
  63  ****    License or the Artistic License, as specified in the README file.
  64
  65  *
  66  * Beware that some of this code is subtly aware of the way operator
  67  * precedence is structured in regular expressions.  Serious changes in
  68  * regular-expression syntax might require a total rethink.
  69  */
  70 #include "EXTERN.h"
  71 #define PERL_IN_REGCOMP_C
  72 #include "perl.h"
  73
  74 #ifndef PERL_IN_XSUB_RE
  75 #  include "INTERN.h"
  76 #endif
  77
  78 #define REG_COMP_C
  79 #ifdef PERL_IN_XSUB_RE
  80 #  include "re_comp.h"
  81 #else
  82 #  include "regcomp.h"
  83 #endif
  84
  85 #ifdef op
  86 #undef op
  87 #endif /* op */
  88
  89 #ifdef MSDOS
  90 #  if defined(BUGGY_MSC6)
  91  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  92 #    pragma optimize("a",off)
  93  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  94 #    pragma optimize("w",on )
  95 #  endif /* BUGGY_MSC6 */
  96 #endif /* MSDOS */
  97
  98 #ifndef STATIC
  99 #define STATIC  static
 100 #endif
 101
 102 typedef struct RExC_state_t {
 103     U32         flags;                  /* are we folding, multilining? */
 104     char        *precomp;               /* uncompiled string. */
 105     regexp      *rx;
 106     char        *start;                 /* Start of input for compile */
 107     char        *end;                   /* End of input for compile */
 108     char        *parse;                 /* Input-scan pointer. */
 109     I32         whilem_seen;            /* number of WHILEM in this expr */
 110     regnode     *emit_start;            /* Start of emitted-code area */
 111     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 112     I32         naughty;                /* How bad is this pattern? */
 113     I32         sawback;                /* Did we see \1, ...? */
 114     U32         seen;
 115     I32         size;                   /* Code size. */
 116     I32         npar;                   /* Capture buffer count, (OPEN). */
 117     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 118     I32         nestroot;               /* root parens we are in - used by accept */
 119     I32         extralen;
 120     I32         seen_zerolen;
 121     I32         seen_evals;
 122     regnode     **open_parens;          /* pointers to open parens */
 123     regnode     **close_parens;         /* pointers to close parens */
 124     regnode     *opend;                 /* END node in program */
 125     I32         utf8;
 126     HV          *charnames;             /* cache of named sequences */
 127     HV          *paren_names;           /* Paren names */
 128     regnode     **recurse;              /* Recurse regops */
 129     I32         recurse_count;          /* Number of recurse regops */
 130 #if ADD_TO_REGEXEC
 131     char        *starttry;              /* -Dr: where regtry was called. */
 132 #define RExC_starttry   (pRExC_state->starttry)
 133 #endif
 134 #ifdef DEBUGGING
 135     const char  *lastparse;
 136     I32         lastnum;
 137 #define RExC_lastparse  (pRExC_state->lastparse)
 138 #define RExC_lastnum    (pRExC_state->lastnum)
 139 #endif
 140 } RExC_state_t;
 141
 142 #define RExC_flags      (pRExC_state->flags)
 143 #define RExC_precomp    (pRExC_state->precomp)
 144 #define RExC_rx         (pRExC_state->rx)
 145 #define RExC_start      (pRExC_state->start)
 146 #define RExC_end        (pRExC_state->end)
 147 #define RExC_parse      (pRExC_state->parse)
 148 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 149 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
 150 #define RExC_emit       (pRExC_state->emit)
 151 #define RExC_emit_start (pRExC_state->emit_start)
 152 #define RExC_naughty    (pRExC_state->naughty)
 153 #define RExC_sawback    (pRExC_state->sawback)
 154 #define RExC_seen       (pRExC_state->seen)
 155 #define RExC_size       (pRExC_state->size)
 156 #define RExC_npar       (pRExC_state->npar)
 157 #define RExC_cpar       (pRExC_state->cpar)
 158 #define RExC_nestroot   (pRExC_state->nestroot)
 159 #define RExC_extralen   (pRExC_state->extralen)
 160 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 161 #define RExC_seen_evals (pRExC_state->seen_evals)
 162 #define RExC_utf8       (pRExC_state->utf8)
 163 #define RExC_charnames  (pRExC_state->charnames)
 164 #define RExC_open_parens        (pRExC_state->open_parens)
 165 #define RExC_close_parens       (pRExC_state->close_parens)
 166 #define RExC_opend      (pRExC_state->opend)
 167 #define RExC_paren_names        (pRExC_state->paren_names)
 168 #define RExC_recurse    (pRExC_state->recurse)
 169 #define RExC_recurse_count      (pRExC_state->recurse_count)
 170
 171 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 172 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 173         ((*s) == '{' && regcurly(s)))
 174
 175 #ifdef SPSTART
 176 #undef SPSTART          /* dratted cpp namespace... */
 177 #endif
 178 /*
 179  * Flags to be passed up and down.
 180  */
 181 #define WORST           0       /* Worst case. */
 182 #define HASWIDTH        0x1     /* Known to match non-null strings. */
 183 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
 184 #define SPSTART         0x4     /* Starts with * or +. */
 185 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
 186
 187 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 188
 189 /* whether trie related optimizations are enabled */
 190 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 191 #define TRIE_STUDY_OPT
 192 #define FULL_TRIE_STUDY
 193 #define TRIE_STCLASS
 194 #endif
 195
 196
 197
 198 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 199 #define PBITVAL(paren) (1 << ((paren) & 7))
 200 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 201 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 202 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 203
 204
 205 /* About scan_data_t.
 206
 207   During optimisation we recurse through the regexp program performing
 208   various inplace (keyhole style) optimisations. In addition study_chunk
 209   and scan_commit populate this data structure with information about
 210   what strings MUST appear in the pattern. We look for the longest
 211   string that must appear for at a fixed location, and we look for the
 212   longest string that may appear at a floating location. So for instance
 213   in the pattern:
 214
 215     /FOO[xX]A.*B[xX]BAR/
 216
 217   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 218   strings (because they follow a .* construct). study_chunk will identify
 219   both FOO and BAR as being the longest fixed and floating strings respectively.
 220
 221   The strings can be composites, for instance
 222
 223      /(f)(o)(o)/
 224
 225   will result in a composite fixed substring 'foo'.
 226
 227   For each string some basic information is maintained:
 228
 229   - offset or min_offset
 230     This is the position the string must appear at, or not before.
 231     It also implicitly (when combined with minlenp) tells us how many
 232     character must match before the string we are searching.
 233     Likewise when combined with minlenp and the length of the string
 234     tells us how many characters must appear after the string we have
 235     found.
 236
 237   - max_offset
 238     Only used for floating strings. This is the rightmost point that
 239     the string can appear at. Ifset to I32 max it indicates that the
 240     string can occur infinitely far to the right.
 241
 242   - minlenp
 243     A pointer to the minimum length of the pattern that the string
 244     was found inside. This is important as in the case of positive
 245     lookahead or positive lookbehind we can have multiple patterns
 246     involved. Consider
 247
 248     /(?=FOO).*F/
 249
 250     The minimum length of the pattern overall is 3, the minimum length
 251     of the lookahead part is 3, but the minimum length of the part that
 252     will actually match is 1. So 'FOO's minimum length is 3, but the
 253     minimum length for the F is 1. This is important as the minimum length
 254     is used to determine offsets in front of and behind the string being
 255     looked for.  Since strings can be composites this is the length of the
 256     pattern at the time it was commited with a scan_commit. Note that
 257     the length is calculated by study_chunk, so that the minimum lengths
 258     are not known until the full pattern has been compiled, thus the
 259     pointer to the value.
 260
 261   - lookbehind
 262
 263     In the case of lookbehind the string being searched for can be
 264     offset past the start point of the final matching string.
 265     If this value was just blithely removed from the min_offset it would
 266     invalidate some of the calculations for how many chars must match
 267     before or after (as they are derived from min_offset and minlen and
 268     the length of the string being searched for).
 269     When the final pattern is compiled and the data is moved from the
 270     scan_data_t structure into the regexp structure the information
 271     about lookbehind is factored in, with the information that would
 272     have been lost precalculated in the end_shift field for the
 273     associated string.
 274
 275   The fields pos_min and pos_delta are used to store the minimum offset
 276   and the delta to the maximum offset at the current point in the pattern.
 277
 278 */
 279
 280 typedef struct scan_data_t {
 281     /*I32 len_min;      unused */
 282     /*I32 len_delta;    unused */
 283     I32 pos_min;
 284     I32 pos_delta;
 285     SV *last_found;
 286     I32 last_end;           /* min value, <0 unless valid. */
 287     I32 last_start_min;
 288     I32 last_start_max;
 289     SV **longest;           /* Either &l_fixed, or &l_float. */
 290     SV *longest_fixed;      /* longest fixed string found in pattern */
 291     I32 offset_fixed;       /* offset where it starts */
 292     I32 *minlen_fixed;      /* pointer to the minlen relevent to the string */
 293     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 294     SV *longest_float;      /* longest floating string found in pattern */
 295     I32 offset_float_min;   /* earliest point in string it can appear */
 296     I32 offset_float_max;   /* latest point in string it can appear */
 297     I32 *minlen_float;      /* pointer to the minlen relevent to the string */
 298     I32 lookbehind_float;   /* is the position of the string modified by LB */
 299     I32 flags;
 300     I32 whilem_c;
 301     I32 *last_closep;
 302     struct regnode_charclass_class *start_class;
 303 } scan_data_t;
 304
 305 /*
 306  * Forward declarations for pregcomp()'s friends.
 307  */
 308
 309 static const scan_data_t zero_scan_data =
 310   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 311
 312 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 313 #define SF_BEFORE_SEOL          0x0001
 314 #define SF_BEFORE_MEOL          0x0002
 315 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 316 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 317
 318 #ifdef NO_UNARY_PLUS
 319 #  define SF_FIX_SHIFT_EOL      (0+2)
 320 #  define SF_FL_SHIFT_EOL               (0+4)
 321 #else
 322 #  define SF_FIX_SHIFT_EOL      (+2)
 323 #  define SF_FL_SHIFT_EOL               (+4)
 324 #endif
 325
 326 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 327 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 328
 329 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 330 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 331 #define SF_IS_INF               0x0040
 332 #define SF_HAS_PAR              0x0080
 333 #define SF_IN_PAR               0x0100
 334 #define SF_HAS_EVAL             0x0200
 335 #define SCF_DO_SUBSTR           0x0400
 336 #define SCF_DO_STCLASS_AND      0x0800
 337 #define SCF_DO_STCLASS_OR       0x1000
 338 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 339 #define SCF_WHILEM_VISITED_POS  0x2000
 340
 341 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 342 #define SCF_SEEN_ACCEPT         0x8000
 343
 344 #define UTF (RExC_utf8 != 0)
 345 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
 346 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
 347
 348 #define OOB_UNICODE             12345678
 349 #define OOB_NAMEDCLASS          -1
 350
 351 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 352 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 353
 354
 355 /* length of regex to show in messages that don't mark a position within */
 356 #define RegexLengthToShowInErrorMessages 127
 357
 358 /*
 359  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 360  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 361  * op/pragma/warn/regcomp.
 362  */
 363 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 364 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 365
 366 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 367
 368 /*
 369  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 370  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 371  * "...".
 372  */
 373 #define _FAIL(code) STMT_START {                                        \
 374     const char *ellipses = "";                                          \
 375     IV len = RExC_end - RExC_precomp;                                   \
 376                                                                         \
 377     if (!SIZE_ONLY)                                                     \
 378         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
 379     if (len > RegexLengthToShowInErrorMessages) {                       \
 380         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 381         len = RegexLengthToShowInErrorMessages - 10;                    \
 382         ellipses = "...";                                               \
 383     }                                                                   \
 384     code;                                                               \
 385 } STMT_END
 386
 387 #define FAIL(msg) _FAIL(                            \
 388     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 389             msg, (int)len, RExC_precomp, ellipses))
 390
 391 #define FAIL2(msg,arg) _FAIL(                       \
 392     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 393             arg, (int)len, RExC_precomp, ellipses))
 394
 395 /*
 396  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 397  */
 398 #define Simple_vFAIL(m) STMT_START {                                    \
 399     const IV offset = RExC_parse - RExC_precomp;                        \
 400     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 401             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 402 } STMT_END
 403
 404 /*
 405  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 406  */
 407 #define vFAIL(m) STMT_START {                           \
 408     if (!SIZE_ONLY)                                     \
 409         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
 410     Simple_vFAIL(m);                                    \
 411 } STMT_END
 412
 413 /*
 414  * Like Simple_vFAIL(), but accepts two arguments.
 415  */
 416 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 417     const IV offset = RExC_parse - RExC_precomp;                        \
 418     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 419             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 420 } STMT_END
 421
 422 /*
 423  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 424  */
 425 #define vFAIL2(m,a1) STMT_START {                       \
 426     if (!SIZE_ONLY)                                     \
 427         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
 428     Simple_vFAIL2(m, a1);                               \
 429 } STMT_END
 430
 431
 432 /*
 433  * Like Simple_vFAIL(), but accepts three arguments.
 434  */
 435 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 436     const IV offset = RExC_parse - RExC_precomp;                \
 437     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 438             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 439 } STMT_END
 440
 441 /*
 442  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 443  */
 444 #define vFAIL3(m,a1,a2) STMT_START {                    \
 445     if (!SIZE_ONLY)                                     \
 446         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
 447     Simple_vFAIL3(m, a1, a2);                           \
 448 } STMT_END
 449
 450 /*
 451  * Like Simple_vFAIL(), but accepts four arguments.
 452  */
 453 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 454     const IV offset = RExC_parse - RExC_precomp;                \
 455     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 456             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 457 } STMT_END
 458
 459 #define vWARN(loc,m) STMT_START {                                       \
 460     const IV offset = loc - RExC_precomp;                               \
 461     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 462             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 463 } STMT_END
 464
 465 #define vWARNdep(loc,m) STMT_START {                                    \
 466     const IV offset = loc - RExC_precomp;                               \
 467     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
 468             "%s" REPORT_LOCATION,                                       \
 469             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 470 } STMT_END
 471
 472
 473 #define vWARN2(loc, m, a1) STMT_START {                                 \
 474     const IV offset = loc - RExC_precomp;                               \
 475     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 476             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 477 } STMT_END
 478
 479 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 480     const IV offset = loc - RExC_precomp;                               \
 481     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 482             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 483 } STMT_END
 484
 485 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 486     const IV offset = loc - RExC_precomp;                               \
 487     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 488             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 489 } STMT_END
 490
 491 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 492     const IV offset = loc - RExC_precomp;                               \
 493     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 494             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 495 } STMT_END
 496
 497
 498 /* Allow for side effects in s */
 499 #define REGC(c,s) STMT_START {                  \
 500     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 501 } STMT_END
 502
 503 /* Macros for recording node offsets.   20001227 mjd@plover.com
 504  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 505  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 506  * Element 0 holds the number n.
 507  * Position is 1 indexed.
 508  */
 509
 510 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 511     if (! SIZE_ONLY) {                                                  \
 512         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 513                     __LINE__, (int)(node), (int)(byte)));               \
 514         if((node) < 0) {                                                \
 515             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 516         } else {                                                        \
 517             RExC_offsets[2*(node)-1] = (byte);                          \
 518         }                                                               \
 519     }                                                                   \
 520 } STMT_END
 521
 522 #define Set_Node_Offset(node,byte) \
 523     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 524 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 525
 526 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 527     if (! SIZE_ONLY) {                                                  \
 528         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 529                 __LINE__, (int)(node), (int)(len)));                    \
 530         if((node) < 0) {                                                \
 531             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 532         } else {                                                        \
 533             RExC_offsets[2*(node)] = (len);                             \
 534         }                                                               \
 535     }                                                                   \
 536 } STMT_END
 537
 538 #define Set_Node_Length(node,len) \
 539     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 540 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 541 #define Set_Node_Cur_Length(node) \
 542     Set_Node_Length(node, RExC_parse - parse_start)
 543
 544 /* Get offsets and lengths */
 545 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 546 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 547
 548 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 549     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 550     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 551 } STMT_END
 552
 553
 554 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 555 #define EXPERIMENTAL_INPLACESCAN
 556 #endif
 557
 558 #define DEBUG_STUDYDATA(data,depth)                                  \
 559 DEBUG_OPTIMISE_MORE_r(if(data){                                           \
 560     PerlIO_printf(Perl_debug_log,                                    \
 561         "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf           \
 562         " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ",           \
 563         (int)(depth)*2, "",                                          \
 564         (IV)((data)->pos_min),                                       \
 565         (IV)((data)->pos_delta),                                     \
 566         (IV)((data)->flags),                                         \
 567         (IV)((data)->whilem_c),                                      \
 568         (IV)((data)->last_closep ? *((data)->last_closep) : -1)      \
 569     );                                                               \
 570     if ((data)->last_found)                                          \
 571         PerlIO_printf(Perl_debug_log,                                \
 572             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 573             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 574             SvPVX_const((data)->last_found),                         \
 575             (IV)((data)->last_end),                                  \
 576             (IV)((data)->last_start_min),                            \
 577             (IV)((data)->last_start_max),                            \
 578             ((data)->longest &&                                      \
 579              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 580             SvPVX_const((data)->longest_fixed),                      \
 581             (IV)((data)->offset_fixed),                              \
 582             ((data)->longest &&                                      \
 583              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 584             SvPVX_const((data)->longest_float),                      \
 585             (IV)((data)->offset_float_min),                          \
 586             (IV)((data)->offset_float_max)                           \
 587         );                                                           \
 588     PerlIO_printf(Perl_debug_log,"\n");                              \
 589 });
 590
 591 static void clear_re(pTHX_ void *r);
 592
 593 /* Mark that we cannot extend a found fixed substring at this point.
 594    Update the longest found anchored substring and the longest found
 595    floating substrings if needed. */
 596
 597 STATIC void
 598 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
 599 {
 600     const STRLEN l = CHR_SVLEN(data->last_found);
 601     const STRLEN old_l = CHR_SVLEN(*data->longest);
 602     GET_RE_DEBUG_FLAGS_DECL;
 603
 604     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 605         SvSetMagicSV(*data->longest, data->last_found);
 606         if (*data->longest == data->longest_fixed) {
 607             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 608             if (data->flags & SF_BEFORE_EOL)
 609                 data->flags
 610                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 611             else
 612                 data->flags &= ~SF_FIX_BEFORE_EOL;
 613             data->minlen_fixed=minlenp;
 614             data->lookbehind_fixed=0;
 615         }
 616         else {
 617             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 618             data->offset_float_max = (l
 619                                       ? data->last_start_max
 620                                       : data->pos_min + data->pos_delta);
 621             if ((U32)data->offset_float_max > (U32)I32_MAX)
 622                 data->offset_float_max = I32_MAX;
 623             if (data->flags & SF_BEFORE_EOL)
 624                 data->flags
 625                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 626             else
 627                 data->flags &= ~SF_FL_BEFORE_EOL;
 628             data->minlen_float=minlenp;
 629             data->lookbehind_float=0;
 630         }
 631     }
 632     SvCUR_set(data->last_found, 0);
 633     {
 634         SV * const sv = data->last_found;
 635         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 636             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 637             if (mg)
 638                 mg->mg_len = 0;
 639         }
 640     }
 641     data->last_end = -1;
 642     data->flags &= ~SF_BEFORE_EOL;
 643     DEBUG_STUDYDATA(data,0);
 644 }
 645
 646 /* Can match anything (initialization) */
 647 STATIC void
 648 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 649 {
 650     ANYOF_CLASS_ZERO(cl);
 651     ANYOF_BITMAP_SETALL(cl);
 652     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
 653     if (LOC)
 654         cl->flags |= ANYOF_LOCALE;
 655 }
 656
 657 /* Can match anything (initialization) */
 658 STATIC int
 659 S_cl_is_anything(const struct regnode_charclass_class *cl)
 660 {
 661     int value;
 662
 663     for (value = 0; value <= ANYOF_MAX; value += 2)
 664         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 665             return 1;
 666     if (!(cl->flags & ANYOF_UNICODE_ALL))
 667         return 0;
 668     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 669         return 0;
 670     return 1;
 671 }
 672
 673 /* Can match anything (initialization) */
 674 STATIC void
 675 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 676 {
 677     Zero(cl, 1, struct regnode_charclass_class);
 678     cl->type = ANYOF;
 679     cl_anything(pRExC_state, cl);
 680 }
 681
 682 STATIC void
 683 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 684 {
 685     Zero(cl, 1, struct regnode_charclass_class);
 686     cl->type = ANYOF;
 687     cl_anything(pRExC_state, cl);
 688     if (LOC)
 689         cl->flags |= ANYOF_LOCALE;
 690 }
 691
 692 /* 'And' a given class with another one.  Can create false positives */
 693 /* We assume that cl is not inverted */
 694 STATIC void
 695 S_cl_and(struct regnode_charclass_class *cl,
 696         const struct regnode_charclass_class *and_with)
 697 {
 698
 699     assert(and_with->type == ANYOF);
 700     if (!(and_with->flags & ANYOF_CLASS)
 701         && !(cl->flags & ANYOF_CLASS)
 702         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 703         && !(and_with->flags & ANYOF_FOLD)
 704         && !(cl->flags & ANYOF_FOLD)) {
 705         int i;
 706
 707         if (and_with->flags & ANYOF_INVERT)
 708             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 709                 cl->bitmap[i] &= ~and_with->bitmap[i];
 710         else
 711             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 712                 cl->bitmap[i] &= and_with->bitmap[i];
 713     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 714     if (!(and_with->flags & ANYOF_EOS))
 715         cl->flags &= ~ANYOF_EOS;
 716
 717     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
 718         !(and_with->flags & ANYOF_INVERT)) {
 719         cl->flags &= ~ANYOF_UNICODE_ALL;
 720         cl->flags |= ANYOF_UNICODE;
 721         ARG_SET(cl, ARG(and_with));
 722     }
 723     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
 724         !(and_with->flags & ANYOF_INVERT))
 725         cl->flags &= ~ANYOF_UNICODE_ALL;
 726     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
 727         !(and_with->flags & ANYOF_INVERT))
 728         cl->flags &= ~ANYOF_UNICODE;
 729 }
 730
 731 /* 'OR' a given class with another one.  Can create false positives */
 732 /* We assume that cl is not inverted */
 733 STATIC void
 734 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 735 {
 736     if (or_with->flags & ANYOF_INVERT) {
 737         /* We do not use
 738          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 739          *   <= (B1 | !B2) | (CL1 | !CL2)
 740          * which is wasteful if CL2 is small, but we ignore CL2:
 741          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 742          * XXXX Can we handle case-fold?  Unclear:
 743          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 744          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 745          */
 746         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 747              && !(or_with->flags & ANYOF_FOLD)
 748              && !(cl->flags & ANYOF_FOLD) ) {
 749             int i;
 750
 751             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 752                 cl->bitmap[i] |= ~or_with->bitmap[i];
 753         } /* XXXX: logic is complicated otherwise */
 754         else {
 755             cl_anything(pRExC_state, cl);
 756         }
 757     } else {
 758         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 759         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 760              && (!(or_with->flags & ANYOF_FOLD)
 761                  || (cl->flags & ANYOF_FOLD)) ) {
 762             int i;
 763
 764             /* OR char bitmap and class bitmap separately */
 765             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 766                 cl->bitmap[i] |= or_with->bitmap[i];
 767             if (or_with->flags & ANYOF_CLASS) {
 768                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 769                     cl->classflags[i] |= or_with->classflags[i];
 770                 cl->flags |= ANYOF_CLASS;
 771             }
 772         }
 773         else { /* XXXX: logic is complicated, leave it along for a moment. */
 774             cl_anything(pRExC_state, cl);
 775         }
 776     }
 777     if (or_with->flags & ANYOF_EOS)
 778         cl->flags |= ANYOF_EOS;
 779
 780     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
 781         ARG(cl) != ARG(or_with)) {
 782         cl->flags |= ANYOF_UNICODE_ALL;
 783         cl->flags &= ~ANYOF_UNICODE;
 784     }
 785     if (or_with->flags & ANYOF_UNICODE_ALL) {
 786         cl->flags |= ANYOF_UNICODE_ALL;
 787         cl->flags &= ~ANYOF_UNICODE;
 788     }
 789 }
 790
 791 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 792 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 793 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 794 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 795
 796
 797 #ifdef DEBUGGING
 798 /*
 799    dump_trie(trie)
 800    dump_trie_interim_list(trie,next_alloc)
 801    dump_trie_interim_table(trie,next_alloc)
 802
 803    These routines dump out a trie in a somewhat readable format.
 804    The _interim_ variants are used for debugging the interim
 805    tables that are used to generate the final compressed
 806    representation which is what dump_trie expects.
 807
 808    Part of the reason for their existance is to provide a form
 809    of documentation as to how the different representations function.
 810
 811 */
 812
 813 /*
 814   dump_trie(trie)
 815   Dumps the final compressed table form of the trie to Perl_debug_log.
 816   Used for debugging make_trie().
 817 */
 818
 819 STATIC void
 820 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
 821 {
 822     U32 state;
 823     SV *sv=sv_newmortal();
 824     int colwidth= trie->widecharmap ? 6 : 4;
 825     GET_RE_DEBUG_FLAGS_DECL;
 826
 827
 828     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
 829         (int)depth * 2 + 2,"",
 830         "Match","Base","Ofs" );
 831
 832     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 833         SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
 834         if ( tmp ) {
 835             PerlIO_printf( Perl_debug_log, "%*s",
 836                 colwidth,
 837                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 838                             PL_colors[0], PL_colors[1],
 839                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 840                             PERL_PV_ESCAPE_FIRSTCHAR
 841                 )
 842             );
 843         }
 844     }
 845     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
 846         (int)depth * 2 + 2,"");
 847
 848     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 849         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
 850     PerlIO_printf( Perl_debug_log, "\n");
 851
 852     for( state = 1 ; state < trie->statecount ; state++ ) {
 853         const U32 base = trie->states[ state ].trans.base;
 854
 855         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
 856
 857         if ( trie->states[ state ].wordnum ) {
 858             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
 859         } else {
 860             PerlIO_printf( Perl_debug_log, "%6s", "" );
 861         }
 862
 863         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
 864
 865         if ( base ) {
 866             U32 ofs = 0;
 867
 868             while( ( base + ofs  < trie->uniquecharcount ) ||
 869                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 870                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
 871                     ofs++;
 872
 873             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
 874
 875             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 876                 if ( ( base + ofs >= trie->uniquecharcount ) &&
 877                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 878                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 879                 {
 880                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
 881                     colwidth,
 882                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
 883                 } else {
 884                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
 885                 }
 886             }
 887
 888             PerlIO_printf( Perl_debug_log, "]");
 889
 890         }
 891         PerlIO_printf( Perl_debug_log, "\n" );
 892     }
 893 }
 894 /*
 895   dump_trie_interim_list(trie,next_alloc)
 896   Dumps a fully constructed but uncompressed trie in list form.
 897   List tries normally only are used for construction when the number of
 898   possible chars (trie->uniquecharcount) is very high.
 899   Used for debugging make_trie().
 900 */
 901 STATIC void
 902 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
 903 {
 904     U32 state;
 905     SV *sv=sv_newmortal();
 906     int colwidth= trie->widecharmap ? 6 : 4;
 907     GET_RE_DEBUG_FLAGS_DECL;
 908     /* print out the table precompression.  */
 909     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
 910         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
 911         "------:-----+-----------------\n" );
 912
 913     for( state=1 ; state < next_alloc ; state ++ ) {
 914         U16 charid;
 915
 916         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
 917             (int)depth * 2 + 2,"", (UV)state  );
 918         if ( ! trie->states[ state ].wordnum ) {
 919             PerlIO_printf( Perl_debug_log, "%5s| ","");
 920         } else {
 921             PerlIO_printf( Perl_debug_log, "W%4x| ",
 922                 trie->states[ state ].wordnum
 923             );
 924         }
 925         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
 926             SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
 927             if ( tmp ) {
 928                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
 929                     colwidth,
 930                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 931                             PL_colors[0], PL_colors[1],
 932                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 933                             PERL_PV_ESCAPE_FIRSTCHAR
 934                     ) ,
 935                     TRIE_LIST_ITEM(state,charid).forid,
 936                     (UV)TRIE_LIST_ITEM(state,charid).newstate
 937                 );
 938                 if (!(charid % 10))
 939                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
 940                         (int)((depth * 2) + 14), "");
 941             }
 942         }
 943         PerlIO_printf( Perl_debug_log, "\n");
 944     }
 945 }
 946
 947 /*
 948   dump_trie_interim_table(trie,next_alloc)
 949   Dumps a fully constructed but uncompressed trie in table form.
 950   This is the normal DFA style state transition table, with a few
 951   twists to facilitate compression later.
 952   Used for debugging make_trie().
 953 */
 954 STATIC void
 955 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
 956 {
 957     U32 state;
 958     U16 charid;
 959     SV *sv=sv_newmortal();
 960     int colwidth= trie->widecharmap ? 6 : 4;
 961     GET_RE_DEBUG_FLAGS_DECL;
 962
 963     /*
 964        print out the table precompression so that we can do a visual check
 965        that they are identical.
 966      */
 967
 968     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
 969
 970     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
 971         SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
 972         if ( tmp ) {
 973             PerlIO_printf( Perl_debug_log, "%*s",
 974                 colwidth,
 975                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 976                             PL_colors[0], PL_colors[1],
 977                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 978                             PERL_PV_ESCAPE_FIRSTCHAR
 979                 )
 980             );
 981         }
 982     }
 983
 984     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
 985
 986     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
 987         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
 988     }
 989
 990     PerlIO_printf( Perl_debug_log, "\n" );
 991
 992     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
 993
 994         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
 995             (int)depth * 2 + 2,"",
 996             (UV)TRIE_NODENUM( state ) );
 997
 998         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
 999             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1000             if (v)
1001                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1002             else
1003                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1004         }
1005         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1006             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1007         } else {
1008             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1009             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1010         }
1011     }
1012 }
1013
1014 #endif
1015
1016 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1017   startbranch: the first branch in the whole branch sequence
1018   first      : start branch of sequence of branch-exact nodes.
1019                May be the same as startbranch
1020   last       : Thing following the last branch.
1021                May be the same as tail.
1022   tail       : item following the branch sequence
1023   count      : words in the sequence
1024   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1025   depth      : indent depth
1026
1027 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1028
1029 A trie is an N'ary tree where the branches are determined by digital
1030 decomposition of the key. IE, at the root node you look up the 1st character and
1031 follow that branch repeat until you find the end of the branches. Nodes can be
1032 marked as "accepting" meaning they represent a complete word. Eg:
1033
1034   /he|she|his|hers/
1035
1036 would convert into the following structure. Numbers represent states, letters
1037 following numbers represent valid transitions on the letter from that state, if
1038 the number is in square brackets it represents an accepting state, otherwise it
1039 will be in parenthesis.
1040
1041       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1042       |    |
1043       |   (2)
1044       |    |
1045      (1)   +-i->(6)-+-s->[7]
1046       |
1047       +-s->(3)-+-h->(4)-+-e->[5]
1048
1049       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1050
1051 This shows that when matching against the string 'hers' we will begin at state 1
1052 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1053 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1054 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1055 single traverse. We store a mapping from accepting to state to which word was
1056 matched, and then when we have multiple possibilities we try to complete the
1057 rest of the regex in the order in which they occured in the alternation.
1058
1059 The only prior NFA like behaviour that would be changed by the TRIE support is
1060 the silent ignoring of duplicate alternations which are of the form:
1061
1062  / (DUPE|DUPE) X? (?{ ... }) Y /x
1063
1064 Thus EVAL blocks follwing a trie may be called a different number of times with
1065 and without the optimisation. With the optimisations dupes will be silently
1066 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1067 the following demonstrates:
1068
1069  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1070
1071 which prints out 'word' three times, but
1072
1073  'words'=~/(word|word|word)(?{ print $1 })S/
1074
1075 which doesnt print it out at all. This is due to other optimisations kicking in.
1076
1077 Example of what happens on a structural level:
1078
1079 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1080
1081    1: CURLYM[1] {1,32767}(18)
1082    5:   BRANCH(8)
1083    6:     EXACT <ac>(16)
1084    8:   BRANCH(11)
1085    9:     EXACT <ad>(16)
1086   11:   BRANCH(14)
1087   12:     EXACT <ab>(16)
1088   16:   SUCCEED(0)
1089   17:   NOTHING(18)
1090   18: END(0)
1091
1092 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1093 and should turn into:
1094
1095    1: CURLYM[1] {1,32767}(18)
1096    5:   TRIE(16)
1097         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1098           <ac>
1099           <ad>
1100           <ab>
1101   16:   SUCCEED(0)
1102   17:   NOTHING(18)
1103   18: END(0)
1104
1105 Cases where tail != last would be like /(?foo|bar)baz/:
1106
1107    1: BRANCH(4)
1108    2:   EXACT <foo>(8)
1109    4: BRANCH(7)
1110    5:   EXACT <bar>(8)
1111    7: TAIL(8)
1112    8: EXACT <baz>(10)
1113   10: END(0)
1114
1115 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1116 and would end up looking like:
1117
1118     1: TRIE(8)
1119       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1120         <foo>
1121         <bar>
1122    7: TAIL(8)
1123    8: EXACT <baz>(10)
1124   10: END(0)
1125
1126     d = uvuni_to_utf8_flags(d, uv, 0);
1127
1128 is the recommended Unicode-aware way of saying
1129
1130     *(d++) = uv;
1131 */
1132
1133 #define TRIE_STORE_REVCHAR                                                 \
1134     STMT_START {                                                           \
1135         SV *tmp = newSVpvs("");                                            \
1136         if (UTF) SvUTF8_on(tmp);                                           \
1137         Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc );                       \
1138         av_push( TRIE_REVCHARMAP(trie), tmp );                             \
1139     } STMT_END
1140
1141 #define TRIE_READ_CHAR STMT_START {                                           \
1142     wordlen++;                                                                \
1143     if ( UTF ) {                                                              \
1144         if ( folder ) {                                                       \
1145             if ( foldlen > 0 ) {                                              \
1146                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1147                foldlen -= len;                                                \
1148                scan += len;                                                   \
1149                len = 0;                                                       \
1150             } else {                                                          \
1151                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1152                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1153                 foldlen -= UNISKIP( uvc );                                    \
1154                 scan = foldbuf + UNISKIP( uvc );                              \
1155             }                                                                 \
1156         } else {                                                              \
1157             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1158         }                                                                     \
1159     } else {                                                                  \
1160         uvc = (U32)*uc;                                                       \
1161         len = 1;                                                              \
1162     }                                                                         \
1163 } STMT_END
1164
1165
1166
1167 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1168     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1169         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1170         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1171     }                                                           \
1172     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1173     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1174     TRIE_LIST_CUR( state )++;                                   \
1175 } STMT_END
1176
1177 #define TRIE_LIST_NEW(state) STMT_START {                       \
1178     Newxz( trie->states[ state ].trans.list,               \
1179         4, reg_trie_trans_le );                                 \
1180      TRIE_LIST_CUR( state ) = 1;                                \
1181      TRIE_LIST_LEN( state ) = 4;                                \
1182 } STMT_END
1183
1184 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1185     U16 dupe= trie->states[ state ].wordnum;                    \
1186     regnode * const noper_next = regnext( noper );              \
1187                                                                 \
1188     if (trie->wordlen)                                          \
1189         trie->wordlen[ curword ] = wordlen;                     \
1190     DEBUG_r({                                                   \
1191         /* store the word for dumping */                        \
1192         SV* tmp;                                                \
1193         if (OP(noper) != NOTHING)                               \
1194             tmp = newSVpvn(STRING(noper), STR_LEN(noper));      \
1195         else                                                    \
1196             tmp = newSVpvn( "", 0 );                            \
1197         if ( UTF ) SvUTF8_on( tmp );                            \
1198         av_push( trie->words, tmp );                            \
1199     });                                                         \
1200                                                                 \
1201     curword++;                                                  \
1202                                                                 \
1203     if ( noper_next < tail ) {                                  \
1204         if (!trie->jump)                                        \
1205             Newxz( trie->jump, word_count + 1, U16);            \
1206         trie->jump[curword] = (U16)(noper_next - convert);      \
1207         if (!jumper)                                            \
1208             jumper = noper_next;                                \
1209         if (!nextbranch)                                        \
1210             nextbranch= regnext(cur);                           \
1211     }                                                           \
1212                                                                 \
1213     if ( dupe ) {                                               \
1214         /* So it's a dupe. This means we need to maintain a   */\
1215         /* linked-list from the first to the next.            */\
1216         /* we only allocate the nextword buffer when there    */\
1217         /* a dupe, so first time we have to do the allocation */\
1218         if (!trie->nextword)                                    \
1219             Newxz( trie->nextword, word_count + 1, U16);        \
1220         while ( trie->nextword[dupe] )                          \
1221             dupe= trie->nextword[dupe];                         \
1222         trie->nextword[dupe]= curword;                          \
1223     } else {                                                    \
1224         /* we haven't inserted this word yet.                */ \
1225         trie->states[ state ].wordnum = curword;                \
1226     }                                                           \
1227 } STMT_END
1228
1229
1230 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1231      ( ( base + charid >=  ucharcount                                   \
1232          && base + charid < ubound                                      \
1233          && state == trie->trans[ base - ucharcount + charid ].check    \
1234          && trie->trans[ base - ucharcount + charid ].next )            \
1235            ? trie->trans[ base - ucharcount + charid ].next             \
1236            : ( state==1 ? special : 0 )                                 \
1237       )
1238
1239 #define MADE_TRIE       1
1240 #define MADE_JUMP_TRIE  2
1241 #define MADE_EXACT_TRIE 4
1242
1243 STATIC I32
1244 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1245 {
1246     dVAR;
1247     /* first pass, loop through and scan words */
1248     reg_trie_data *trie;
1249     regnode *cur;
1250     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1251     STRLEN len = 0;
1252     UV uvc = 0;
1253     U16 curword = 0;
1254     U32 next_alloc = 0;
1255     regnode *jumper = NULL;
1256     regnode *nextbranch = NULL;
1257     regnode *convert = NULL;
1258     /* we just use folder as a flag in utf8 */
1259     const U8 * const folder = ( flags == EXACTF
1260                        ? PL_fold
1261                        : ( flags == EXACTFL
1262                            ? PL_fold_locale
1263                            : NULL
1264                          )
1265                      );
1266
1267     const U32 data_slot = add_data( pRExC_state, 1, "t" );
1268     SV *re_trie_maxbuff;
1269 #ifndef DEBUGGING
1270     /* these are only used during construction but are useful during
1271      * debugging so we store them in the struct when debugging.
1272      */
1273     STRLEN trie_charcount=0;
1274     AV *trie_revcharmap;
1275 #endif
1276     GET_RE_DEBUG_FLAGS_DECL;
1277 #ifndef DEBUGGING
1278     PERL_UNUSED_ARG(depth);
1279 #endif
1280
1281     Newxz( trie, 1, reg_trie_data );
1282     trie->refcount = 1;
1283     trie->startstate = 1;
1284     trie->wordcount = word_count;
1285     RExC_rx->data->data[ data_slot ] = (void*)trie;
1286     Newxz( trie->charmap, 256, U16 );
1287     if (!(UTF && folder))
1288         Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1289     DEBUG_r({
1290         trie->words = newAV();
1291     });
1292     TRIE_REVCHARMAP(trie) = newAV();
1293
1294     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1295     if (!SvIOK(re_trie_maxbuff)) {
1296         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1297     }
1298     DEBUG_OPTIMISE_r({
1299                 PerlIO_printf( Perl_debug_log,
1300                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1301                   (int)depth * 2 + 2, "",
1302                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1303                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1304                   (int)depth);
1305     });
1306
1307    /* Find the node we are going to overwrite */
1308     if ( first == startbranch && OP( last ) != BRANCH ) {
1309         /* whole branch chain */
1310         convert = first;
1311     } else {
1312         /* branch sub-chain */
1313         convert = NEXTOPER( first );
1314     }
1315
1316     /*  -- First loop and Setup --
1317
1318        We first traverse the branches and scan each word to determine if it
1319        contains widechars, and how many unique chars there are, this is
1320        important as we have to build a table with at least as many columns as we
1321        have unique chars.
1322
1323        We use an array of integers to represent the character codes 0..255
1324        (trie->charmap) and we use a an HV* to store unicode characters. We use the
1325        native representation of the character value as the key and IV's for the
1326        coded index.
1327
1328        *TODO* If we keep track of how many times each character is used we can
1329        remap the columns so that the table compression later on is more
1330        efficient in terms of memory by ensuring most common value is in the
1331        middle and the least common are on the outside.  IMO this would be better
1332        than a most to least common mapping as theres a decent chance the most
1333        common letter will share a node with the least common, meaning the node
1334        will not be compressable. With a middle is most common approach the worst
1335        case is when we have the least common nodes twice.
1336
1337      */
1338
1339     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1340         regnode * const noper = NEXTOPER( cur );
1341         const U8 *uc = (U8*)STRING( noper );
1342         const U8 * const e  = uc + STR_LEN( noper );
1343         STRLEN foldlen = 0;
1344         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1345         const U8 *scan = (U8*)NULL;
1346         U32 wordlen      = 0;         /* required init */
1347         STRLEN chars=0;
1348
1349         if (OP(noper) == NOTHING) {
1350             trie->minlen= 0;
1351             continue;
1352         }
1353         if (trie->bitmap) {
1354             TRIE_BITMAP_SET(trie,*uc);
1355             if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1356         }
1357         for ( ; uc < e ; uc += len ) {
1358             TRIE_CHARCOUNT(trie)++;
1359             TRIE_READ_CHAR;
1360             chars++;
1361             if ( uvc < 256 ) {
1362                 if ( !trie->charmap[ uvc ] ) {
1363                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1364                     if ( folder )
1365                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1366                     TRIE_STORE_REVCHAR;
1367                 }
1368             } else {
1369                 SV** svpp;
1370                 if ( !trie->widecharmap )
1371                     trie->widecharmap = newHV();
1372
1373                 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1374
1375                 if ( !svpp )
1376                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1377
1378                 if ( !SvTRUE( *svpp ) ) {
1379                     sv_setiv( *svpp, ++trie->uniquecharcount );
1380                     TRIE_STORE_REVCHAR;
1381                 }
1382             }
1383         }
1384         if( cur == first ) {
1385             trie->minlen=chars;
1386             trie->maxlen=chars;
1387         } else if (chars < trie->minlen) {
1388             trie->minlen=chars;
1389         } else if (chars > trie->maxlen) {
1390             trie->maxlen=chars;
1391         }
1392
1393     } /* end first pass */
1394     DEBUG_TRIE_COMPILE_r(
1395         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1396                 (int)depth * 2 + 2,"",
1397                 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1398                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1399                 (int)trie->minlen, (int)trie->maxlen )
1400     );
1401     Newxz( trie->wordlen, word_count, U32 );
1402
1403     /*
1404         We now know what we are dealing with in terms of unique chars and
1405         string sizes so we can calculate how much memory a naive
1406         representation using a flat table  will take. If it's over a reasonable
1407         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1408         conservative but potentially much slower representation using an array
1409         of lists.
1410
1411         At the end we convert both representations into the same compressed
1412         form that will be used in regexec.c for matching with. The latter
1413         is a form that cannot be used to construct with but has memory
1414         properties similar to the list form and access properties similar
1415         to the table form making it both suitable for fast searches and
1416         small enough that its feasable to store for the duration of a program.
1417
1418         See the comment in the code where the compressed table is produced
1419         inplace from the flat tabe representation for an explanation of how
1420         the compression works.
1421
1422     */
1423
1424
1425     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1426         /*
1427             Second Pass -- Array Of Lists Representation
1428
1429             Each state will be represented by a list of charid:state records
1430             (reg_trie_trans_le) the first such element holds the CUR and LEN
1431             points of the allocated array. (See defines above).
1432
1433             We build the initial structure using the lists, and then convert
1434             it into the compressed table form which allows faster lookups
1435             (but cant be modified once converted).
1436         */
1437
1438         STRLEN transcount = 1;
1439
1440         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1441             "%*sCompiling trie using list compiler\n",
1442             (int)depth * 2 + 2, ""));
1443
1444         Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1445         TRIE_LIST_NEW(1);
1446         next_alloc = 2;
1447
1448         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1449
1450             regnode * const noper = NEXTOPER( cur );
1451             U8 *uc           = (U8*)STRING( noper );
1452             const U8 * const e = uc + STR_LEN( noper );
1453             U32 state        = 1;         /* required init */
1454             U16 charid       = 0;         /* sanity init */
1455             U8 *scan         = (U8*)NULL; /* sanity init */
1456             STRLEN foldlen   = 0;         /* required init */
1457             U32 wordlen      = 0;         /* required init */
1458             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1459
1460             if (OP(noper) != NOTHING) {
1461                 for ( ; uc < e ; uc += len ) {
1462
1463                     TRIE_READ_CHAR;
1464
1465                     if ( uvc < 256 ) {
1466                         charid = trie->charmap[ uvc ];
1467                     } else {
1468                         SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1469                         if ( !svpp ) {
1470                             charid = 0;
1471                         } else {
1472                             charid=(U16)SvIV( *svpp );
1473                         }
1474                     }
1475                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1476                     if ( charid ) {
1477
1478                         U16 check;
1479                         U32 newstate = 0;
1480
1481                         charid--;
1482                         if ( !trie->states[ state ].trans.list ) {
1483                             TRIE_LIST_NEW( state );
1484                         }
1485                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1486                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1487                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1488                                 break;
1489                             }
1490                         }
1491                         if ( ! newstate ) {
1492                             newstate = next_alloc++;
1493                             TRIE_LIST_PUSH( state, charid, newstate );
1494                             transcount++;
1495                         }
1496                         state = newstate;
1497                     } else {
1498                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1499                     }
1500                 }
1501             }
1502             TRIE_HANDLE_WORD(state);
1503
1504         } /* end second pass */
1505
1506         /* next alloc is the NEXT state to be allocated */
1507         trie->statecount = next_alloc;
1508         Renew( trie->states, next_alloc, reg_trie_state );
1509
1510         /* and now dump it out before we compress it */
1511         DEBUG_TRIE_COMPILE_MORE_r(
1512             dump_trie_interim_list(trie,next_alloc,depth+1)
1513         );
1514
1515         Newxz( trie->trans, transcount ,reg_trie_trans );
1516         {
1517             U32 state;
1518             U32 tp = 0;
1519             U32 zp = 0;
1520
1521
1522             for( state=1 ; state < next_alloc ; state ++ ) {
1523                 U32 base=0;
1524
1525                 /*
1526                 DEBUG_TRIE_COMPILE_MORE_r(
1527                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1528                 );
1529                 */
1530
1531                 if (trie->states[state].trans.list) {
1532                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1533                     U16 maxid=minid;
1534                     U16 idx;
1535
1536                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1537                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1538                         if ( forid < minid ) {
1539                             minid=forid;
1540                         } else if ( forid > maxid ) {
1541                             maxid=forid;
1542                         }
1543                     }
1544                     if ( transcount < tp + maxid - minid + 1) {
1545                         transcount *= 2;
1546                         Renew( trie->trans, transcount, reg_trie_trans );
1547                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1548                     }
1549                     base = trie->uniquecharcount + tp - minid;
1550                     if ( maxid == minid ) {
1551                         U32 set = 0;
1552                         for ( ; zp < tp ; zp++ ) {
1553                             if ( ! trie->trans[ zp ].next ) {
1554                                 base = trie->uniquecharcount + zp - minid;
1555                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1556                                 trie->trans[ zp ].check = state;
1557                                 set = 1;
1558                                 break;
1559                             }
1560                         }
1561                         if ( !set ) {
1562                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1563                             trie->trans[ tp ].check = state;
1564                             tp++;
1565                             zp = tp;
1566                         }
1567                     } else {
1568                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1569                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1570                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1571                             trie->trans[ tid ].check = state;
1572                         }
1573                         tp += ( maxid - minid + 1 );
1574                     }
1575                     Safefree(trie->states[ state ].trans.list);
1576                 }
1577                 /*
1578                 DEBUG_TRIE_COMPILE_MORE_r(
1579                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1580                 );
1581                 */
1582                 trie->states[ state ].trans.base=base;
1583             }
1584             trie->lasttrans = tp + 1;
1585         }
1586     } else {
1587         /*
1588            Second Pass -- Flat Table Representation.
1589
1590            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1591            We know that we will need Charcount+1 trans at most to store the data
1592            (one row per char at worst case) So we preallocate both structures
1593            assuming worst case.
1594
1595            We then construct the trie using only the .next slots of the entry
1596            structs.
1597
1598            We use the .check field of the first entry of the node  temporarily to
1599            make compression both faster and easier by keeping track of how many non
1600            zero fields are in the node.
1601
1602            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1603            transition.
1604
1605            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1606            number representing the first entry of the node, and state as a
1607            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1608            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1609            are 2 entrys per node. eg:
1610
1611              A B       A B
1612           1. 2 4    1. 3 7
1613           2. 0 3    3. 0 5
1614           3. 0 0    5. 0 0
1615           4. 0 0    7. 0 0
1616
1617            The table is internally in the right hand, idx form. However as we also
1618            have to deal with the states array which is indexed by nodenum we have to
1619            use TRIE_NODENUM() to convert.
1620
1621         */
1622         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1623             "%*sCompiling trie using table compiler\n",
1624             (int)depth * 2 + 2, ""));
1625
1626         Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1627               reg_trie_trans );
1628         Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1629         next_alloc = trie->uniquecharcount + 1;
1630
1631
1632         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1633
1634             regnode * const noper   = NEXTOPER( cur );
1635             const U8 *uc     = (U8*)STRING( noper );
1636             const U8 * const e = uc + STR_LEN( noper );
1637
1638             U32 state        = 1;         /* required init */
1639
1640             U16 charid       = 0;         /* sanity init */
1641             U32 accept_state = 0;         /* sanity init */
1642             U8 *scan         = (U8*)NULL; /* sanity init */
1643
1644             STRLEN foldlen   = 0;         /* required init */
1645             U32 wordlen      = 0;         /* required init */
1646             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1647
1648             if ( OP(noper) != NOTHING ) {
1649                 for ( ; uc < e ; uc += len ) {
1650
1651                     TRIE_READ_CHAR;
1652
1653                     if ( uvc < 256 ) {
1654                         charid = trie->charmap[ uvc ];
1655                     } else {
1656                         SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1657                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1658                     }
1659                     if ( charid ) {
1660                         charid--;
1661                         if ( !trie->trans[ state + charid ].next ) {
1662                             trie->trans[ state + charid ].next = next_alloc;
1663                             trie->trans[ state ].check++;
1664                             next_alloc += trie->uniquecharcount;
1665                         }
1666                         state = trie->trans[ state + charid ].next;
1667                     } else {
1668                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1669                     }
1670                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1671                 }
1672             }
1673             accept_state = TRIE_NODENUM( state );
1674             TRIE_HANDLE_WORD(accept_state);
1675
1676         } /* end second pass */
1677
1678         /* and now dump it out before we compress it */
1679         DEBUG_TRIE_COMPILE_MORE_r(
1680             dump_trie_interim_table(trie,next_alloc,depth+1)
1681         );
1682
1683         {
1684         /*
1685            * Inplace compress the table.*
1686
1687            For sparse data sets the table constructed by the trie algorithm will
1688            be mostly 0/FAIL transitions or to put it another way mostly empty.
1689            (Note that leaf nodes will not contain any transitions.)
1690
1691            This algorithm compresses the tables by eliminating most such
1692            transitions, at the cost of a modest bit of extra work during lookup:
1693
1694            - Each states[] entry contains a .base field which indicates the
1695            index in the state[] array wheres its transition data is stored.
1696
1697            - If .base is 0 there are no  valid transitions from that node.
1698
1699            - If .base is nonzero then charid is added to it to find an entry in
1700            the trans array.
1701
1702            -If trans[states[state].base+charid].check!=state then the
1703            transition is taken to be a 0/Fail transition. Thus if there are fail
1704            transitions at the front of the node then the .base offset will point
1705            somewhere inside the previous nodes data (or maybe even into a node
1706            even earlier), but the .check field determines if the transition is
1707            valid.
1708
1709            XXX - wrong maybe?
1710            The following process inplace converts the table to the compressed
1711            table: We first do not compress the root node 1,and mark its all its
1712            .check pointers as 1 and set its .base pointer as 1 as well. This
1713            allows to do a DFA construction from the compressed table later, and
1714            ensures that any .base pointers we calculate later are greater than
1715            0.
1716
1717            - We set 'pos' to indicate the first entry of the second node.
1718
1719            - We then iterate over the columns of the node, finding the first and
1720            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1721            and set the .check pointers accordingly, and advance pos
1722            appropriately and repreat for the next node. Note that when we copy
1723            the next pointers we have to convert them from the original
1724            NODEIDX form to NODENUM form as the former is not valid post
1725            compression.
1726
1727            - If a node has no transitions used we mark its base as 0 and do not
1728            advance the pos pointer.
1729
1730            - If a node only has one transition we use a second pointer into the
1731            structure to fill in allocated fail transitions from other states.
1732            This pointer is independent of the main pointer and scans forward
1733            looking for null transitions that are allocated to a state. When it
1734            finds one it writes the single transition into the "hole".  If the
1735            pointer doesnt find one the single transition is appended as normal.
1736
1737            - Once compressed we can Renew/realloc the structures to release the
1738            excess space.
1739
1740            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1741            specifically Fig 3.47 and the associated pseudocode.
1742
1743            demq
1744         */
1745         const U32 laststate = TRIE_NODENUM( next_alloc );
1746         U32 state, charid;
1747         U32 pos = 0, zp=0;
1748         trie->statecount = laststate;
1749
1750         for ( state = 1 ; state < laststate ; state++ ) {
1751             U8 flag = 0;
1752             const U32 stateidx = TRIE_NODEIDX( state );
1753             const U32 o_used = trie->trans[ stateidx ].check;
1754             U32 used = trie->trans[ stateidx ].check;
1755             trie->trans[ stateidx ].check = 0;
1756
1757             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1758                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1759                     if ( trie->trans[ stateidx + charid ].next ) {
1760                         if (o_used == 1) {
1761                             for ( ; zp < pos ; zp++ ) {
1762                                 if ( ! trie->trans[ zp ].next ) {
1763                                     break;
1764                                 }
1765                             }
1766                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1767                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768                             trie->trans[ zp ].check = state;
1769                             if ( ++zp > pos ) pos = zp;
1770                             break;
1771                         }
1772                         used--;
1773                     }
1774                     if ( !flag ) {
1775                         flag = 1;
1776                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1777                     }
1778                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1779                     trie->trans[ pos ].check = state;
1780                     pos++;
1781                 }
1782             }
1783         }
1784         trie->lasttrans = pos + 1;
1785         Renew( trie->states, laststate, reg_trie_state);
1786         DEBUG_TRIE_COMPILE_MORE_r(
1787                 PerlIO_printf( Perl_debug_log,
1788                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1789                     (int)depth * 2 + 2,"",
1790                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1791                     (IV)next_alloc,
1792                     (IV)pos,
1793                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1794             );
1795
1796         } /* end table compress */
1797     }
1798     DEBUG_TRIE_COMPILE_MORE_r(
1799             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1800                 (int)depth * 2 + 2, "",
1801                 (UV)trie->statecount,
1802                 (UV)trie->lasttrans)
1803     );
1804     /* resize the trans array to remove unused space */
1805     Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1806
1807     /* and now dump out the compressed format */
1808     DEBUG_TRIE_COMPILE_r(
1809         dump_trie(trie,depth+1)
1810     );
1811
1812     {   /* Modify the program and insert the new TRIE node*/
1813         U8 nodetype =(U8)(flags & 0xFF);
1814         char *str=NULL;
1815
1816 #ifdef DEBUGGING
1817         regnode *optimize = NULL;
1818         U32 mjd_offset = 0;
1819         U32 mjd_nodelen = 0;
1820 #endif
1821         /*
1822            This means we convert either the first branch or the first Exact,
1823            depending on whether the thing following (in 'last') is a branch
1824            or not and whther first is the startbranch (ie is it a sub part of
1825            the alternation or is it the whole thing.)
1826            Assuming its a sub part we conver the EXACT otherwise we convert
1827            the whole branch sequence, including the first.
1828          */
1829         /* Find the node we are going to overwrite */
1830         if ( first != startbranch || OP( last ) == BRANCH ) {
1831             /* branch sub-chain */
1832             NEXT_OFF( first ) = (U16)(last - first);
1833             DEBUG_r({
1834                 mjd_offset= Node_Offset((convert));
1835                 mjd_nodelen= Node_Length((convert));
1836             });
1837             /* whole branch chain */
1838         } else {
1839             DEBUG_r({
1840                 const  regnode *nop = NEXTOPER( convert );
1841                 mjd_offset= Node_Offset((nop));
1842                 mjd_nodelen= Node_Length((nop));
1843             });
1844         }
1845
1846         DEBUG_OPTIMISE_r(
1847             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1848                 (int)depth * 2 + 2, "",
1849                 (UV)mjd_offset, (UV)mjd_nodelen)
1850         );
1851
1852         /* But first we check to see if there is a common prefix we can
1853            split out as an EXACT and put in front of the TRIE node.  */
1854         trie->startstate= 1;
1855         if ( trie->bitmap && !trie->widecharmap && !trie->jump  ) {
1856             U32 state;
1857             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1858                 U32 ofs = 0;
1859                 I32 idx = -1;
1860                 U32 count = 0;
1861                 const U32 base = trie->states[ state ].trans.base;
1862
1863                 if ( trie->states[state].wordnum )
1864                         count = 1;
1865
1866                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1867                     if ( ( base + ofs >= trie->uniquecharcount ) &&
1868                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1869                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1870                     {
1871                         if ( ++count > 1 ) {
1872                             SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1873                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1874                             if ( state == 1 ) break;
1875                             if ( count == 2 ) {
1876                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1877                                 DEBUG_OPTIMISE_r(
1878                                     PerlIO_printf(Perl_debug_log,
1879                                         "%*sNew Start State=%"UVuf" Class: [",
1880                                         (int)depth * 2 + 2, "",
1881                                         (UV)state));
1882                                 if (idx >= 0) {
1883                                     SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1884                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1885
1886                                     TRIE_BITMAP_SET(trie,*ch);
1887                                     if ( folder )
1888                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
1889                                     DEBUG_OPTIMISE_r(
1890                                         PerlIO_printf(Perl_debug_log, (char*)ch)
1891                                     );
1892                                 }
1893                             }
1894                             TRIE_BITMAP_SET(trie,*ch);
1895                             if ( folder )
1896                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1897                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1898                         }
1899                         idx = ofs;
1900                     }
1901                 }
1902                 if ( count == 1 ) {
1903                     SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1904                     char *ch = SvPV_nolen( *tmp );
1905                     DEBUG_OPTIMISE_r({
1906                         SV *sv=sv_newmortal();
1907                         PerlIO_printf( Perl_debug_log,
1908                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1909                             (int)depth * 2 + 2, "",
1910                             (UV)state, (UV)idx,
1911                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1912                                 PL_colors[0], PL_colors[1],
1913                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1914                                 PERL_PV_ESCAPE_FIRSTCHAR
1915                             )
1916                         );
1917                     });
1918                     if ( state==1 ) {
1919                         OP( convert ) = nodetype;
1920                         str=STRING(convert);
1921                         STR_LEN(convert)=0;
1922                     }
1923                     while (*ch) {
1924                         *str++ = *ch++;
1925                         STR_LEN(convert)++;
1926                     }
1927
1928                 } else {
1929 #ifdef DEBUGGING
1930                     if (state>1)
1931                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1932 #endif
1933                     break;
1934                 }
1935             }
1936             if (str) {
1937                 regnode *n = convert+NODE_SZ_STR(convert);
1938                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1939                 trie->startstate = state;
1940                 trie->minlen -= (state - 1);
1941                 trie->maxlen -= (state - 1);
1942                 DEBUG_r({
1943                     regnode *fix = convert;
1944                     U32 word = trie->wordcount;
1945                     mjd_nodelen++;
1946                     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1947                     while( ++fix < n ) {
1948                         Set_Node_Offset_Length(fix, 0, 0);
1949                     }
1950                     while (word--) {
1951                         SV ** const tmp = av_fetch( trie->words, word, 0 );
1952                         if (tmp) {
1953                             if ( STR_LEN(convert) <= SvCUR(*tmp) )
1954                                 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1955                             else
1956                                 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1957                         }
1958                     }
1959                 });
1960                 if (trie->maxlen) {
1961                     convert = n;
1962                 } else {
1963                     NEXT_OFF(convert) = (U16)(tail - convert);
1964                     DEBUG_r(optimize= n);
1965                 }
1966             }
1967         }
1968         if (!jumper)
1969             jumper = last;
1970         if ( trie->maxlen ) {
1971             NEXT_OFF( convert ) = (U16)(tail - convert);
1972             ARG_SET( convert, data_slot );
1973             /* Store the offset to the first unabsorbed branch in
1974                jump[0], which is otherwise unused by the jump logic.
1975                We use this when dumping a trie and during optimisation. */
1976             if (trie->jump)
1977                 trie->jump[0] = (U16)(nextbranch - convert);
1978
1979             /* XXXX */
1980             if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1981                  ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1982             {
1983                 OP( convert ) = TRIEC;
1984                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1985                 Safefree(trie->bitmap);
1986                 trie->bitmap= NULL;
1987             } else
1988                 OP( convert ) = TRIE;
1989
1990             /* store the type in the flags */
1991             convert->flags = nodetype;
1992             DEBUG_r({
1993             optimize = convert
1994                       + NODE_STEP_REGNODE
1995                       + regarglen[ OP( convert ) ];
1996             });
1997             /* XXX We really should free up the resource in trie now,
1998                    as we won't use them - (which resources?) dmq */
1999         }
2000         /* needed for dumping*/
2001         DEBUG_r(if (optimize) {
2002             regnode *opt = convert;
2003             while ( ++opt < optimize) {
2004                 Set_Node_Offset_Length(opt,0,0);
2005             }
2006             /*
2007                 Try to clean up some of the debris left after the
2008                 optimisation.
2009              */
2010             while( optimize < jumper ) {
2011                 mjd_nodelen += Node_Length((optimize));
2012                 OP( optimize ) = OPTIMIZED;
2013                 Set_Node_Offset_Length(optimize,0,0);
2014                 optimize++;
2015             }
2016             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2017         });
2018     } /* end node insert */
2019 #ifndef DEBUGGING
2020     SvREFCNT_dec(TRIE_REVCHARMAP(trie));
2021 #endif
2022     return trie->jump
2023            ? MADE_JUMP_TRIE
2024            : trie->startstate>1
2025              ? MADE_EXACT_TRIE
2026              : MADE_TRIE;
2027 }
2028
2029 STATIC void
2030 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2031 {
2032 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2033
2034    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2035    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2036    ISBN 0-201-10088-6
2037
2038    We find the fail state for each state in the trie, this state is the longest proper
2039    suffix of the current states 'word' that is also a proper prefix of another word in our
2040    trie. State 1 represents the word '' and is the thus the default fail state. This allows
2041    the DFA not to have to restart after its tried and failed a word at a given point, it
2042    simply continues as though it had been matching the other word in the first place.
2043    Consider
2044       'abcdgu'=~/abcdefg|cdgu/
2045    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2046    fail, which would bring use to the state representing 'd' in the second word where we would
2047    try 'g' and succeed, prodceding to match 'cdgu'.
2048  */
2049  /* add a fail transition */
2050     reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2051     U32 *q;
2052     const U32 ucharcount = trie->uniquecharcount;
2053     const U32 numstates = trie->statecount;
2054     const U32 ubound = trie->lasttrans + ucharcount;
2055     U32 q_read = 0;
2056     U32 q_write = 0;
2057     U32 charid;
2058     U32 base = trie->states[ 1 ].trans.base;
2059     U32 *fail;
2060     reg_ac_data *aho;
2061     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2062     GET_RE_DEBUG_FLAGS_DECL;
2063 #ifndef DEBUGGING
2064     PERL_UNUSED_ARG(depth);
2065 #endif
2066
2067
2068     ARG_SET( stclass, data_slot );
2069     Newxz( aho, 1, reg_ac_data );
2070     RExC_rx->data->data[ data_slot ] = (void*)aho;
2071     aho->trie=trie;
2072     aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2073         numstates * sizeof(reg_trie_state));
2074     Newxz( q, numstates, U32);
2075     Newxz( aho->fail, numstates, U32 );
2076     aho->refcount = 1;
2077     fail = aho->fail;
2078     /* initialize fail[0..1] to be 1 so that we always have
2079        a valid final fail state */
2080     fail[ 0 ] = fail[ 1 ] = 1;
2081
2082     for ( charid = 0; charid < ucharcount ; charid++ ) {
2083         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2084         if ( newstate ) {
2085             q[ q_write ] = newstate;
2086             /* set to point at the root */
2087             fail[ q[ q_write++ ] ]=1;
2088         }
2089     }
2090     while ( q_read < q_write) {
2091         const U32 cur = q[ q_read++ % numstates ];
2092         base = trie->states[ cur ].trans.base;
2093
2094         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2095             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2096             if (ch_state) {
2097                 U32 fail_state = cur;
2098                 U32 fail_base;
2099                 do {
2100                     fail_state = fail[ fail_state ];
2101                     fail_base = aho->states[ fail_state ].trans.base;
2102                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2103
2104                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2105                 fail[ ch_state ] = fail_state;
2106                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2107                 {
2108                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2109                 }
2110                 q[ q_write++ % numstates] = ch_state;
2111             }
2112         }
2113     }
2114     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2115        when we fail in state 1, this allows us to use the
2116        charclass scan to find a valid start char. This is based on the principle
2117        that theres a good chance the string being searched contains lots of stuff
2118        that cant be a start char.
2119      */
2120     fail[ 0 ] = fail[ 1 ] = 0;
2121     DEBUG_TRIE_COMPILE_r({
2122         PerlIO_printf(Perl_debug_log,
2123                       "%*sStclass Failtable (%"UVuf" states): 0",
2124                       (int)(depth * 2), "", (UV)numstates
2125         );
2126         for( q_read=1; q_read<numstates; q_read++ ) {
2127             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2128         }
2129         PerlIO_printf(Perl_debug_log, "\n");
2130     });
2131     Safefree(q);
2132     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2133 }
2134
2135
2136 /*
2137  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2138  * These need to be revisited when a newer toolchain becomes available.
2139  */
2140 #if defined(__sparc64__) && defined(__GNUC__)
2141 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2142 #       undef  SPARC64_GCC_WORKAROUND
2143 #       define SPARC64_GCC_WORKAROUND 1
2144 #   endif
2145 #endif
2146
2147 #define DEBUG_PEEP(str,scan,depth) \
2148     DEBUG_OPTIMISE_r({if (scan){ \
2149        SV * const mysv=sv_newmortal(); \
2150        regnode *Next = regnext(scan); \
2151        regprop(RExC_rx, mysv, scan); \
2152        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2153        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2154        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2155    }});
2156
2157
2158
2159
2160
2161 #define JOIN_EXACT(scan,min,flags) \
2162     if (PL_regkind[OP(scan)] == EXACT) \
2163         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2164
2165 STATIC U32
2166 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2167     /* Merge several consecutive EXACTish nodes into one. */
2168     regnode *n = regnext(scan);
2169     U32 stringok = 1;
2170     regnode *next = scan + NODE_SZ_STR(scan);
2171     U32 merged = 0;
2172     U32 stopnow = 0;
2173 #ifdef DEBUGGING
2174     regnode *stop = scan;
2175     GET_RE_DEBUG_FLAGS_DECL;
2176 #else
2177     PERL_UNUSED_ARG(depth);
2178 #endif
2179 #ifndef EXPERIMENTAL_INPLACESCAN
2180     PERL_UNUSED_ARG(flags);
2181     PERL_UNUSED_ARG(val);
2182 #endif
2183     DEBUG_PEEP("join",scan,depth);
2184
2185     /* Skip NOTHING, merge EXACT*. */
2186     while (n &&
2187            ( PL_regkind[OP(n)] == NOTHING ||
2188              (stringok && (OP(n) == OP(scan))))
2189            && NEXT_OFF(n)
2190            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2191
2192         if (OP(n) == TAIL || n > next)
2193             stringok = 0;
2194         if (PL_regkind[OP(n)] == NOTHING) {
2195             DEBUG_PEEP("skip:",n,depth);
2196             NEXT_OFF(scan) += NEXT_OFF(n);
2197             next = n + NODE_STEP_REGNODE;
2198 #ifdef DEBUGGING
2199             if (stringok)
2200                 stop = n;
2201 #endif
2202             n = regnext(n);
2203         }
2204         else if (stringok) {
2205             const unsigned int oldl = STR_LEN(scan);
2206             regnode * const nnext = regnext(n);
2207
2208             DEBUG_PEEP("merg",n,depth);
2209
2210             merged++;
2211             if (oldl + STR_LEN(n) > U8_MAX)
2212                 break;
2213             NEXT_OFF(scan) += NEXT_OFF(n);
2214             STR_LEN(scan) += STR_LEN(n);
2215             next = n + NODE_SZ_STR(n);
2216             /* Now we can overwrite *n : */
2217             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2218 #ifdef DEBUGGING
2219             stop = next - 1;
2220 #endif
2221             n = nnext;
2222             if (stopnow) break;
2223         }
2224
2225 #ifdef EXPERIMENTAL_INPLACESCAN
2226         if (flags && !NEXT_OFF(n)) {
2227             DEBUG_PEEP("atch", val, depth);
2228             if (reg_off_by_arg[OP(n)]) {
2229                 ARG_SET(n, val - n);
2230             }
2231             else {
2232                 NEXT_OFF(n) = val - n;
2233             }
2234             stopnow = 1;
2235         }
2236 #endif
2237     }
2238
2239     if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2240     /*
2241     Two problematic code points in Unicode casefolding of EXACT nodes:
2242
2243     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2244     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2245
2246     which casefold to
2247
2248     Unicode                      UTF-8
2249
2250     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2251     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2252
2253     This means that in case-insensitive matching (or "loose matching",
2254     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2255     length of the above casefolded versions) can match a target string
2256     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2257     This would rather mess up the minimum length computation.
2258
2259     What we'll do is to look for the tail four bytes, and then peek
2260     at the preceding two bytes to see whether we need to decrease
2261     the minimum length by four (six minus two).
2262
2263     Thanks to the design of UTF-8, there cannot be false matches:
2264     A sequence of valid UTF-8 bytes cannot be a subsequence of
2265     another valid sequence of UTF-8 bytes.
2266
2267     */
2268          char * const s0 = STRING(scan), *s, *t;
2269          char * const s1 = s0 + STR_LEN(scan) - 1;
2270          char * const s2 = s1 - 4;
2271 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2272          const char t0[] = "\xaf\x49\xaf\x42";
2273 #else
2274          const char t0[] = "\xcc\x88\xcc\x81";
2275 #endif
2276          const char * const t1 = t0 + 3;
2277
2278          for (s = s0 + 2;
2279               s < s2 && (t = ninstr(s, s1, t0, t1));
2280               s = t + 4) {
2281 #ifdef EBCDIC
2282               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2283                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2284 #else
2285               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2286                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2287 #endif
2288                    *min -= 4;
2289          }
2290     }
2291
2292 #ifdef DEBUGGING
2293     /* Allow dumping */
2294     n = scan + NODE_SZ_STR(scan);
2295     while (n <= stop) {
2296         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2297             OP(n) = OPTIMIZED;
2298             NEXT_OFF(n) = 0;
2299         }
2300         n++;
2301     }
2302 #endif
2303     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2304     return stopnow;
2305 }
2306
2307 /* REx optimizer.  Converts nodes into quickier variants "in place".
2308    Finds fixed substrings.  */
2309
2310 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2311    to the position after last scanned or to NULL. */
2312
2313 #define INIT_AND_WITHP \
2314     assert(!and_withp); \
2315     Newx(and_withp,1,struct regnode_charclass_class); \
2316     SAVEFREEPV(and_withp)
2317
2318 /* this is a chain of data about sub patterns we are processing that
2319    need to be handled seperately/specially in study_chunk. Its so
2320    we can simulate recursion without losing state.  */
2321 struct scan_frame;
2322 typedef struct scan_frame {
2323     regnode *last;  /* last node to process in this frame */
2324     regnode *next;  /* next node to process when last is reached */
2325     struct scan_frame *prev; /*previous frame*/
2326     I32 stop; /* what stopparen do we use */
2327 } scan_frame;
2328
2329 STATIC I32
2330 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2331                         I32 *minlenp, I32 *deltap,
2332                         regnode *last,
2333                         scan_data_t *data,
2334                         I32 stopparen,
2335                         U8* recursed,
2336                         struct regnode_charclass_class *and_withp,
2337                         U32 flags, U32 depth)
2338                         /* scanp: Start here (read-write). */
2339                         /* deltap: Write maxlen-minlen here. */
2340                         /* last: Stop before this one. */
2341                         /* data: string data about the pattern */
2342                         /* stopparen: treat close N as END */
2343                         /* recursed: which subroutines have we recursed into */
2344                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2345 {
2346     dVAR;
2347     I32 min = 0, pars = 0, code;
2348     regnode *scan = *scanp, *next;
2349     I32 delta = 0;
2350     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2351     int is_inf_internal = 0;            /* The studied chunk is infinite */
2352     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2353     scan_data_t data_fake;
2354     SV *re_trie_maxbuff = NULL;
2355     regnode *first_non_open = scan;
2356     I32 stopmin = I32_MAX;
2357     scan_frame *frame = NULL;
2358
2359     GET_RE_DEBUG_FLAGS_DECL;
2360
2361 #ifdef DEBUGGING
2362     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2363 #endif
2364
2365     if ( depth == 0 ) {
2366         while (first_non_open && OP(first_non_open) == OPEN)
2367             first_non_open=regnext(first_non_open);
2368     }
2369
2370
2371   fake_study_recurse:
2372     while ( scan && OP(scan) != END && scan < last ){
2373         /* Peephole optimizer: */
2374         DEBUG_STUDYDATA(data,depth);
2375         DEBUG_PEEP("Peep",scan,depth);
2376         JOIN_EXACT(scan,&min,0);
2377
2378         /* Follow the next-chain of the current node and optimize
2379            away all the NOTHINGs from it.  */
2380         if (OP(scan) != CURLYX) {
2381             const int max = (reg_off_by_arg[OP(scan)]
2382                        ? I32_MAX
2383                        /* I32 may be smaller than U16 on CRAYs! */
2384                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2385             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2386             int noff;
2387             regnode *n = scan;
2388
2389             /* Skip NOTHING and LONGJMP. */
2390             while ((n = regnext(n))
2391                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2392                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2393                    && off + noff < max)
2394                 off += noff;
2395             if (reg_off_by_arg[OP(scan)])
2396                 ARG(scan) = off;
2397             else
2398                 NEXT_OFF(scan) = off;
2399         }
2400
2401
2402
2403         /* The principal pseudo-switch.  Cannot be a switch, since we
2404            look into several different things.  */
2405         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2406                    || OP(scan) == IFTHEN) {
2407             next = regnext(scan);
2408             code = OP(scan);
2409             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2410
2411             if (OP(next) == code || code == IFTHEN) {
2412                 /* NOTE - There is similar code to this block below for handling
2413                    TRIE nodes on a re-study.  If you change stuff here check there
2414                    too. */
2415                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2416                 struct regnode_charclass_class accum;
2417                 regnode * const startbranch=scan;
2418
2419                 if (flags & SCF_DO_SUBSTR)
2420                     scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2421                 if (flags & SCF_DO_STCLASS)
2422                     cl_init_zero(pRExC_state, &accum);
2423
2424                 while (OP(scan) == code) {
2425                     I32 deltanext, minnext, f = 0, fake;
2426                     struct regnode_charclass_class this_class;
2427
2428                     num++;
2429                     data_fake.flags = 0;
2430                     if (data) {
2431                         data_fake.whilem_c = data->whilem_c;
2432                         data_fake.last_closep = data->last_closep;
2433                     }
2434                     else
2435                         data_fake.last_closep = &fake;
2436
2437                     data_fake.pos_delta = delta;
2438                     next = regnext(scan);
2439                     scan = NEXTOPER(scan);
2440                     if (code != BRANCH)
2441                         scan = NEXTOPER(scan);
2442                     if (flags & SCF_DO_STCLASS) {
2443                         cl_init(pRExC_state, &this_class);
2444                         data_fake.start_class = &this_class;
2445                         f = SCF_DO_STCLASS_AND;
2446                     }
2447                     if (flags & SCF_WHILEM_VISITED_POS)
2448                         f |= SCF_WHILEM_VISITED_POS;
2449
2450                     /* we suppose the run is continuous, last=next...*/
2451                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2452                                           next, &data_fake,
2453                                           stopparen, recursed, NULL, f,depth+1);
2454                     if (min1 > minnext)
2455                         min1 = minnext;
2456                     if (max1 < minnext + deltanext)
2457                         max1 = minnext + deltanext;
2458                     if (deltanext == I32_MAX)
2459                         is_inf = is_inf_internal = 1;
2460                     scan = next;
2461                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2462                         pars++;
2463                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2464                         if ( stopmin > minnext)
2465                             stopmin = min + min1;
2466                         flags &= ~SCF_DO_SUBSTR;
2467                         if (data)
2468                             data->flags |= SCF_SEEN_ACCEPT;
2469                     }
2470                     if (data) {
2471                         if (data_fake.flags & SF_HAS_EVAL)
2472                             data->flags |= SF_HAS_EVAL;
2473                         data->whilem_c = data_fake.whilem_c;
2474                     }
2475                     if (flags & SCF_DO_STCLASS)
2476                         cl_or(pRExC_state, &accum, &this_class);
2477                 }
2478                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2479                     min1 = 0;
2480                 if (flags & SCF_DO_SUBSTR) {
2481                     data->pos_min += min1;
2482                     data->pos_delta += max1 - min1;
2483                     if (max1 != min1 || is_inf)
2484                         data->longest = &(data->longest_float);
2485                 }
2486                 min += min1;
2487                 delta += max1 - min1;
2488                 if (flags & SCF_DO_STCLASS_OR) {
2489                     cl_or(pRExC_state, data->start_class, &accum);
2490                     if (min1) {
2491                         cl_and(data->start_class, and_withp);
2492                         flags &= ~SCF_DO_STCLASS;
2493                     }
2494                 }
2495                 else if (flags & SCF_DO_STCLASS_AND) {
2496                     if (min1) {
2497                         cl_and(data->start_class, &accum);
2498                         flags &= ~SCF_DO_STCLASS;
2499                     }
2500                     else {
2501                         /* Switch to OR mode: cache the old value of
2502                          * data->start_class */
2503                         INIT_AND_WITHP;
2504                         StructCopy(data->start_class, and_withp,
2505                                    struct regnode_charclass_class);
2506                         flags &= ~SCF_DO_STCLASS_AND;
2507                         StructCopy(&accum, data->start_class,
2508                                    struct regnode_charclass_class);
2509                         flags |= SCF_DO_STCLASS_OR;
2510                         data->start_class->flags |= ANYOF_EOS;
2511                     }
2512                 }
2513
2514                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2515                 /* demq.
2516
2517                    Assuming this was/is a branch we are dealing with: 'scan' now
2518                    points at the item that follows the branch sequence, whatever
2519                    it is. We now start at the beginning of the sequence and look
2520                    for subsequences of
2521
2522                    BRANCH->EXACT=>x1
2523                    BRANCH->EXACT=>x2
2524                    tail
2525
2526                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2527
2528                    If we can find such a subseqence we need to turn the first
2529                    element into a trie and then add the subsequent branch exact
2530                    strings to the trie.
2531
2532                    We have two cases
2533
2534                      1. patterns where the whole set of branch can be converted.
2535
2536                      2. patterns where only a subset can be converted.
2537
2538                    In case 1 we can replace the whole set with a single regop
2539                    for the trie. In case 2 we need to keep the start and end
2540                    branchs so
2541
2542                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2543                      becomes BRANCH TRIE; BRANCH X;
2544
2545                   There is an additional case, that being where there is a
2546                   common prefix, which gets split out into an EXACT like node
2547                   preceding the TRIE node.
2548
2549                   If x(1..n)==tail then we can do a simple trie, if not we make
2550                   a "jump" trie, such that when we match the appropriate word
2551                   we "jump" to the appopriate tail node. Essentailly we turn
2552                   a nested if into a case structure of sorts.
2553
2554                 */
2555
2556                     int made=0;
2557                     if (!re_trie_maxbuff) {
2558                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2559                         if (!SvIOK(re_trie_maxbuff))
2560                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2561                     }
2562                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2563                         regnode *cur;
2564                         regnode *first = (regnode *)NULL;
2565                         regnode *last = (regnode *)NULL;
2566                         regnode *tail = scan;
2567                         U8 optype = 0;
2568                         U32 count=0;
2569
2570 #ifdef DEBUGGING
2571                         SV * const mysv = sv_newmortal();       /* for dumping */
2572 #endif
2573                         /* var tail is used because there may be a TAIL
2574                            regop in the way. Ie, the exacts will point to the
2575                            thing following the TAIL, but the last branch will
2576                            point at the TAIL. So we advance tail. If we
2577                            have nested (?:) we may have to move through several
2578                            tails.
2579                          */
2580
2581                         while ( OP( tail ) == TAIL ) {
2582                             /* this is the TAIL generated by (?:) */
2583                             tail = regnext( tail );
2584                         }
2585
2586
2587                         DEBUG_OPTIMISE_r({
2588                             regprop(RExC_rx, mysv, tail );
2589                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2590                                 (int)depth * 2 + 2, "",
2591                                 "Looking for TRIE'able sequences. Tail node is: ",
2592                                 SvPV_nolen_const( mysv )
2593                             );
2594                         });
2595
2596                         /*
2597
2598                            step through the branches, cur represents each
2599                            branch, noper is the first thing to be matched
2600                            as part of that branch and noper_next is the
2601                            regnext() of that node. if noper is an EXACT
2602                            and noper_next is the same as scan (our current
2603                            position in the regex) then the EXACT branch is
2604                            a possible optimization target. Once we have
2605                            two or more consequetive such branches we can
2606                            create a trie of the EXACT's contents and stich
2607                            it in place. If the sequence represents all of
2608                            the branches we eliminate the whole thing and
2609                            replace it with a single TRIE. If it is a
2610                            subsequence then we need to stitch it in. This
2611                            means the first branch has to remain, and needs
2612                            to be repointed at the item on the branch chain
2613                            following the last branch optimized. This could
2614                            be either a BRANCH, in which case the
2615                            subsequence is internal, or it could be the
2616                            item following the branch sequence in which
2617                            case the subsequence is at the end.
2618
2619                         */
2620
2621                         /* dont use tail as the end marker for this traverse */
2622                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2623                             regnode * const noper = NEXTOPER( cur );
2624 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2625                             regnode * const noper_next = regnext( noper );
2626 #endif
2627
2628                             DEBUG_OPTIMISE_r({
2629                                 regprop(RExC_rx, mysv, cur);
2630                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2631                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2632
2633                                 regprop(RExC_rx, mysv, noper);
2634                                 PerlIO_printf( Perl_debug_log, " -> %s",
2635                                     SvPV_nolen_const(mysv));
2636
2637                                 if ( noper_next ) {
2638                                   regprop(RExC_rx, mysv, noper_next );
2639                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2640                                     SvPV_nolen_const(mysv));
2641                                 }
2642                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2643                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2644                             });
2645                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2646                                          : PL_regkind[ OP( noper ) ] == EXACT )
2647                                   || OP(noper) == NOTHING )
2648 #ifdef NOJUMPTRIE
2649                                   && noper_next == tail
2650 #endif
2651                                   && count < U16_MAX)
2652                             {
2653                                 count++;
2654                                 if ( !first || optype == NOTHING ) {
2655                                     if (!first) first = cur;
2656                                     optype = OP( noper );
2657                                 } else {
2658                                     last = cur;
2659                                 }
2660                             } else {
2661                                 if ( last ) {
2662                                     make_trie( pRExC_state,
2663                                             startbranch, first, cur, tail, count,
2664                                             optype, depth+1 );
2665                                 }
2666                                 if ( PL_regkind[ OP( noper ) ] == EXACT
2667 #ifdef NOJUMPTRIE
2668                                      && noper_next == tail
2669 #endif
2670                                 ){
2671                                     count = 1;
2672                                     first = cur;
2673                                     optype = OP( noper );
2674                                 } else {
2675                                     count = 0;
2676                                     first = NULL;
2677                                     optype = 0;
2678                                 }
2679                                 last = NULL;
2680                             }
2681                         }
2682                         DEBUG_OPTIMISE_r({
2683                             regprop(RExC_rx, mysv, cur);
2684                             PerlIO_printf( Perl_debug_log,
2685                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2686                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2687
2688                         });
2689                         if ( last ) {
2690                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2691 #ifdef TRIE_STUDY_OPT
2692                             if ( ((made == MADE_EXACT_TRIE &&
2693                                  startbranch == first)
2694                                  || ( first_non_open == first )) &&
2695                                  depth==0 ) {
2696                                 flags |= SCF_TRIE_RESTUDY;
2697                                 if ( startbranch == first
2698                                      && scan == tail )
2699                                 {
2700                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2701                                 }
2702                             }
2703 #endif
2704                         }
2705                     }
2706
2707                 } /* do trie */
2708
2709             }
2710             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
2711                 scan = NEXTOPER(NEXTOPER(scan));
2712             } else                      /* single branch is optimized. */
2713                 scan = NEXTOPER(scan);
2714             continue;
2715         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2716             scan_frame *newframe = NULL;
2717             I32 paren;
2718             regnode *start;
2719             regnode *end;
2720
2721             if (OP(scan) != SUSPEND) {
2722             /* set the pointer */
2723                 if (OP(scan) == GOSUB) {
2724                     paren = ARG(scan);
2725                     RExC_recurse[ARG2L(scan)] = scan;
2726                     start = RExC_open_parens[paren-1];
2727                     end   = RExC_close_parens[paren-1];
2728                 } else {
2729                     paren = 0;
2730                     start = RExC_rx->program + 1;
2731                     end   = RExC_opend;
2732                 }
2733                 if (!recursed) {
2734                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2735                     SAVEFREEPV(recursed);
2736                 }
2737                 if (!PAREN_TEST(recursed,paren+1)) {
2738                     PAREN_SET(recursed,paren+1);
2739                     Newx(newframe,1,scan_frame);
2740                 } else {
2741                     if (flags & SCF_DO_SUBSTR) {
2742                         scan_commit(pRExC_state,data,minlenp);
2743                         data->longest = &(data->longest_float);
2744                     }
2745                     is_inf = is_inf_internal = 1;
2746                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2747                         cl_anything(pRExC_state, data->start_class);
2748                     flags &= ~SCF_DO_STCLASS;
2749                 }
2750             } else {
2751                 Newx(newframe,1,scan_frame);
2752                 paren = stopparen;
2753                 start = scan+2;
2754                 end = regnext(scan);
2755             }
2756             if (newframe) {
2757                 assert(start);
2758                 assert(end);
2759                 SAVEFREEPV(newframe);
2760                 newframe->next = regnext(scan);
2761                 newframe->last = last;
2762                 newframe->stop = stopparen;
2763                 newframe->prev = frame;
2764
2765                 frame = newframe;
2766                 scan =  start;
2767                 stopparen = paren;
2768                 last = end;
2769
2770                 continue;
2771             }
2772         }
2773         else if (OP(scan) == EXACT) {
2774             I32 l = STR_LEN(scan);
2775             UV uc;
2776             if (UTF) {
2777                 const U8 * const s = (U8*)STRING(scan);
2778                 l = utf8_length(s, s + l);
2779                 uc = utf8_to_uvchr(s, NULL);
2780             } else {
2781                 uc = *((U8*)STRING(scan));
2782             }
2783             min += l;
2784             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2785                 /* The code below prefers earlier match for fixed
2786                    offset, later match for variable offset.  */
2787                 if (data->last_end == -1) { /* Update the start info. */
2788                     data->last_start_min = data->pos_min;
2789                     data->last_start_max = is_inf
2790                         ? I32_MAX : data->pos_min + data->pos_delta;
2791                 }
2792                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2793                 if (UTF)
2794                     SvUTF8_on(data->last_found);
2795                 {
2796                     SV * const sv = data->last_found;
2797                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2798                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
2799                     if (mg && mg->mg_len >= 0)
2800                         mg->mg_len += utf8_length((U8*)STRING(scan),
2801                                                   (U8*)STRING(scan)+STR_LEN(scan));
2802                 }
2803                 data->last_end = data->pos_min + l;
2804                 data->pos_min += l; /* As in the first entry. */
2805                 data->flags &= ~SF_BEFORE_EOL;
2806             }
2807             if (flags & SCF_DO_STCLASS_AND) {
2808                 /* Check whether it is compatible with what we know already! */
2809                 int compat = 1;
2810
2811                 if (uc >= 0x100 ||
2812                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2813                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
2814                     && (!(data->start_class->flags & ANYOF_FOLD)
2815                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2816                     )
2817                     compat = 0;
2818                 ANYOF_CLASS_ZERO(data->start_class);
2819                 ANYOF_BITMAP_ZERO(data->start_class);
2820                 if (compat)
2821                     ANYOF_BITMAP_SET(data->start_class, uc);
2822                 data->start_class->flags &= ~ANYOF_EOS;
2823                 if (uc < 0x100)
2824                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2825             }
2826             else if (flags & SCF_DO_STCLASS_OR) {
2827                 /* false positive possible if the class is case-folded */
2828                 if (uc < 0x100)
2829                     ANYOF_BITMAP_SET(data->start_class, uc);
2830                 else
2831                     data->start_class->flags |= ANYOF_UNICODE_ALL;
2832                 data->start_class->flags &= ~ANYOF_EOS;
2833                 cl_and(data->start_class, and_withp);
2834             }
2835             flags &= ~SCF_DO_STCLASS;
2836         }
2837         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2838             I32 l = STR_LEN(scan);
2839             UV uc = *((U8*)STRING(scan));
2840
2841             /* Search for fixed substrings supports EXACT only. */
2842             if (flags & SCF_DO_SUBSTR) {
2843                 assert(data);
2844                 scan_commit(pRExC_state, data, minlenp);
2845             }
2846             if (UTF) {
2847                 const U8 * const s = (U8 *)STRING(scan);
2848                 l = utf8_length(s, s + l);
2849                 uc = utf8_to_uvchr(s, NULL);
2850             }
2851             min += l;
2852             if (flags & SCF_DO_SUBSTR)
2853                 data->pos_min += l;
2854             if (flags & SCF_DO_STCLASS_AND) {
2855                 /* Check whether it is compatible with what we know already! */
2856                 int compat = 1;
2857
2858                 if (uc >= 0x100 ||
2859                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2860                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
2861                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2862                     compat = 0;
2863                 ANYOF_CLASS_ZERO(data->start_class);
2864                 ANYOF_BITMAP_ZERO(data->start_class);
2865                 if (compat) {
2866                     ANYOF_BITMAP_SET(data->start_class, uc);
2867                     data->start_class->flags &= ~ANYOF_EOS;
2868                     data->start_class->flags |= ANYOF_FOLD;
2869                     if (OP(scan) == EXACTFL)
2870                         data->start_class->flags |= ANYOF_LOCALE;
2871                 }
2872             }
2873             else if (flags & SCF_DO_STCLASS_OR) {
2874                 if (data->start_class->flags & ANYOF_FOLD) {
2875                     /* false positive possible if the class is case-folded.
2876                        Assume that the locale settings are the same... */
2877                     if (uc < 0x100)
2878                         ANYOF_BITMAP_SET(data->start_class, uc);
2879                     data->start_class->flags &= ~ANYOF_EOS;
2880                 }
2881                 cl_and(data->start_class, and_withp);
2882             }
2883             flags &= ~SCF_DO_STCLASS;
2884         }
2885         else if (strchr((const char*)PL_varies,OP(scan))) {
2886             I32 mincount, maxcount, minnext, deltanext, fl = 0;
2887             I32 f = flags, pos_before = 0;
2888             regnode * const oscan = scan;
2889             struct regnode_charclass_class this_class;
2890             struct regnode_charclass_class *oclass = NULL;
2891             I32 next_is_eval = 0;
2892
2893             switch (PL_regkind[OP(scan)]) {
2894             case WHILEM:                /* End of (?:...)* . */
2895                 scan = NEXTOPER(scan);
2896                 goto finish;
2897             case PLUS:
2898                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2899                     next = NEXTOPER(scan);
2900                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2901                         mincount = 1;
2902                         maxcount = REG_INFTY;
2903                         next = regnext(scan);
2904                         scan = NEXTOPER(scan);
2905                         goto do_curly;
2906                     }
2907                 }
2908                 if (flags & SCF_DO_SUBSTR)
2909                     data->pos_min++;
2910                 min++;
2911                 /* Fall through. */
2912             case STAR:
2913                 if (flags & SCF_DO_STCLASS) {
2914                     mincount = 0;
2915                     maxcount = REG_INFTY;
2916                     next = regnext(scan);
2917                     scan = NEXTOPER(scan);
2918                     goto do_curly;
2919                 }
2920                 is_inf = is_inf_internal = 1;
2921                 scan = regnext(scan);
2922                 if (flags & SCF_DO_SUBSTR) {
2923                     scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2924                     data->longest = &(data->longest_float);
2925                 }
2926                 goto optimize_curly_tail;
2927             case CURLY:
2928                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2929                     && (scan->flags == stopparen))
2930                 {
2931                     mincount = 1;
2932                     maxcount = 1;
2933                 } else {
2934                     mincount = ARG1(scan);
2935                     maxcount = ARG2(scan);
2936                 }
2937                 next = regnext(scan);
2938                 if (OP(scan) == CURLYX) {
2939                     I32 lp = (data ? *(data->last_closep) : 0);
2940                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2941                 }
2942                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2943                 next_is_eval = (OP(scan) == EVAL);
2944               do_curly:
2945                 if (flags & SCF_DO_SUBSTR) {
2946                     if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2947                     pos_before = data->pos_min;
2948                 }
2949                 if (data) {
2950                     fl = data->flags;
2951                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2952                     if (is_inf)
2953                         data->flags |= SF_IS_INF;
2954                 }
2955                 if (flags & SCF_DO_STCLASS) {
2956                     cl_init(pRExC_state, &this_class);
2957                     oclass = data->start_class;
2958                     data->start_class = &this_class;
2959                     f |= SCF_DO_STCLASS_AND;
2960                     f &= ~SCF_DO_STCLASS_OR;
2961                 }
2962                 /* These are the cases when once a subexpression
2963                    fails at a particular position, it cannot succeed
2964                    even after backtracking at the enclosing scope.
2965
2966                    XXXX what if minimal match and we are at the
2967                         initial run of {n,m}? */
2968                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2969                     f &= ~SCF_WHILEM_VISITED_POS;
2970
2971                 /* This will finish on WHILEM, setting scan, or on NULL: */
2972                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2973                                       last, data, stopparen, recursed, NULL,
2974                                       (mincount == 0
2975                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2976
2977                 if (flags & SCF_DO_STCLASS)
2978                     data->start_class = oclass;
2979                 if (mincount == 0 || minnext == 0) {
2980                     if (flags & SCF_DO_STCLASS_OR) {
2981                         cl_or(pRExC_state, data->start_class, &this_class);
2982                     }
2983                     else if (flags & SCF_DO_STCLASS_AND) {
2984                         /* Switch to OR mode: cache the old value of
2985                          * data->start_class */
2986                         INIT_AND_WITHP;
2987                         StructCopy(data->start_class, and_withp,
2988                                    struct regnode_charclass_class);
2989                         flags &= ~SCF_DO_STCLASS_AND;
2990                         StructCopy(&this_class, data->start_class,
2991                                    struct regnode_charclass_class);
2992                         flags |= SCF_DO_STCLASS_OR;
2993                         data->start_class->flags |= ANYOF_EOS;
2994                     }
2995                 } else {                /* Non-zero len */
2996                     if (flags & SCF_DO_STCLASS_OR) {
2997                         cl_or(pRExC_state, data->start_class, &this_class);
2998                         cl_and(data->start_class, and_withp);
2999                     }
3000                     else if (flags & SCF_DO_STCLASS_AND)
3001                         cl_and(data->start_class, &this_class);
3002                     flags &= ~SCF_DO_STCLASS;
3003                 }
3004                 if (!scan)              /* It was not CURLYX, but CURLY. */
3005                     scan = next;
3006                 if ( /* ? quantifier ok, except for (?{ ... }) */
3007                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3008                     && (minnext == 0) && (deltanext == 0)
3009                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3010                     && maxcount <= REG_INFTY/3 /* Complement check for big count */
3011                     && ckWARN(WARN_REGEXP))
3012                 {
3013                     vWARN(RExC_parse,
3014                           "Quantifier unexpected on zero-length expression");
3015                 }
3016
3017                 min += minnext * mincount;
3018                 is_inf_internal |= ((maxcount == REG_INFTY
3019                                      && (minnext + deltanext) > 0)
3020                                     || deltanext == I32_MAX);
3021                 is_inf |= is_inf_internal;
3022                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3023
3024                 /* Try powerful optimization CURLYX => CURLYN. */
3025                 if (  OP(oscan) == CURLYX && data
3026                       && data->flags & SF_IN_PAR
3027                       && !(data->flags & SF_HAS_EVAL)
3028                       && !deltanext && minnext == 1 ) {
3029                     /* Try to optimize to CURLYN.  */
3030                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3031                     regnode * const nxt1 = nxt;
3032 #ifdef DEBUGGING
3033                     regnode *nxt2;
3034 #endif
3035
3036                     /* Skip open. */
3037                     nxt = regnext(nxt);
3038                     if (!strchr((const char*)PL_simple,OP(nxt))
3039                         && !(PL_regkind[OP(nxt)] == EXACT
3040                              && STR_LEN(nxt) == 1))
3041                         goto nogo;
3042 #ifdef DEBUGGING
3043                     nxt2 = nxt;
3044 #endif
3045                     nxt = regnext(nxt);
3046                     if (OP(nxt) != CLOSE)
3047                         goto nogo;
3048                     if (RExC_open_parens) {
3049                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3050                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3051                     }
3052                     /* Now we know that nxt2 is the only contents: */
3053                     oscan->flags = (U8)ARG(nxt);
3054                     OP(oscan) = CURLYN;
3055                     OP(nxt1) = NOTHING; /* was OPEN. */
3056
3057 #ifdef DEBUGGING
3058                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3059                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3060                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3061                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3062                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3063                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3064 #endif
3065                 }
3066               nogo:
3067
3068                 /* Try optimization CURLYX => CURLYM. */
3069                 if (  OP(oscan) == CURLYX && data
3070                       && !(data->flags & SF_HAS_PAR)
3071                       && !(data->flags & SF_HAS_EVAL)
3072                       && !deltanext     /* atom is fixed width */
3073                       && minnext != 0   /* CURLYM can't handle zero width */
3074                 ) {
3075                     /* XXXX How to optimize if data == 0? */
3076                     /* Optimize to a simpler form.  */
3077                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3078                     regnode *nxt2;
3079
3080                     OP(oscan) = CURLYM;
3081                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3082                             && (OP(nxt2) != WHILEM))
3083                         nxt = nxt2;
3084                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3085                     /* Need to optimize away parenths. */
3086                     if (data->flags & SF_IN_PAR) {
3087                         /* Set the parenth number.  */
3088                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3089
3090                         if (OP(nxt) != CLOSE)
3091                             FAIL("Panic opt close");
3092                         oscan->flags = (U8)ARG(nxt);
3093                         if (RExC_open_parens) {
3094                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3095                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3096                         }
3097                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3098                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3099
3100 #ifdef DEBUGGING
3101                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3102                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3103                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3104                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3105 #endif
3106 #if 0
3107                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3108                             regnode *nnxt = regnext(nxt1);
3109
3110                             if (nnxt == nxt) {
3111                                 if (reg_off_by_arg[OP(nxt1)])
3112                                     ARG_SET(nxt1, nxt2 - nxt1);
3113                                 else if (nxt2 - nxt1 < U16_MAX)
3114                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3115                                 else
3116                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3117                             }
3118                             nxt1 = nnxt;
3119                         }
3120 #endif
3121                         /* Optimize again: */
3122                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3123                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3124                     }
3125                     else
3126                         oscan->flags = 0;
3127                 }
3128                 else if ((OP(oscan) == CURLYX)
3129                          && (flags & SCF_WHILEM_VISITED_POS)
3130                          /* See the comment on a similar expression above.
3131                             However, this time it not a subexpression
3132                             we care about, but the expression itself. */
3133                          && (maxcount == REG_INFTY)
3134                          && data && ++data->whilem_c < 16) {
3135                     /* This stays as CURLYX, we can put the count/of pair. */
3136                     /* Find WHILEM (as in regexec.c) */
3137                     regnode *nxt = oscan + NEXT_OFF(oscan);
3138
3139                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3140                         nxt += ARG(nxt);
3141                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3142                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3143                 }
3144                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3145                     pars++;
3146                 if (flags & SCF_DO_SUBSTR) {
3147                     SV *last_str = NULL;
3148                     int counted = mincount != 0;
3149
3150                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3151 #if defined(SPARC64_GCC_WORKAROUND)
3152                         I32 b = 0;
3153                         STRLEN l = 0;
3154                         const char *s = NULL;
3155                         I32 old = 0;
3156
3157                         if (pos_before >= data->last_start_min)
3158                             b = pos_before;
3159                         else
3160                             b = data->last_start_min;
3161
3162                         l = 0;
3163                         s = SvPV_const(data->last_found, l);
3164                         old = b - data->last_start_min;
3165
3166 #else
3167                         I32 b = pos_before >= data->last_start_min
3168                             ? pos_before : data->last_start_min;
3169                         STRLEN l;
3170                         const char * const s = SvPV_const(data->last_found, l);
3171                         I32 old = b - data->last_start_min;
3172 #endif
3173
3174                         if (UTF)
3175                             old = utf8_hop((U8*)s, old) - (U8*)s;
3176
3177                         l -= old;
3178                         /* Get the added string: */
3179                         last_str = newSVpvn(s  + old, l);
3180                         if (UTF)
3181                             SvUTF8_on(last_str);
3182                         if (deltanext == 0 && pos_before == b) {
3183                             /* What was added is a constant string */
3184                             if (mincount > 1) {
3185                                 SvGROW(last_str, (mincount * l) + 1);
3186                                 repeatcpy(SvPVX(last_str) + l,
3187                                           SvPVX_const(last_str), l, mincount - 1);
3188                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3189                                 /* Add additional parts. */
3190                                 SvCUR_set(data->last_found,
3191                                           SvCUR(data->last_found) - l);
3192                                 sv_catsv(data->last_found, last_str);
3193                                 {
3194                                     SV * sv = data->last_found;
3195                                     MAGIC *mg =
3196                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3197                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3198                                     if (mg && mg->mg_len >= 0)
3199                                         mg->mg_len += CHR_SVLEN(last_str);
3200                                 }
3201                                 data->last_end += l * (mincount - 1);
3202                             }
3203                         } else {
3204                             /* start offset must point into the last copy */
3205                             data->last_start_min += minnext * (mincount - 1);
3206                             data->last_start_max += is_inf ? I32_MAX
3207                                 : (maxcount - 1) * (minnext + data->pos_delta);
3208                         }
3209                     }
3210                     /* It is counted once already... */
3211                     data->pos_min += minnext * (mincount - counted);
3212                     data->pos_delta += - counted * deltanext +
3213                         (minnext + deltanext) * maxcount - minnext * mincount;
3214                     if (mincount != maxcount) {
3215                          /* Cannot extend fixed substrings found inside
3216                             the group.  */
3217                         scan_commit(pRExC_state,data,minlenp);
3218                         if (mincount && last_str) {
3219                             SV * const sv = data->last_found;
3220                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3221                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3222
3223                             if (mg)
3224                                 mg->mg_len = -1;
3225                             sv_setsv(sv, last_str);
3226                             data->last_end = data->pos_min;
3227                             data->last_start_min =
3228                                 data->pos_min - CHR_SVLEN(last_str);
3229                             data->last_start_max = is_inf
3230                                 ? I32_MAX
3231                                 : data->pos_min + data->pos_delta
3232                                 - CHR_SVLEN(last_str);
3233                         }
3234                         data->longest = &(data->longest_float);
3235                     }
3236                     SvREFCNT_dec(last_str);
3237                 }
3238                 if (data && (fl & SF_HAS_EVAL))
3239                     data->flags |= SF_HAS_EVAL;
3240               optimize_curly_tail:
3241                 if (OP(oscan) != CURLYX) {
3242                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3243                            && NEXT_OFF(next))
3244                         NEXT_OFF(oscan) += NEXT_OFF(next);
3245                 }
3246                 continue;
3247             default:                    /* REF and CLUMP only? */
3248                 if (flags & SCF_DO_SUBSTR) {
3249                     scan_commit(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3250                     data->longest = &(data->longest_float);
3251                 }
3252                 is_inf = is_inf_internal = 1;
3253                 if (flags & SCF_DO_STCLASS_OR)
3254                     cl_anything(pRExC_state, data->start_class);
3255                 flags &= ~SCF_DO_STCLASS;
3256                 break;
3257             }
3258         }
3259         else if (strchr((const char*)PL_simple,OP(scan))) {
3260             int value = 0;
3261
3262             if (flags & SCF_DO_SUBSTR) {
3263                 scan_commit(pRExC_state,data,minlenp);
3264                 data->pos_min++;
3265             }
3266             min++;
3267             if (flags & SCF_DO_STCLASS) {
3268                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3269
3270                 /* Some of the logic below assumes that switching
3271                    locale on will only add false positives. */
3272                 switch (PL_regkind[OP(scan)]) {
3273                 case SANY:
3274                 default:
3275                   do_default:
3276                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3277                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3278                         cl_anything(pRExC_state, data->start_class);
3279                     break;
3280                 case REG_ANY:
3281                     if (OP(scan) == SANY)
3282                         goto do_default;
3283                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3284                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3285                                  || (data->start_class->flags & ANYOF_CLASS));
3286                         cl_anything(pRExC_state, data->start_class);
3287                     }
3288                     if (flags & SCF_DO_STCLASS_AND || !value)
3289                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3290                     break;
3291                 case ANYOF:
3292                     if (flags & SCF_DO_STCLASS_AND)
3293                         cl_and(data->start_class,
3294                                (struct regnode_charclass_class*)scan);
3295                     else
3296                         cl_or(pRExC_state, data->start_class,
3297                               (struct regnode_charclass_class*)scan);
3298                     break;
3299                 case ALNUM:
3300                     if (flags & SCF_DO_STCLASS_AND) {
3301                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3302                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3303                             for (value = 0; value < 256; value++)
3304                                 if (!isALNUM(value))
3305                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3306                         }
3307                     }
3308                     else {
3309                         if (data->start_class->flags & ANYOF_LOCALE)
3310                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3311                         else {
3312                             for (value = 0; value < 256; value++)
3313                                 if (isALNUM(value))
3314                                     ANYOF_BITMAP_SET(data->start_class, value);
3315                         }
3316                     }
3317                     break;
3318                 case ALNUML:
3319                     if (flags & SCF_DO_STCLASS_AND) {
3320                         if (data->start_class->flags & ANYOF_LOCALE)
3321                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3322                     }
3323                     else {
3324                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3325                         data->start_class->flags |= ANYOF_LOCALE;
3326                     }
3327                     break;
3328                 case NALNUM:
3329                     if (flags & SCF_DO_STCLASS_AND) {
3330                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3331                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3332                             for (value = 0; value < 256; value++)
3333                                 if (isALNUM(value))
3334                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3335                         }
3336                     }
3337                     else {
3338                         if (data->start_class->flags & ANYOF_LOCALE)
3339                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3340                         else {
3341                             for (value = 0; value < 256; value++)
3342                                 if (!isALNUM(value))
3343                                     ANYOF_BITMAP_SET(data->start_class, value);
3344                         }
3345                     }
3346                     break;
3347                 case NALNUML:
3348                     if (flags & SCF_DO_STCLASS_AND) {
3349                         if (data->start_class->flags & ANYOF_LOCALE)
3350                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3351                     }
3352                     else {
3353                         data->start_class->flags |= ANYOF_LOCALE;
3354                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3355                     }
3356                     break;
3357                 case SPACE:
3358                     if (flags & SCF_DO_STCLASS_AND) {
3359                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3360                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3361                             for (value = 0; value < 256; value++)
3362                                 if (!isSPACE(value))
3363                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3364                         }
3365                     }
3366                     else {
3367                         if (data->start_class->flags & ANYOF_LOCALE)
3368                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3369                         else {
3370                             for (value = 0; value < 256; value++)
3371                                 if (isSPACE(value))
3372                                     ANYOF_BITMAP_SET(data->start_class, value);
3373                         }
3374                     }
3375                     break;
3376                 case SPACEL:
3377                     if (flags & SCF_DO_STCLASS_AND) {
3378                         if (data->start_class->flags & ANYOF_LOCALE)
3379                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3380                     }
3381                     else {
3382                         data->start_class->flags |= ANYOF_LOCALE;
3383                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3384                     }
3385                     break;
3386                 case NSPACE:
3387                     if (flags & SCF_DO_STCLASS_AND) {
3388                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3389                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3390                             for (value = 0; value < 256; value++)
3391                                 if (isSPACE(value))
3392                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3393                         }
3394                     }
3395                     else {
3396                         if (data->start_class->flags & ANYOF_LOCALE)
3397                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3398                         else {
3399                             for (value = 0; value < 256; value++)
3400                                 if (!isSPACE(value))
3401                                     ANYOF_BITMAP_SET(data->start_class, value);
3402                         }
3403                     }
3404                     break;
3405                 case NSPACEL:
3406                     if (flags & SCF_DO_STCLASS_AND) {
3407                         if (data->start_class->flags & ANYOF_LOCALE) {
3408                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3409                             for (value = 0; value < 256; value++)
3410                                 if (!isSPACE(value))
3411                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3412                         }
3413                     }
3414                     else {
3415                         data->start_class->flags |= ANYOF_LOCALE;
3416                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3417                     }
3418                     break;
3419                 case DIGIT:
3420                     if (flags & SCF_DO_STCLASS_AND) {
3421                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3422                         for (value = 0; value < 256; value++)
3423                             if (!isDIGIT(value))
3424                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3425                     }
3426                     else {
3427                         if (data->start_class->flags & ANYOF_LOCALE)
3428                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3429                         else {
3430                             for (value = 0; value < 256; value++)
3431                                 if (isDIGIT(value))
3432                                     ANYOF_BITMAP_SET(data->start_class, value);
3433                         }
3434                     }
3435                     break;
3436                 case NDIGIT:
3437                     if (flags & SCF_DO_STCLASS_AND) {
3438                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3439                         for (value = 0; value < 256; value++)
3440                             if (isDIGIT(value))
3441                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3442                     }
3443                     else {
3444                         if (data->start_class->flags & ANYOF_LOCALE)
3445                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3446                         else {
3447                             for (value = 0; value < 256; value++)
3448                                 if (!isDIGIT(value))
3449                                     ANYOF_BITMAP_SET(data->start_class, value);
3450                         }
3451                     }
3452                     break;
3453                 }
3454                 if (flags & SCF_DO_STCLASS_OR)
3455                     cl_and(data->start_class, and_withp);
3456                 flags &= ~SCF_DO_STCLASS;
3457             }
3458         }
3459         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3460             data->flags |= (OP(scan) == MEOL
3461                             ? SF_BEFORE_MEOL
3462                             : SF_BEFORE_SEOL);
3463         }
3464         else if (  PL_regkind[OP(scan)] == BRANCHJ
3465                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3466                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3467                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3468             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3469                 || OP(scan) == UNLESSM )
3470             {
3471                 /* Negative Lookahead/lookbehind
3472                    In this case we can't do fixed string optimisation.
3473                 */
3474
3475                 I32 deltanext, minnext, fake = 0;
3476                 regnode *nscan;
3477                 struct regnode_charclass_class intrnl;
3478                 int f = 0;
3479
3480                 data_fake.flags = 0;
3481                 if (data) {
3482                     data_fake.whilem_c = data->whilem_c;
3483                     data_fake.last_closep = data->last_closep;
3484                 }
3485                 else
3486                     data_fake.last_closep = &fake;
3487                 data_fake.pos_delta = delta;
3488                 if ( flags & SCF_DO_STCLASS && !scan->flags
3489                      && OP(scan) == IFMATCH ) { /* Lookahead */
3490                     cl_init(pRExC_state, &intrnl);
3491                     data_fake.start_class = &intrnl;
3492                     f |= SCF_DO_STCLASS_AND;
3493                 }
3494                 if (flags & SCF_WHILEM_VISITED_POS)
3495                     f |= SCF_WHILEM_VISITED_POS;
3496                 next = regnext(scan);
3497                 nscan = NEXTOPER(NEXTOPER(scan));
3498                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3499                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3500                 if (scan->flags) {
3501                     if (deltanext) {
3502                         FAIL("Variable length lookbehind not implemented");
3503                     }
3504                     else if (minnext > (I32)U8_MAX) {
3505                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3506                     }
3507                     scan->flags = (U8)minnext;
3508                 }
3509                 if (data) {
3510                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3511                         pars++;
3512                     if (data_fake.flags & SF_HAS_EVAL)
3513                         data->flags |= SF_HAS_EVAL;
3514                     data->whilem_c = data_fake.whilem_c;
3515                 }
3516                 if (f & SCF_DO_STCLASS_AND) {
3517                     const int was = (data->start_class->flags & ANYOF_EOS);
3518
3519                     cl_and(data->start_class, &intrnl);
3520                     if (was)
3521                         data->start_class->flags |= ANYOF_EOS;
3522                 }
3523             }
3524 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3525             else {
3526                 /* Positive Lookahead/lookbehind
3527                    In this case we can do fixed string optimisation,
3528                    but we must be careful about it. Note in the case of
3529                    lookbehind the positions will be offset by the minimum
3530                    length of the pattern, something we won't know about
3531                    until after the recurse.
3532                 */
3533                 I32 deltanext, fake = 0;
3534                 regnode *nscan;
3535                 struct regnode_charclass_class intrnl;
3536                 int f = 0;
3537                 /* We use SAVEFREEPV so that when the full compile
3538                     is finished perl will clean up the allocated
3539                     minlens when its all done. This was we don't
3540                     have to worry about freeing them when we know
3541                     they wont be used, which would be a pain.
3542                  */
3543                 I32 *minnextp;
3544                 Newx( minnextp, 1, I32 );
3545                 SAVEFREEPV(minnextp);
3546
3547                 if (data) {
3548                     StructCopy(data, &data_fake, scan_data_t);
3549                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3550                         f |= SCF_DO_SUBSTR;
3551                         if (scan->flags)
3552                             scan_commit(pRExC_state, &data_fake,minlenp);
3553                         data_fake.last_found=newSVsv(data->last_found);
3554                     }
3555                 }
3556                 else
3557                     data_fake.last_closep = &fake;
3558                 data_fake.flags = 0;
3559                 data_fake.pos_delta = delta;
3560                 if (is_inf)
3561                     data_fake.flags |= SF_IS_INF;
3562                 if ( flags & SCF_DO_STCLASS && !scan->flags
3563                      && OP(scan) == IFMATCH ) { /* Lookahead */
3564                     cl_init(pRExC_state, &intrnl);
3565                     data_fake.start_class = &intrnl;
3566                     f |= SCF_DO_STCLASS_AND;
3567                 }
3568                 if (flags & SCF_WHILEM_VISITED_POS)
3569                     f |= SCF_WHILEM_VISITED_POS;
3570                 next = regnext(scan);
3571                 nscan = NEXTOPER(NEXTOPER(scan));
3572
3573                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3574                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3575                 if (scan->flags) {
3576                     if (deltanext) {
3577                         FAIL("Variable length lookbehind not implemented");
3578                     }
3579                     else if (*minnextp > (I32)U8_MAX) {
3580                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3581                     }
3582                     scan->flags = (U8)*minnextp;
3583                 }
3584
3585                 *minnextp += min;
3586
3587                 if (f & SCF_DO_STCLASS_AND) {
3588                     const int was = (data->start_class->flags & ANYOF_EOS);
3589
3590                     cl_and(data->start_class, &intrnl);
3591                     if (was)
3592                         data->start_class->flags |= ANYOF_EOS;
3593                 }
3594                 if (data) {
3595                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3596                         pars++;
3597                     if (data_fake.flags & SF_HAS_EVAL)
3598                         data->flags |= SF_HAS_EVAL;
3599                     data->whilem_c = data_fake.whilem_c;
3600                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3601                         if (RExC_rx->minlen<*minnextp)
3602                             RExC_rx->minlen=*minnextp;
3603                         scan_commit(pRExC_state, &data_fake, minnextp);
3604                         SvREFCNT_dec(data_fake.last_found);
3605
3606                         if ( data_fake.minlen_fixed != minlenp )
3607                         {
3608                             data->offset_fixed= data_fake.offset_fixed;
3609                             data->minlen_fixed= data_fake.minlen_fixed;
3610                             data->lookbehind_fixed+= scan->flags;
3611                         }
3612                         if ( data_fake.minlen_float != minlenp )
3613                         {
3614                             data->minlen_float= data_fake.minlen_float;
3615                             data->offset_float_min=data_fake.offset_float_min;
3616                             data->offset_float_max=data_fake.offset_float_max;
3617                             data->lookbehind_float+= scan->flags;
3618                         }
3619                     }
3620                 }
3621
3622
3623             }
3624 #endif
3625         }
3626         else if (OP(scan) == OPEN) {
3627             if (stopparen != (I32)ARG(scan))
3628                 pars++;
3629         }
3630         else if (OP(scan) == CLOSE) {
3631             if (stopparen == (I32)ARG(scan)) {
3632                 break;
3633             }
3634             if ((I32)ARG(scan) == is_par) {
3635                 next = regnext(scan);
3636
3637                 if ( next && (OP(next) != WHILEM) && next < last)
3638                     is_par = 0;         /* Disable optimization */
3639             }
3640             if (data)
3641                 *(data->last_closep) = ARG(scan);
3642         }
3643         else if (OP(scan) == EVAL) {
3644                 if (data)
3645                     data->flags |= SF_HAS_EVAL;
3646         }
3647         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3648             if (flags & SCF_DO_SUBSTR) {
3649                 scan_commit(pRExC_state,data,minlenp);
3650                 flags &= ~SCF_DO_SUBSTR;
3651             }
3652             if (data && OP(scan)==ACCEPT) {
3653                 data->flags |= SCF_SEEN_ACCEPT;
3654                 if (stopmin > min)
3655                     stopmin = min;
3656             }
3657         }
3658         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3659         {
3660                 if (flags & SCF_DO_SUBSTR) {
3661                     scan_commit(pRExC_state,data,minlenp);
3662                     data->longest = &(data->longest_float);
3663                 }
3664                 is_inf = is_inf_internal = 1;
3665                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3666                     cl_anything(pRExC_state, data->start_class);
3667                 flags &= ~SCF_DO_STCLASS;
3668         }
3669         else if (OP(scan) == GPOS) {
3670             if (!(RExC_rx->reganch & ROPT_GPOS_FLOAT) &&
3671                 !(delta || is_inf || (data && data->pos_delta)))
3672             {
3673                 if (!(RExC_rx->reganch & ROPT_ANCH) && (flags & SCF_DO_SUBSTR))
3674                     RExC_rx->reganch |= ROPT_ANCH_GPOS;
3675                 if (RExC_rx->gofs < (U32)min)
3676                     RExC_rx->gofs = min;
3677             } else {
3678                 RExC_rx->reganch |= ROPT_GPOS_FLOAT;
3679                 RExC_rx->gofs = 0;
3680             }
3681         }
3682 #ifdef TRIE_STUDY_OPT
3683 #ifdef FULL_TRIE_STUDY
3684         else if (PL_regkind[OP(scan)] == TRIE) {
3685             /* NOTE - There is similar code to this block above for handling
3686                BRANCH nodes on the initial study.  If you change stuff here
3687                check there too. */
3688             regnode *trie_node= scan;
3689             regnode *tail= regnext(scan);
3690             reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3691             I32 max1 = 0, min1 = I32_MAX;
3692             struct regnode_charclass_class accum;
3693
3694             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3695                 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3696             if (flags & SCF_DO_STCLASS)
3697                 cl_init_zero(pRExC_state, &accum);
3698
3699             if (!trie->jump) {
3700                 min1= trie->minlen;
3701                 max1= trie->maxlen;
3702             } else {
3703                 const regnode *nextbranch= NULL;
3704                 U32 word;
3705
3706                 for ( word=1 ; word <= trie->wordcount ; word++)
3707                 {
3708                     I32 deltanext=0, minnext=0, f = 0, fake;
3709                     struct regnode_charclass_class this_class;
3710
3711                     data_fake.flags = 0;
3712                     if (data) {
3713                         data_fake.whilem_c = data->whilem_c;
3714                         data_fake.last_closep = data->last_closep;
3715                     }
3716                     else
3717                         data_fake.last_closep = &fake;
3718                     data_fake.pos_delta = delta;
3719                     if (flags & SCF_DO_STCLASS) {
3720                         cl_init(pRExC_state, &this_class);
3721                         data_fake.start_class = &this_class;
3722                         f = SCF_DO_STCLASS_AND;
3723                     }
3724                     if (flags & SCF_WHILEM_VISITED_POS)
3725                         f |= SCF_WHILEM_VISITED_POS;
3726
3727                     if (trie->jump[word]) {
3728                         if (!nextbranch)
3729                             nextbranch = trie_node + trie->jump[0];
3730                         scan= trie_node + trie->jump[word];
3731                         /* We go from the jump point to the branch that follows
3732                            it. Note this means we need the vestigal unused branches
3733                            even though they arent otherwise used.
3734                          */
3735                         minnext = study_chunk(pRExC_state, &scan, minlenp,
3736                             &deltanext, (regnode *)nextbranch, &data_fake,
3737                             stopparen, recursed, NULL, f,depth+1);
3738                     }
3739                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3740                         nextbranch= regnext((regnode*)nextbranch);
3741
3742                     if (min1 > (I32)(minnext + trie->minlen))
3743                         min1 = minnext + trie->minlen;
3744                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3745                         max1 = minnext + deltanext + trie->maxlen;
3746                     if (deltanext == I32_MAX)
3747                         is_inf = is_inf_internal = 1;
3748
3749                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3750                         pars++;
3751                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3752                         if ( stopmin > min + min1)
3753                             stopmin = min + min1;
3754                         flags &= ~SCF_DO_SUBSTR;
3755                         if (data)
3756                             data->flags |= SCF_SEEN_ACCEPT;
3757                     }
3758                     if (data) {
3759                         if (data_fake.flags & SF_HAS_EVAL)
3760                             data->flags |= SF_HAS_EVAL;
3761                         data->whilem_c = data_fake.whilem_c;
3762                     }
3763                     if (flags & SCF_DO_STCLASS)
3764                         cl_or(pRExC_state, &accum, &this_class);
3765                 }
3766             }
3767             if (flags & SCF_DO_SUBSTR) {
3768                 data->pos_min += min1;
3769                 data->pos_delta += max1 - min1;
3770                 if (max1 != min1 || is_inf)
3771                     data->longest = &(data->longest_float);
3772             }
3773             min += min1;
3774             delta += max1 - min1;
3775             if (flags & SCF_DO_STCLASS_OR) {
3776                 cl_or(pRExC_state, data->start_class, &accum);
3777                 if (min1) {
3778                     cl_and(data->start_class, and_withp);
3779                     flags &= ~SCF_DO_STCLASS;
3780                 }
3781             }
3782             else if (flags & SCF_DO_STCLASS_AND) {
3783                 if (min1) {
3784                     cl_and(data->start_class, &accum);
3785                     flags &= ~SCF_DO_STCLASS;
3786                 }
3787                 else {
3788                     /* Switch to OR mode: cache the old value of
3789                      * data->start_class */
3790                     INIT_AND_WITHP;
3791                     StructCopy(data->start_class, and_withp,
3792                                struct regnode_charclass_class);
3793                     flags &= ~SCF_DO_STCLASS_AND;
3794                     StructCopy(&accum, data->start_class,
3795                                struct regnode_charclass_class);
3796                     flags |= SCF_DO_STCLASS_OR;
3797                     data->start_class->flags |= ANYOF_EOS;
3798                 }
3799             }
3800             scan= tail;
3801             continue;
3802         }
3803 #else
3804         else if (PL_regkind[OP(scan)] == TRIE) {
3805             reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3806             U8*bang=NULL;
3807
3808             min += trie->minlen;
3809             delta += (trie->maxlen - trie->minlen);
3810             flags &= ~SCF_DO_STCLASS; /* xxx */
3811             if (flags & SCF_DO_SUBSTR) {
3812                 scan_commit(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3813                 data->pos_min += trie->minlen;
3814                 data->pos_delta += (trie->maxlen - trie->minlen);
3815                 if (trie->maxlen != trie->minlen)
3816                     data->longest = &(data->longest_float);
3817             }
3818             if (trie->jump) /* no more substrings -- for now /grr*/
3819                 flags &= ~SCF_DO_SUBSTR;
3820         }
3821 #endif /* old or new */
3822 #endif /* TRIE_STUDY_OPT */
3823         /* Else: zero-length, ignore. */
3824         scan = regnext(scan);
3825     }
3826     if (frame) {
3827         last = frame->last;
3828         scan = frame->next;
3829         stopparen = frame->stop;
3830         frame = frame->prev;
3831         goto fake_study_recurse;
3832     }
3833
3834   finish:
3835     assert(!frame);
3836
3837     *scanp = scan;
3838     *deltap = is_inf_internal ? I32_MAX : delta;
3839     if (flags & SCF_DO_SUBSTR && is_inf)
3840         data->pos_delta = I32_MAX - data->pos_min;
3841     if (is_par > (I32)U8_MAX)
3842         is_par = 0;
3843     if (is_par && pars==1 && data) {
3844         data->flags |= SF_IN_PAR;
3845         data->flags &= ~SF_HAS_PAR;
3846     }
3847     else if (pars && data) {
3848         data->flags |= SF_HAS_PAR;
3849         data->flags &= ~SF_IN_PAR;
3850     }
3851     if (flags & SCF_DO_STCLASS_OR)
3852         cl_and(data->start_class, and_withp);
3853     if (flags & SCF_TRIE_RESTUDY)
3854         data->flags |=  SCF_TRIE_RESTUDY;
3855
3856     DEBUG_STUDYDATA(data,depth);
3857
3858     return min < stopmin ? min : stopmin;
3859 }
3860
3861 STATIC I32
3862 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3863 {
3864     if (RExC_rx->data) {
3865         const U32 count = RExC_rx->data->count;
3866         Renewc(RExC_rx->data,
3867                sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
3868                char, struct reg_data);
3869         Renew(RExC_rx->data->what, count + n, U8);
3870         RExC_rx->data->count += n;
3871     }
3872     else {
3873         Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3874              char, struct reg_data);
3875         Newx(RExC_rx->data->what, n, U8);
3876         RExC_rx->data->count = n;
3877     }
3878     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3879     return RExC_rx->data->count - n;
3880 }
3881
3882 #ifndef PERL_IN_XSUB_RE
3883 void
3884 Perl_reginitcolors(pTHX)
3885 {
3886     dVAR;
3887     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3888     if (s) {
3889         char *t = savepv(s);
3890         int i = 0;
3891         PL_colors[0] = t;
3892         while (++i < 6) {
3893             t = strchr(t, '\t');
3894             if (t) {
3895                 *t = '\0';
3896                 PL_colors[i] = ++t;
3897             }
3898             else
3899                 PL_colors[i] = t = (char *)"";
3900         }
3901     } else {
3902         int i = 0;
3903         while (i < 6)
3904             PL_colors[i++] = (char *)"";
3905     }
3906     PL_colorset = 1;
3907 }
3908 #endif
3909
3910
3911 #ifdef TRIE_STUDY_OPT
3912 #define CHECK_RESTUDY_GOTO                                  \
3913         if (                                                \
3914               (data.flags & SCF_TRIE_RESTUDY)               \
3915               && ! restudied++                              \
3916         )     goto reStudy
3917 #else
3918 #define CHECK_RESTUDY_GOTO
3919 #endif
3920
3921 /*
3922  - pregcomp - compile a regular expression into internal code
3923  *
3924  * We can't allocate space until we know how big the compiled form will be,
3925  * but we can't compile it (and thus know how big it is) until we've got a
3926  * place to put the code.  So we cheat:  we compile it twice, once with code
3927  * generation turned off and size counting turned on, and once "for real".
3928  * This also means that we don't allocate space until we are sure that the
3929  * thing really will compile successfully, and we never have to move the
3930  * code and thus invalidate pointers into it.  (Note that it has to be in
3931  * one piece because free() must be able to free it all.) [NB: not true in perl]
3932  *
3933  * Beware that the optimization-preparation code in here knows about some
3934  * of the structure of the compiled regexp.  [I'll say.]
3935  */
3936
3937
3938
3939 #ifndef PERL_IN_XSUB_RE
3940 #define RE_ENGINE_PTR &PL_core_reg_engine
3941 #else
3942 extern const struct regexp_engine my_reg_engine;
3943 #define RE_ENGINE_PTR &my_reg_engine
3944 #endif
3945 /* these make a few things look better, to avoid indentation */
3946 #define BEGIN_BLOCK {
3947 #define END_BLOCK }
3948
3949 regexp *
3950 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3951 {
3952     dVAR;
3953     GET_RE_DEBUG_FLAGS_DECL;
3954     DEBUG_r(if (!PL_colorset) reginitcolors());
3955 #ifndef PERL_IN_XSUB_RE
3956     BEGIN_BLOCK
3957     /* Dispatch a request to compile a regexp to correct
3958        regexp engine. */
3959     HV * const table = GvHV(PL_hintgv);
3960     if (table) {
3961         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3962         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3963             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3964             DEBUG_COMPILE_r({
3965                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3966                     SvIV(*ptr));
3967             });
3968             return CALLREGCOMP_ENG(eng, exp, xend, pm);
3969         }
3970     }
3971     END_BLOCK
3972 #endif
3973     BEGIN_BLOCK
3974     register regexp *r;
3975     regnode *scan;
3976     regnode *first;
3977     I32 flags;
3978     I32 minlen = 0;
3979     I32 sawplus = 0;
3980     I32 sawopen = 0;
3981     scan_data_t data;
3982     RExC_state_t RExC_state;
3983     RExC_state_t * const pRExC_state = &RExC_state;
3984 #ifdef TRIE_STUDY_OPT
3985     int restudied= 0;
3986     RExC_state_t copyRExC_state;
3987 #endif
3988     if (exp == NULL)
3989         FAIL("NULL regexp argument");
3990
3991     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3992
3993     RExC_precomp = exp;
3994     DEBUG_COMPILE_r({
3995         SV *dsv= sv_newmortal();
3996         RE_PV_QUOTED_DECL(s, RExC_utf8,
3997             dsv, RExC_precomp, (xend - exp), 60);
3998         PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3999                        PL_colors[4],PL_colors[5],s);
4000     });
4001     RExC_flags = pm->op_pmflags;
4002     RExC_sawback = 0;
4003
4004     RExC_seen = 0;
4005     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4006     RExC_seen_evals = 0;
4007     RExC_extralen = 0;
4008
4009     /* First pass: determine size, legality. */
4010     RExC_parse = exp;
4011     RExC_start = exp;
4012     RExC_end = xend;
4013     RExC_naughty = 0;
4014     RExC_npar = 1;
4015     RExC_cpar = 1;
4016     RExC_nestroot = 0;
4017     RExC_size = 0L;
4018     RExC_emit = &PL_regdummy;
4019     RExC_whilem_seen = 0;
4020     RExC_charnames = NULL;
4021     RExC_open_parens = NULL;
4022     RExC_close_parens = NULL;
4023     RExC_opend = NULL;
4024     RExC_paren_names = NULL;
4025     RExC_recurse = NULL;
4026     RExC_recurse_count = 0;
4027
4028 #if 0 /* REGC() is (currently) a NOP at the first pass.
4029        * Clever compilers notice this and complain. --jhi */
4030     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4031 #endif
4032     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4033     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4034         RExC_precomp = NULL;
4035         return(NULL);
4036     }
4037     DEBUG_PARSE_r({
4038         PerlIO_printf(Perl_debug_log,
4039             "Required size %"IVdf" nodes\n"
4040             "Starting second pass (creation)\n",
4041             (IV)RExC_size);
4042         RExC_lastnum=0;
4043         RExC_lastparse=NULL;
4044     });
4045     /* Small enough for pointer-storage convention?
4046        If extralen==0, this means that we will not need long jumps. */
4047     if (RExC_size >= 0x10000L && RExC_extralen)
4048         RExC_size += RExC_extralen;
4049     else
4050         RExC_extralen = 0;
4051     if (RExC_whilem_seen > 15)
4052         RExC_whilem_seen = 15;
4053
4054 #ifdef DEBUGGING
4055     /* Make room for a sentinel value at the end of the program */
4056     RExC_size++;
4057 #endif
4058
4059     /* Allocate space and zero-initialize. Note, the two step process
4060        of zeroing when in debug mode, thus anything assigned has to
4061        happen after that */
4062     Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
4063          char, regexp);
4064     if (r == NULL)
4065         FAIL("Regexp out of space");
4066 #ifdef DEBUGGING
4067     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4068     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
4069 #else
4070     /* bulk initialize fields with 0. */
4071     Zero(r, sizeof(regexp), char);
4072 #endif
4073
4074     /* non-zero initialization begins here */
4075     r->engine= RE_ENGINE_PTR;
4076     r->refcnt = 1;
4077     r->prelen = xend - exp;
4078     r->precomp = savepvn(RExC_precomp, r->prelen);
4079     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
4080     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4081
4082     if (RExC_seen & REG_SEEN_RECURSE) {
4083         Newxz(RExC_open_parens, RExC_npar,regnode *);
4084         SAVEFREEPV(RExC_open_parens);
4085         Newxz(RExC_close_parens,RExC_npar,regnode *);
4086         SAVEFREEPV(RExC_close_parens);
4087     }
4088
4089     /* Useful during FAIL. */
4090     Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4091     if (r->offsets) {
4092         r->offsets[0] = RExC_size;
4093     }
4094     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4095                           "%s %"UVuf" bytes for offset annotations.\n",
4096                           r->offsets ? "Got" : "Couldn't get",
4097                           (UV)((2*RExC_size+1) * sizeof(U32))));
4098
4099     RExC_rx = r;
4100
4101     /* Second pass: emit code. */
4102     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
4103     RExC_parse = exp;
4104     RExC_end = xend;
4105     RExC_naughty = 0;
4106     RExC_npar = 1;
4107     RExC_cpar = 1;
4108     RExC_emit_start = r->program;
4109     RExC_emit = r->program;
4110 #ifdef DEBUGGING
4111     /* put a sentinal on the end of the program so we can check for
4112        overwrites */
4113     r->program[RExC_size].type = 255;
4114 #endif
4115     /* Store the count of eval-groups for security checks: */
4116     RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
4117     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4118     r->data = 0;
4119     if (reg(pRExC_state, 0, &flags,1) == NULL)
4120         return(NULL);
4121
4122     /* XXXX To minimize changes to RE engine we always allocate
4123        3-units-long substrs field. */
4124     Newx(r->substrs, 1, struct reg_substr_data);
4125     if (RExC_recurse_count) {
4126         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4127         SAVEFREEPV(RExC_recurse);
4128     }
4129
4130 reStudy:
4131     r->minlen = minlen = sawplus = sawopen = 0;
4132     Zero(r->substrs, 1, struct reg_substr_data);
4133
4134 #ifdef TRIE_STUDY_OPT
4135     if ( restudied ) {
4136         U32 seen=RExC_seen;
4137         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4138
4139         RExC_state = copyRExC_state;
4140         if (seen & REG_TOP_LEVEL_BRANCHES)
4141             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4142         else
4143             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4144         if (data.last_found) {
4145             SvREFCNT_dec(data.longest_fixed);
4146             SvREFCNT_dec(data.longest_float);
4147             SvREFCNT_dec(data.last_found);
4148         }
4149         StructCopy(&zero_scan_data, &data, scan_data_t);
4150     } else {
4151         StructCopy(&zero_scan_data, &data, scan_data_t);
4152         copyRExC_state = RExC_state;
4153     }
4154 #else
4155     StructCopy(&zero_scan_data, &data, scan_data_t);
4156 #endif
4157
4158     /* Dig out information for optimizations. */
4159     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
4160     pm->op_pmflags = RExC_flags;
4161     if (UTF)
4162         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
4163     r->regstclass = NULL;
4164     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4165         r->reganch |= ROPT_NAUGHTY;
4166     scan = r->program + 1;              /* First BRANCH. */
4167
4168     /* testing for BRANCH here tells us whether there is "must appear"
4169        data in the pattern. If there is then we can use it for optimisations */
4170     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4171         I32 fake;
4172         STRLEN longest_float_length, longest_fixed_length;
4173         struct regnode_charclass_class ch_class; /* pointed to by data */
4174         int stclass_flag;
4175         I32 last_close = 0; /* pointed to by data */
4176
4177         first = scan;
4178         /* Skip introductions and multiplicators >= 1. */
4179         while ((OP(first) == OPEN && (sawopen = 1)) ||
4180                /* An OR of *one* alternative - should not happen now. */
4181             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4182             /* for now we can't handle lookbehind IFMATCH*/
4183             (OP(first) == IFMATCH && !first->flags) ||
4184             (OP(first) == PLUS) ||
4185             (OP(first) == MINMOD) ||
4186                /* An {n,m} with n>0 */
4187             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4188         {
4189
4190                 if (OP(first) == PLUS)
4191                     sawplus = 1;
4192                 else
4193                     first += regarglen[OP(first)];
4194                 if (OP(first) == IFMATCH) {
4195                     first = NEXTOPER(first);
4196                     first += EXTRA_STEP_2ARGS;
4197                 } else  /* XXX possible optimisation for /(?=)/  */
4198                     first = NEXTOPER(first);
4199         }
4200
4201         /* Starting-point info. */
4202       again:
4203         DEBUG_PEEP("first:",first,0);
4204         /* Ignore EXACT as we deal with it later. */
4205         if (PL_regkind[OP(first)] == EXACT) {
4206             if (OP(first) == EXACT)
4207                 NOOP;   /* Empty, get anchored substr later. */
4208             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4209                 r->regstclass = first;
4210         }
4211 #ifdef TRIE_STCLASS
4212         else if (PL_regkind[OP(first)] == TRIE &&
4213                 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4214         {
4215             regnode *trie_op;
4216             /* this can happen only on restudy */
4217             if ( OP(first) == TRIE ) {
4218                 struct regnode_1 *trieop;
4219                 Newxz(trieop,1,struct regnode_1);
4220                 StructCopy(first,trieop,struct regnode_1);
4221                 trie_op=(regnode *)trieop;
4222             } else {
4223                 struct regnode_charclass *trieop;
4224                 Newxz(trieop,1,struct regnode_charclass);
4225                 StructCopy(first,trieop,struct regnode_charclass);
4226                 trie_op=(regnode *)trieop;
4227             }
4228             OP(trie_op)+=2;
4229             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4230             r->regstclass = trie_op;
4231         }
4232 #endif
4233         else if (strchr((const char*)PL_simple,OP(first)))
4234             r->regstclass = first;
4235         else if (PL_regkind[OP(first)] == BOUND ||
4236                  PL_regkind[OP(first)] == NBOUND)
4237             r->regstclass = first;
4238         else if (PL_regkind[OP(first)] == BOL) {
4239             r->reganch |= (OP(first) == MBOL
4240                            ? ROPT_ANCH_MBOL
4241                            : (OP(first) == SBOL
4242                               ? ROPT_ANCH_SBOL
4243                               : ROPT_ANCH_BOL));
4244             first = NEXTOPER(first);
4245             goto again;
4246         }
4247         else if (OP(first) == GPOS) {
4248             r->reganch |= ROPT_ANCH_GPOS;
4249             first = NEXTOPER(first);
4250             goto again;
4251         }
4252         else if ((!sawopen || !RExC_sawback) &&
4253             (OP(first) == STAR &&
4254             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4255             !(r->reganch & ROPT_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4256         {
4257             /* turn .* into ^.* with an implied $*=1 */
4258             const int type =
4259                 (OP(NEXTOPER(first)) == REG_ANY)
4260                     ? ROPT_ANCH_MBOL
4261                     : ROPT_ANCH_SBOL;
4262             r->reganch |= type | ROPT_IMPLICIT;
4263             first = NEXTOPER(first);
4264             goto again;
4265         }
4266         if (sawplus && (!sawopen || !RExC_sawback)
4267             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4268             /* x+ must match at the 1st pos of run of x's */
4269             r->reganch |= ROPT_SKIP;
4270
4271         /* Scan is after the zeroth branch, first is atomic matcher. */
4272 #ifdef TRIE_STUDY_OPT
4273         DEBUG_PARSE_r(
4274             if (!restudied)
4275                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4276                               (IV)(first - scan + 1))
4277         );
4278 #else
4279         DEBUG_PARSE_r(
4280             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4281                 (IV)(first - scan + 1))
4282         );
4283 #endif
4284
4285
4286         /*
4287         * If there's something expensive in the r.e., find the
4288         * longest literal string that must appear and make it the
4289         * regmust.  Resolve ties in favor of later strings, since
4290         * the regstart check works with the beginning of the r.e.
4291         * and avoiding duplication strengthens checking.  Not a
4292         * strong reason, but sufficient in the absence of others.
4293         * [Now we resolve ties in favor of the earlier string if
4294         * it happens that c_offset_min has been invalidated, since the
4295         * earlier string may buy us something the later one won't.]
4296         */
4297
4298         data.longest_fixed = newSVpvs("");
4299         data.longest_float = newSVpvs("");
4300         data.last_found = newSVpvs("");
4301         data.longest = &(data.longest_fixed);
4302         first = scan;
4303         if (!r->regstclass) {
4304             cl_init(pRExC_state, &ch_class);
4305             data.start_class = &ch_class;
4306             stclass_flag = SCF_DO_STCLASS_AND;
4307         } else                          /* XXXX Check for BOUND? */
4308             stclass_flag = 0;
4309         data.last_closep = &last_close;
4310
4311         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4312             &data, -1, NULL, NULL,
4313             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4314
4315
4316         CHECK_RESTUDY_GOTO;
4317
4318
4319         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4320              && data.last_start_min == 0 && data.last_end > 0
4321              && !RExC_seen_zerolen
4322              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
4323             r->reganch |= ROPT_CHECK_ALL;
4324         scan_commit(pRExC_state, &data,&minlen);
4325         SvREFCNT_dec(data.last_found);
4326
4327         /* Note that code very similar to this but for anchored string
4328            follows immediately below, changes may need to be made to both.
4329            Be careful.
4330          */
4331         longest_float_length = CHR_SVLEN(data.longest_float);
4332         if (longest_float_length
4333             || (data.flags & SF_FL_BEFORE_EOL
4334                 && (!(data.flags & SF_FL_BEFORE_MEOL)
4335                     || (RExC_flags & PMf_MULTILINE))))
4336         {
4337             I32 t,ml;
4338
4339             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
4340                 && data.offset_fixed == data.offset_float_min
4341                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4342                     goto remove_float;          /* As in (a)+. */
4343
4344             /* copy the information about the longest float from the reg_scan_data
4345                over to the program. */
4346             if (SvUTF8(data.longest_float)) {
4347                 r->float_utf8 = data.longest_float;
4348                 r->float_substr = NULL;
4349             } else {
4350                 r->float_substr = data.longest_float;
4351                 r->float_utf8 = NULL;
4352             }
4353             /* float_end_shift is how many chars that must be matched that
4354                follow this item. We calculate it ahead of time as once the
4355                lookbehind offset is added in we lose the ability to correctly
4356                calculate it.*/
4357             ml = data.minlen_float ? *(data.minlen_float)
4358                                    : (I32)longest_float_length;
4359             r->float_end_shift = ml - data.offset_float_min
4360                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4361                 + data.lookbehind_float;
4362             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4363             r->float_max_offset = data.offset_float_max;
4364             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4365                 r->float_max_offset -= data.lookbehind_float;
4366
4367             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4368                        && (!(data.flags & SF_FL_BEFORE_MEOL)
4369                            || (RExC_flags & PMf_MULTILINE)));
4370             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4371         }
4372         else {
4373           remove_float:
4374             r->float_substr = r->float_utf8 = NULL;
4375             SvREFCNT_dec(data.longest_float);
4376             longest_float_length = 0;
4377         }
4378
4379         /* Note that code very similar to this but for floating string
4380            is immediately above, changes may need to be made to both.
4381            Be careful.
4382          */
4383         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4384         if (longest_fixed_length
4385             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4386                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4387                     || (RExC_flags & PMf_MULTILINE))))
4388         {
4389             I32 t,ml;
4390
4391             /* copy the information about the longest fixed
4392                from the reg_scan_data over to the program. */
4393             if (SvUTF8(data.longest_fixed)) {
4394                 r->anchored_utf8 = data.longest_fixed;
4395                 r->anchored_substr = NULL;
4396             } else {
4397                 r->anchored_substr = data.longest_fixed;
4398                 r->anchored_utf8 = NULL;
4399             }
4400             /* fixed_end_shift is how many chars that must be matched that
4401                follow this item. We calculate it ahead of time as once the
4402                lookbehind offset is added in we lose the ability to correctly
4403                calculate it.*/
4404             ml = data.minlen_fixed ? *(data.minlen_fixed)
4405                                    : (I32)longest_fixed_length;
4406             r->anchored_end_shift = ml - data.offset_fixed
4407                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4408                 + data.lookbehind_fixed;
4409             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4410
4411             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4412                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
4413                      || (RExC_flags & PMf_MULTILINE)));
4414             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4415         }
4416         else {
4417             r->anchored_substr = r->anchored_utf8 = NULL;
4418             SvREFCNT_dec(data.longest_fixed);
4419             longest_fixed_length = 0;
4420         }
4421         if (r->regstclass
4422             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4423             r->regstclass = NULL;
4424         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4425             && stclass_flag
4426             && !(data.start_class->flags & ANYOF_EOS)
4427             && !cl_is_anything(data.start_class))
4428         {
4429             const I32 n = add_data(pRExC_state, 1, "f");
4430
4431             Newx(RExC_rx->data->data[n], 1,
4432                 struct regnode_charclass_class);
4433             StructCopy(data.start_class,
4434                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
4435                        struct regnode_charclass_class);
4436             r->regstclass = (regnode*)RExC_rx->data->data[n];
4437             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
4438             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4439                       regprop(r, sv, (regnode*)data.start_class);
4440                       PerlIO_printf(Perl_debug_log,
4441                                     "synthetic stclass \"%s\".\n",
4442                                     SvPVX_const(sv));});
4443         }
4444
4445         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4446         if (longest_fixed_length > longest_float_length) {
4447             r->check_end_shift = r->anchored_end_shift;
4448             r->check_substr = r->anchored_substr;
4449             r->check_utf8 = r->anchored_utf8;
4450             r->check_offset_min = r->check_offset_max = r->anchored_offset;
4451             if (r->reganch & ROPT_ANCH_SINGLE)
4452                 r->reganch |= ROPT_NOSCAN;
4453         }
4454         else {
4455             r->check_end_shift = r->float_end_shift;
4456             r->check_substr = r->float_substr;
4457             r->check_utf8 = r->float_utf8;
4458             r->check_offset_min = r->float_min_offset;
4459             r->check_offset_max = r->float_max_offset;
4460         }
4461         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4462            This should be changed ASAP!  */
4463         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4464             r->reganch |= RE_USE_INTUIT;
4465             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4466                 r->reganch |= RE_INTUIT_TAIL;
4467         }
4468         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4469         if ( (STRLEN)minlen < longest_float_length )
4470             minlen= longest_float_length;
4471         if ( (STRLEN)minlen < longest_fixed_length )
4472             minlen= longest_fixed_length;
4473         */
4474     }
4475     else {
4476         /* Several toplevels. Best we can is to set minlen. */
4477         I32 fake;
4478         struct regnode_charclass_class ch_class;
4479         I32 last_close = 0;
4480
4481         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4482
4483         scan = r->program + 1;
4484         cl_init(pRExC_state, &ch_class);
4485         data.start_class = &ch_class;
4486         data.last_closep = &last_close;
4487
4488
4489         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4490             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4491
4492         CHECK_RESTUDY_GOTO;
4493
4494         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4495                 = r->float_substr = r->float_utf8 = NULL;
4496         if (!(data.start_class->flags & ANYOF_EOS)
4497             && !cl_is_anything(data.start_class))
4498         {
4499             const I32 n = add_data(pRExC_state, 1, "f");
4500
4501             Newx(RExC_rx->data->data[n], 1,
4502                 struct regnode_charclass_class);
4503             StructCopy(data.start_class,
4504                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
4505                        struct regnode_charclass_class);
4506             r->regstclass = (regnode*)RExC_rx->data->data[n];
4507             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
4508             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4509                       regprop(r, sv, (regnode*)data.start_class);
4510                       PerlIO_printf(Perl_debug_log,
4511                                     "synthetic stclass \"%s\".\n",
4512                                     SvPVX_const(sv));});
4513         }
4514     }
4515
4516     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4517        the "real" pattern. */
4518     DEBUG_OPTIMISE_r({
4519         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4520             minlen, r->minlen);
4521     });
4522     r->minlenret = minlen;
4523     if (r->minlen < minlen)
4524         r->minlen = minlen;
4525
4526     if (RExC_seen & REG_SEEN_GPOS)
4527         r->reganch |= ROPT_GPOS_SEEN;
4528     if (RExC_seen & REG_SEEN_LOOKBEHIND)
4529         r->reganch |= ROPT_LOOKBEHIND_SEEN;
4530     if (RExC_seen & REG_SEEN_EVAL)
4531         r->reganch |= ROPT_EVAL_SEEN;
4532     if (RExC_seen & REG_SEEN_CANY)
4533         r->reganch |= ROPT_CANY_SEEN;
4534     if (RExC_seen & REG_SEEN_VERBARG)
4535         r->reganch |= ROPT_VERBARG_SEEN;
4536     if (RExC_seen & REG_SEEN_CUTGROUP)
4537         r->reganch |= ROPT_CUTGROUP_SEEN;
4538     if (RExC_paren_names)
4539         r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4540     else
4541         r->paren_names = NULL;
4542
4543     if (RExC_recurse_count) {
4544         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4545             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4546             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4547         }
4548     }
4549     Newxz(r->startp, RExC_npar, I32);
4550     Newxz(r->endp, RExC_npar, I32);
4551     /* assume we don't need to swap parens around before we match */
4552
4553     DEBUG_DUMP_r({
4554         PerlIO_printf(Perl_debug_log,"Final program:\n");
4555         regdump(r);
4556     });
4557     DEBUG_OFFSETS_r(if (r->offsets) {
4558         const U32 len = r->offsets[0];
4559         U32 i;
4560         GET_RE_DEBUG_FLAGS_DECL;
4561         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4562         for (i = 1; i <= len; i++) {
4563             if (r->offsets[i*2-1] || r->offsets[i*2])
4564                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4565                 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4566             }
4567         PerlIO_printf(Perl_debug_log, "\n");
4568     });
4569     return(r);
4570     END_BLOCK
4571 }
4572
4573 #undef CORE_ONLY_BLOCK
4574 #undef END_BLOCK
4575 #undef RE_ENGINE_PTR
4576
4577 #ifndef PERL_IN_XSUB_RE
4578 SV*
4579 Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4580 {
4581     I32 parno = 0; /* no match */
4582     if (PL_curpm) {
4583         const REGEXP * const rx = PM_GETRE(PL_curpm);
4584         if (rx && rx->paren_names) {
4585             HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4586             if (he_str) {
4587                 IV i;
4588                 SV* sv_dat=HeVAL(he_str);
4589                 I32 *nums=(I32*)SvPVX(sv_dat);
4590                 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4591                     if ((I32)(rx->lastparen) >= nums[i] &&
4592                         rx->endp[nums[i]] != -1)
4593                     {
4594                         parno = nums[i];
4595                         break;
4596                     }
4597                 }
4598             }
4599         }
4600     }
4601     if ( !parno ) {
4602         return 0;
4603     } else {
4604         GV *gv_paren;
4605         SV *sv= sv_newmortal();
4606         Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4607         gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4608         return GvSVn(gv_paren);
4609     }
4610 }
4611 #endif
4612
4613 /* Scans the name of a named buffer from the pattern.
4614  * If flags is REG_RSN_RETURN_NULL returns null.
4615  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4616  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4617  * to the parsed name as looked up in the RExC_paren_names hash.
4618  * If there is an error throws a vFAIL().. type exception.
4619  */
4620
4621 #define REG_RSN_RETURN_NULL    0
4622 #define REG_RSN_RETURN_NAME    1
4623 #define REG_RSN_RETURN_DATA    2
4624
4625 STATIC SV*
4626 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4627     char *name_start = RExC_parse;
4628     if ( UTF ) {
4629         STRLEN numlen;
4630         while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4631             RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4632         {
4633                 RExC_parse += numlen;
4634         }
4635     } else {
4636         while( isIDFIRST(*RExC_parse) )
4637             RExC_parse++;
4638     }
4639     if ( flags ) {
4640         SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4641             (int)(RExC_parse - name_start)));
4642         if (UTF)
4643             SvUTF8_on(sv_name);
4644         if ( flags == REG_RSN_RETURN_NAME)
4645             return sv_name;
4646         else if (flags==REG_RSN_RETURN_DATA) {
4647             HE *he_str = NULL;
4648             SV *sv_dat = NULL;
4649             if ( ! sv_name )      /* should not happen*/
4650                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4651             if (RExC_paren_names)
4652                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4653             if ( he_str )
4654                 sv_dat = HeVAL(he_str);
4655             if ( ! sv_dat )
4656                 vFAIL("Reference to nonexistent named group");
4657             return sv_dat;
4658         }
4659         else {
4660             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4661         }
4662         /* NOT REACHED */
4663     }
4664     return NULL;
4665 }
4666
4667 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
4668     int rem=(int)(RExC_end - RExC_parse);                       \
4669     int cut;                                                    \
4670     int num;                                                    \
4671     int iscut=0;                                                \
4672     if (rem>10) {                                               \
4673         rem=10;                                                 \
4674         iscut=1;                                                \
4675     }                                                           \
4676     cut=10-rem;                                                 \
4677     if (RExC_lastparse!=RExC_parse)                             \
4678         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
4679             rem, RExC_parse,                                    \
4680             cut + 4,                                            \
4681             iscut ? "..." : "<"                                 \
4682         );                                                      \
4683     else                                                        \
4684         PerlIO_printf(Perl_debug_log,"%16s","");                \
4685                                                                 \
4686     if (SIZE_ONLY)                                              \
4687        num=RExC_size;                                           \
4688     else                                                        \
4689        num=REG_NODE_NUM(RExC_emit);                             \
4690     if (RExC_lastnum!=num)                                      \
4691        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
4692     else                                                        \
4693        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
4694     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
4695         (int)((depth*2)), "",                                   \
4696         (funcname)                                              \
4697     );                                                          \
4698     RExC_lastnum=num;                                           \
4699     RExC_lastparse=RExC_parse;                                  \
4700 })
4701
4702
4703
4704 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
4705     DEBUG_PARSE_MSG((funcname));                            \
4706     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
4707 })
4708 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
4709     DEBUG_PARSE_MSG((funcname));                            \
4710     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
4711 })
4712 /*
4713  - reg - regular expression, i.e. main body or parenthesized thing
4714  *
4715  * Caller must absorb opening parenthesis.
4716  *
4717  * Combining parenthesis handling with the base level of regular expression
4718  * is a trifle forced, but the need to tie the tails of the branches to what
4719  * follows makes it hard to avoid.
4720  */
4721 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4722 #ifdef DEBUGGING
4723 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4724 #else
4725 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4726 #endif
4727
4728 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4729 #define CHECK_WORD(s,v,l)  \
4730     (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4731
4732 STATIC regnode *
4733 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4734     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4735 {
4736     dVAR;
4737     register regnode *ret;              /* Will be the head of the group. */
4738     register regnode *br;
4739     register regnode *lastbr;
4740     register regnode *ender = NULL;
4741     register I32 parno = 0;
4742     I32 flags;
4743     const I32 oregflags = RExC_flags;
4744     bool have_branch = 0;
4745     bool is_open = 0;
4746
4747     /* for (?g), (?gc), and (?o) warnings; warning
4748        about (?c) will warn about (?g) -- japhy    */
4749
4750 #define WASTED_O  0x01
4751 #define WASTED_G  0x02
4752 #define WASTED_C  0x04
4753 #define WASTED_GC (0x02|0x04)
4754     I32 wastedflags = 0x00;
4755
4756     char * parse_start = RExC_parse; /* MJD */
4757     char * const oregcomp_parse = RExC_parse;
4758
4759     GET_RE_DEBUG_FLAGS_DECL;
4760     DEBUG_PARSE("reg ");
4761
4762
4763     *flagp = 0;                         /* Tentatively. */
4764
4765
4766     /* Make an OPEN node, if parenthesized. */
4767     if (paren) {
4768         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4769             char *start_verb = RExC_parse;
4770             STRLEN verb_len = 0;
4771             char *start_arg = NULL;
4772             unsigned char op = 0;
4773             int argok = 1;
4774             int internal_argval = 0; /* internal_argval is only useful if !argok */
4775             while ( *RExC_parse && *RExC_parse != ')' ) {
4776                 if ( *RExC_parse == ':' ) {
4777                     start_arg = RExC_parse + 1;
4778                     break;
4779                 }
4780                 RExC_parse++;
4781             }
4782             ++start_verb;
4783             verb_len = RExC_parse - start_verb;
4784             if ( start_arg ) {
4785                 RExC_parse++;
4786                 while ( *RExC_parse && *RExC_parse != ')' )
4787                     RExC_parse++;
4788                 if ( *RExC_parse != ')' )
4789                     vFAIL("Unterminated verb pattern argument");
4790                 if ( RExC_parse == start_arg )
4791                     start_arg = NULL;
4792             } else {
4793                 if ( *RExC_parse != ')' )
4794                     vFAIL("Unterminated verb pattern");
4795             }
4796
4797             switch ( *start_verb ) {
4798             case 'A':  /* (*ACCEPT) */
4799                 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4800                     op = ACCEPT;
4801                     internal_argval = RExC_nestroot;
4802                 }
4803                 break;
4804             case 'C':  /* (*COMMIT) */
4805                 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4806                     op = COMMIT;
4807                 break;
4808             case 'F':  /* (*FAIL) */
4809                 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4810                     op = OPFAIL;
4811                     argok = 0;
4812                 }
4813                 break;
4814             case ':':  /* (*:NAME) */
4815             case 'M':  /* (*MARK:NAME) */
4816                 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4817                     op = MARKPOINT;
4818                     argok = -1;
4819                 }
4820                 break;
4821             case 'P':  /* (*PRUNE) */
4822                 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4823                     op = PRUNE;
4824                 break;
4825             case 'S':   /* (*SKIP) */
4826                 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4827                     op = SKIP;
4828                 break;
4829             case 'T':  /* (*THEN) */
4830                 /* [19:06] <TimToady> :: is then */
4831                 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4832                     op = CUTGROUP;
4833                     RExC_seen |= REG_SEEN_CUTGROUP;
4834                 }
4835                 break;
4836             }
4837             if ( ! op ) {
4838                 RExC_parse++;
4839                 vFAIL3("Unknown verb pattern '%.*s'",
4840                     verb_len, start_verb);
4841             }
4842             if ( argok ) {
4843                 if ( start_arg && internal_argval ) {
4844                     vFAIL3("Verb pattern '%.*s' may not have an argument",
4845                         verb_len, start_verb);
4846                 } else if ( argok < 0 && !start_arg ) {
4847                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4848                         verb_len, start_verb);
4849                 } else {
4850                     ret = reganode(pRExC_state, op, internal_argval);
4851                     if ( ! internal_argval && ! SIZE_ONLY ) {
4852                         if (start_arg) {
4853                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4854                             ARG(ret) = add_data( pRExC_state, 1, "S" );
4855                             RExC_rx->data->data[ARG(ret)]=(void*)sv;
4856                             ret->flags = 0;
4857                         } else {
4858                             ret->flags = 1;
4859                         }
4860                     }
4861                 }
4862                 if (!internal_argval)
4863                     RExC_seen |= REG_SEEN_VERBARG;
4864             } else if ( start_arg ) {
4865                 vFAIL3("Verb pattern '%.*s' may not have an argument",
4866                         verb_len, start_verb);
4867             } else {
4868                 ret = reg_node(pRExC_state, op);
4869             }
4870             nextchar(pRExC_state);
4871             return ret;
4872         } else
4873         if (*RExC_parse == '?') { /* (?...) */
4874             U32 posflags = 0, negflags = 0;
4875             U32 *flagsp = &posflags;
4876             bool is_logical = 0;
4877             const char * const seqstart = RExC_parse;
4878
4879             RExC_parse++;
4880             paren = *RExC_parse++;
4881             ret = NULL;                 /* For look-ahead/behind. */
4882             switch (paren) {
4883
4884             case '<':           /* (?<...) */
4885                 if (*RExC_parse == '!')
4886                     paren = ',';
4887                 else if (*RExC_parse != '=')
4888                 {               /* (?<...>) */
4889                     char *name_start;
4890                     SV *svname;
4891                     paren= '>';
4892             case '\'':          /* (?'...') */
4893                     name_start= RExC_parse;
4894                     svname = reg_scan_name(pRExC_state,
4895                         SIZE_ONLY ?  /* reverse test from the others */
4896                         REG_RSN_RETURN_NAME :
4897                         REG_RSN_RETURN_NULL);
4898                     if (RExC_parse == name_start)
4899                         goto unknown;
4900                     if (*RExC_parse != paren)
4901                         vFAIL2("Sequence (?%c... not terminated",
4902                             paren=='>' ? '<' : paren);
4903                     if (SIZE_ONLY) {
4904                         HE *he_str;
4905                         SV *sv_dat = NULL;
4906                         if (!svname) /* shouldnt happen */
4907                             Perl_croak(aTHX_
4908                                 "panic: reg_scan_name returned NULL");
4909                         if (!RExC_paren_names) {
4910                             RExC_paren_names= newHV();
4911                             sv_2mortal((SV*)RExC_paren_names);
4912                         }
4913                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
4914                         if ( he_str )
4915                             sv_dat = HeVAL(he_str);
4916                         if ( ! sv_dat ) {
4917                             /* croak baby croak */
4918                             Perl_croak(aTHX_
4919                                 "panic: paren_name hash element allocation failed");
4920                         } else if ( SvPOK(sv_dat) ) {
4921                             IV count=SvIV(sv_dat);
4922                             I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4923                             SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4924                             pv[count]=RExC_npar;
4925                             SvIVX(sv_dat)++;
4926                         } else {
4927                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
4928                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4929                             SvIOK_on(sv_dat);
4930                             SvIVX(sv_dat)= 1;
4931                         }
4932
4933                         /*sv_dump(sv_dat);*/
4934                     }
4935                     nextchar(pRExC_state);
4936                     paren = 1;
4937                     goto capturing_parens;
4938                 }
4939                 RExC_seen |= REG_SEEN_LOOKBEHIND;
4940                 RExC_parse++;
4941             case '=':           /* (?=...) */
4942             case '!':           /* (?!...) */
4943                 RExC_seen_zerolen++;
4944                 if (*RExC_parse == ')') {
4945                     ret=reg_node(pRExC_state, OPFAIL);
4946                     nextchar(pRExC_state);
4947                     return ret;
4948                 }
4949             case ':':           /* (?:...) */
4950             case '>':           /* (?>...) */
4951                 break;
4952             case '$':           /* (?$...) */
4953             case '@':           /* (?@...) */
4954                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4955                 break;
4956             case '#':           /* (?#...) */
4957                 while (*RExC_parse && *RExC_parse != ')')
4958                     RExC_parse++;
4959                 if (*RExC_parse != ')')
4960                     FAIL("Sequence (?#... not terminated");
4961                 nextchar(pRExC_state);
4962                 *flagp = TRYAGAIN;
4963                 return NULL;
4964             case '0' :           /* (?0) */
4965             case 'R' :           /* (?R) */
4966                 if (*RExC_parse != ')')
4967                     FAIL("Sequence (?R) not terminated");
4968                 ret = reg_node(pRExC_state, GOSTART);
4969                 nextchar(pRExC_state);
4970                 return ret;
4971                 /*notreached*/
4972             { /* named and numeric backreferences */
4973                 I32 num;
4974                 char * parse_start;
4975             case '&':            /* (?&NAME) */
4976                 parse_start = RExC_parse - 1;
4977                 {
4978                     SV *sv_dat = reg_scan_name(pRExC_state,
4979                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4980                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4981                 }
4982                 goto gen_recurse_regop;
4983                 /* NOT REACHED */
4984             case '+':
4985                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4986                     RExC_parse++;
4987                     vFAIL("Illegal pattern");
4988                 }
4989                 goto parse_recursion;
4990                 /* NOT REACHED*/
4991             case '-': /* (?-1) */
4992                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4993                     RExC_parse--; /* rewind to let it be handled later */
4994                     goto parse_flags;
4995                 }
4996                 /*FALLTHROUGH */
4997             case '1': case '2': case '3': case '4': /* (?1) */
4998             case '5': case '6': case '7': case '8': case '9':
4999                 RExC_parse--;
5000               parse_recursion:
5001                 num = atoi(RExC_parse);
5002                 parse_start = RExC_parse - 1; /* MJD */
5003                 if (*RExC_parse == '-')
5004                     RExC_parse++;
5005                 while (isDIGIT(*RExC_parse))
5006                         RExC_parse++;
5007                 if (*RExC_parse!=')')
5008                     vFAIL("Expecting close bracket");
5009
5010               gen_recurse_regop:
5011                 if ( paren == '-' ) {
5012                     /*
5013                     Diagram of capture buffer numbering.
5014                     Top line is the normal capture buffer numbers
5015                     Botton line is the negative indexing as from
5016                     the X (the (?-2))
5017
5018                     +   1 2    3 4 5 X          6 7
5019                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5020                     -   5 4    3 2 1 X          x x
5021
5022                     */
5023                     num = RExC_npar + num;
5024                     if (num < 1)  {
5025                         RExC_parse++;
5026                         vFAIL("Reference to nonexistent group");
5027                     }
5028                 } else if ( paren == '+' ) {
5029                     num = RExC_npar + num - 1;
5030                 }
5031
5032                 ret = reganode(pRExC_state, GOSUB, num);
5033                 if (!SIZE_ONLY) {
5034                     if (num > (I32)RExC_rx->nparens) {
5035                         RExC_parse++;
5036                         vFAIL("Reference to nonexistent group");
5037                     }
5038                     ARG2L_SET( ret, RExC_recurse_count++);
5039                     RExC_emit++;
5040                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5041                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5042                 } else {
5043                     RExC_size++;
5044                 }
5045                 RExC_seen |= REG_SEEN_RECURSE;
5046                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5047                 Set_Node_Offset(ret, parse_start); /* MJD */
5048
5049                 nextchar(pRExC_state);
5050                 return ret;
5051             } /* named and numeric backreferences */
5052             /* NOT REACHED */
5053
5054             case 'p':           /* (?p...) */
5055                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5056                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5057                 /* FALL THROUGH*/
5058             case '?':           /* (??...) */
5059                 is_logical = 1;
5060                 if (*RExC_parse != '{')
5061                     goto unknown;
5062                 paren = *RExC_parse++;
5063                 /* FALL THROUGH */
5064             case '{':           /* (?{...}) */
5065             {
5066                 I32 count = 1, n = 0;
5067                 char c;
5068                 char *s = RExC_parse;
5069
5070                 RExC_seen_zerolen++;
5071                 RExC_seen |= REG_SEEN_EVAL;
5072                 while (count && (c = *RExC_parse)) {
5073                     if (c == '\\') {
5074                         if (RExC_parse[1])
5075                             RExC_parse++;
5076                     }
5077                     else if (c == '{')
5078                         count++;
5079                     else if (c == '}')
5080                         count--;
5081                     RExC_parse++;
5082                 }
5083                 if (*RExC_parse != ')') {
5084                     RExC_parse = s;
5085                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5086                 }
5087                 if (!SIZE_ONLY) {
5088                     PAD *pad;
5089                     OP_4tree *sop, *rop;
5090                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5091
5092                     ENTER;
5093                     Perl_save_re_context(aTHX);
5094                     rop = sv_compile_2op(sv, &sop, "re", &pad);
5095                     sop->op_private |= OPpREFCOUNTED;
5096                     /* re_dup will OpREFCNT_inc */
5097                     OpREFCNT_set(sop, 1);
5098                     LEAVE;
5099
5100                     n = add_data(pRExC_state, 3, "nop");
5101                     RExC_rx->data->data[n] = (void*)rop;
5102                     RExC_rx->data->data[n+1] = (void*)sop;
5103                     RExC_rx->data->data[n+2] = (void*)pad;
5104                     SvREFCNT_dec(sv);
5105                 }
5106                 else {                                          /* First pass */
5107                     if (PL_reginterp_cnt < ++RExC_seen_evals
5108                         && IN_PERL_RUNTIME)
5109                         /* No compiled RE interpolated, has runtime
5110                            components ===> unsafe.  */
5111                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
5112                     if (PL_tainting && PL_tainted)
5113                         FAIL("Eval-group in insecure regular expression");
5114 #if PERL_VERSION > 8
5115                     if (IN_PERL_COMPILETIME)
5116                         PL_cv_has_eval = 1;
5117 #endif
5118                 }
5119
5120                 nextchar(pRExC_state);
5121                 if (is_logical) {
5122                     ret = reg_node(pRExC_state, LOGICAL);
5123                     if (!SIZE_ONLY)
5124                         ret->flags = 2;
5125                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5126                     /* deal with the length of this later - MJD */
5127                     return ret;
5128                 }
5129                 ret = reganode(pRExC_state, EVAL, n);
5130                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5131                 Set_Node_Offset(ret, parse_start);
5132                 return ret;
5133             }
5134             case '(':           /* (?(?{...})...) and (?(?=...)...) */
5135             {
5136                 int is_define= 0;
5137                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
5138                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5139                         || RExC_parse[1] == '<'
5140                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
5141                         I32 flag;
5142
5143                         ret = reg_node(pRExC_state, LOGICAL);
5144                         if (!SIZE_ONLY)
5145                             ret->flags = 1;
5146                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5147                         goto insert_if;
5148                     }
5149                 }
5150                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
5151                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5152                 {
5153                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
5154                     char *name_start= RExC_parse++;
5155                     I32 num = 0;
5156                     SV *sv_dat=reg_scan_name(pRExC_state,
5157                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5158                     if (RExC_parse == name_start || *RExC_parse != ch)
5159                         vFAIL2("Sequence (?(%c... not terminated",
5160                             (ch == '>' ? '<' : ch));
5161                     RExC_parse++;
5162                     if (!SIZE_ONLY) {
5163                         num = add_data( pRExC_state, 1, "S" );
5164                         RExC_rx->data->data[num]=(void*)sv_dat;
5165                         SvREFCNT_inc(sv_dat);
5166                     }
5167                     ret = reganode(pRExC_state,NGROUPP,num);
5168                     goto insert_if_check_paren;
5169                 }
5170                 else if (RExC_parse[0] == 'D' &&
5171                          RExC_parse[1] == 'E' &&
5172                          RExC_parse[2] == 'F' &&
5173                          RExC_parse[3] == 'I' &&
5174                          RExC_parse[4] == 'N' &&
5175                          RExC_parse[5] == 'E')
5176                 {
5177                     ret = reganode(pRExC_state,DEFINEP,0);
5178                     RExC_parse +=6 ;
5179                     is_define = 1;
5180                     goto insert_if_check_paren;
5181                 }
5182                 else if (RExC_parse[0] == 'R') {
5183                     RExC_parse++;
5184                     parno = 0;
5185                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5186                         parno = atoi(RExC_parse++);
5187                         while (isDIGIT(*RExC_parse))
5188                             RExC_parse++;
5189                     } else if (RExC_parse[0] == '&') {
5190                         SV *sv_dat;
5191                         RExC_parse++;
5192                         sv_dat = reg_scan_name(pRExC_state,
5193                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5194                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5195                     }
5196                     ret = reganode(pRExC_state,INSUBP,parno);
5197                     goto insert_if_check_paren;
5198                 }
5199                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5200                     /* (?(1)...) */
5201                     char c;
5202                     parno = atoi(RExC_parse++);
5203
5204                     while (isDIGIT(*RExC_parse))
5205                         RExC_parse++;
5206                     ret = reganode(pRExC_state, GROUPP, parno);
5207
5208                  insert_if_check_paren:
5209                     if ((c = *nextchar(pRExC_state)) != ')')
5210                         vFAIL("Switch condition not recognized");
5211                   insert_if:
5212                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5213                     br = regbranch(pRExC_state, &flags, 1,depth+1);
5214                     if (br == NULL)
5215                         br = reganode(pRExC_state, LONGJMP, 0);
5216                     else
5217                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5218                     c = *nextchar(pRExC_state);
5219                     if (flags&HASWIDTH)
5220                         *flagp |= HASWIDTH;
5221                     if (c == '|') {
5222                         if (is_define)
5223                             vFAIL("(?(DEFINE)....) does not allow branches");
5224                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5225                         regbranch(pRExC_state, &flags, 1,depth+1);
5226                         REGTAIL(pRExC_state, ret, lastbr);
5227                         if (flags&HASWIDTH)
5228                             *flagp |= HASWIDTH;
5229                         c = *nextchar(pRExC_state);
5230                     }
5231                     else
5232                         lastbr = NULL;
5233                     if (c != ')')
5234                         vFAIL("Switch (?(condition)... contains too many branches");
5235                     ender = reg_node(pRExC_state, TAIL);
5236                     REGTAIL(pRExC_state, br, ender);
5237                     if (lastbr) {
5238                         REGTAIL(pRExC_state, lastbr, ender);
5239                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5240                     }
5241                     else
5242                         REGTAIL(pRExC_state, ret, ender);
5243                     return ret;
5244                 }
5245                 else {
5246                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5247                 }
5248             }
5249             case 0:
5250                 RExC_parse--; /* for vFAIL to print correctly */
5251                 vFAIL("Sequence (? incomplete");
5252                 break;
5253             default:
5254                 --RExC_parse;
5255               parse_flags:      /* (?i) */
5256                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
5257                     /* (?g), (?gc) and (?o) are useless here
5258                        and must be globally applied -- japhy */
5259
5260                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5261                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5262                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5263                             if (! (wastedflags & wflagbit) ) {
5264                                 wastedflags |= wflagbit;
5265                                 vWARN5(
5266                                     RExC_parse + 1,
5267                                     "Useless (%s%c) - %suse /%c modifier",
5268                                     flagsp == &negflags ? "?-" : "?",
5269                                     *RExC_parse,
5270                                     flagsp == &negflags ? "don't " : "",
5271                                     *RExC_parse
5272                                 );
5273                             }
5274                         }
5275                     }
5276                     else if (*RExC_parse == 'c') {
5277                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5278                             if (! (wastedflags & WASTED_C) ) {
5279                                 wastedflags |= WASTED_GC;
5280                                 vWARN3(
5281                                     RExC_parse + 1,
5282                                     "Useless (%sc) - %suse /gc modifier",
5283                                     flagsp == &negflags ? "?-" : "?",
5284                                     flagsp == &negflags ? "don't " : ""
5285                                 );
5286                             }
5287                         }
5288                     }
5289                     else { pmflag(flagsp, *RExC_parse); }
5290
5291                     ++RExC_parse;
5292                 }
5293                 if (*RExC_parse == '-') {
5294                     flagsp = &negflags;
5295                     wastedflags = 0;  /* reset so (?g-c) warns twice */
5296                     ++RExC_parse;
5297                     goto parse_flags;
5298                 }
5299                 RExC_flags |= posflags;
5300                 RExC_flags &= ~negflags;
5301                 if (*RExC_parse == ':') {
5302                     RExC_parse++;
5303                     paren = ':';
5304                     break;
5305                 }
5306               unknown:
5307                 if (*RExC_parse != ')') {
5308                     RExC_parse++;
5309                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5310                 }
5311                 nextchar(pRExC_state);
5312                 *flagp = TRYAGAIN;
5313                 return NULL;
5314             }
5315         }
5316         else {                  /* (...) */
5317           capturing_parens:
5318             parno = RExC_npar;
5319             RExC_npar++;
5320
5321             ret = reganode(pRExC_state, OPEN, parno);
5322             if (!SIZE_ONLY ){
5323                 if (!RExC_nestroot)
5324                     RExC_nestroot = parno;
5325                 if (RExC_seen & REG_SEEN_RECURSE) {
5326                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5327                         "Setting open paren #%"IVdf" to %d\n",
5328                         (IV)parno, REG_NODE_NUM(ret)));
5329                     RExC_open_parens[parno-1]= ret;
5330                 }
5331             }
5332             Set_Node_Length(ret, 1); /* MJD */
5333             Set_Node_Offset(ret, RExC_parse); /* MJD */
5334             is_open = 1;
5335         }
5336     }
5337     else                        /* ! paren */
5338         ret = NULL;
5339
5340     /* Pick up the branches, linking them together. */
5341     parse_start = RExC_parse;   /* MJD */
5342     br = regbranch(pRExC_state, &flags, 1,depth+1);
5343     /*     branch_len = (paren != 0); */
5344
5345     if (br == NULL)
5346         return(NULL);
5347     if (*RExC_parse == '|') {
5348         if (!SIZE_ONLY && RExC_extralen) {
5349             reginsert(pRExC_state, BRANCHJ, br, depth+1);
5350         }
5351         else {                  /* MJD */
5352             reginsert(pRExC_state, BRANCH, br, depth+1);
5353             Set_Node_Length(br, paren != 0);
5354             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5355         }
5356         have_branch = 1;
5357         if (SIZE_ONLY)
5358             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
5359     }
5360     else if (paren == ':') {
5361         *flagp |= flags&SIMPLE;
5362     }
5363     if (is_open) {                              /* Starts with OPEN. */
5364         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
5365     }
5366     else if (paren != '?')              /* Not Conditional */
5367         ret = br;
5368     *flagp |= flags & (SPSTART | HASWIDTH);
5369     lastbr = br;
5370     while (*RExC_parse == '|') {
5371         if (!SIZE_ONLY && RExC_extralen) {
5372             ender = reganode(pRExC_state, LONGJMP,0);
5373             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5374         }
5375         if (SIZE_ONLY)
5376             RExC_extralen += 2;         /* Account for LONGJMP. */
5377         nextchar(pRExC_state);
5378         br = regbranch(pRExC_state, &flags, 0, depth+1);
5379
5380         if (br == NULL)
5381             return(NULL);
5382         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
5383         lastbr = br;
5384         if (flags&HASWIDTH)
5385             *flagp |= HASWIDTH;
5386         *flagp |= flags&SPSTART;
5387     }
5388
5389     if (have_branch || paren != ':') {
5390         /* Make a closing node, and hook it on the end. */
5391         switch (paren) {
5392         case ':':
5393             ender = reg_node(pRExC_state, TAIL);
5394             break;
5395         case 1:
5396             RExC_cpar++;
5397             ender = reganode(pRExC_state, CLOSE, parno);
5398             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5399                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5400                         "Setting close paren #%"IVdf" to %d\n",
5401                         (IV)parno, REG_NODE_NUM(ender)));
5402                 RExC_close_parens[parno-1]= ender;
5403                 if (RExC_nestroot == parno)
5404                     RExC_nestroot = 0;
5405             }
5406             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5407             Set_Node_Length(ender,1); /* MJD */
5408             break;
5409         case '<':
5410         case ',':
5411         case '=':
5412         case '!':
5413             *flagp &= ~HASWIDTH;
5414             /* FALL THROUGH */
5415         case '>':
5416             ender = reg_node(pRExC_state, SUCCEED);
5417             break;
5418         case 0:
5419             ender = reg_node(pRExC_state, END);
5420             if (!SIZE_ONLY) {
5421                 assert(!RExC_opend); /* there can only be one! */
5422                 RExC_opend = ender;
5423             }
5424             break;
5425         }
5426         REGTAIL(pRExC_state, lastbr, ender);
5427
5428         if (have_branch && !SIZE_ONLY) {
5429             if (depth==1)
5430                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5431
5432             /* Hook the tails of the branches to the closing node. */
5433             for (br = ret; br; br = regnext(br)) {
5434                 const U8 op = PL_regkind[OP(br)];
5435                 if (op == BRANCH) {
5436                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5437                 }
5438                 else if (op == BRANCHJ) {
5439                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5440                 }
5441             }
5442         }
5443     }
5444
5445     {
5446         const char *p;
5447         static const char parens[] = "=!<,>";
5448
5449         if (paren && (p = strchr(parens, paren))) {
5450             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5451             int flag = (p - parens) > 1;
5452
5453             if (paren == '>')
5454                 node = SUSPEND, flag = 0;
5455             reginsert(pRExC_state, node,ret, depth+1);
5456             Set_Node_Cur_Length(ret);
5457             Set_Node_Offset(ret, parse_start + 1);
5458             ret->flags = flag;
5459             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5460         }
5461     }
5462
5463     /* Check for proper termination. */
5464     if (paren) {
5465         RExC_flags = oregflags;
5466         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5467             RExC_parse = oregcomp_parse;
5468             vFAIL("Unmatched (");
5469         }
5470     }
5471     else if (!paren && RExC_parse < RExC_end) {
5472         if (*RExC_parse == ')') {
5473             RExC_parse++;
5474             vFAIL("Unmatched )");
5475         }
5476         else
5477             FAIL("Junk on end of regexp");      /* "Can't happen". */
5478         /* NOTREACHED */
5479     }
5480
5481     return(ret);
5482 }
5483
5484 /*
5485  - regbranch - one alternative of an | operator
5486  *
5487  * Implements the concatenation operator.
5488  */
5489 STATIC regnode *
5490 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5491 {
5492     dVAR;
5493     register regnode *ret;
5494     register regnode *chain = NULL;
5495     register regnode *latest;
5496     I32 flags = 0, c = 0;
5497     GET_RE_DEBUG_FLAGS_DECL;
5498     DEBUG_PARSE("brnc");
5499     if (first)
5500         ret = NULL;
5501     else {
5502         if (!SIZE_ONLY && RExC_extralen)
5503             ret = reganode(pRExC_state, BRANCHJ,0);
5504         else {
5505             ret = reg_node(pRExC_state, BRANCH);
5506             Set_Node_Length(ret, 1);
5507         }
5508     }
5509
5510     if (!first && SIZE_ONLY)
5511         RExC_extralen += 1;                     /* BRANCHJ */
5512
5513     *flagp = WORST;                     /* Tentatively. */
5514
5515     RExC_parse--;
5516     nextchar(pRExC_state);
5517     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5518         flags &= ~TRYAGAIN;
5519         latest = regpiece(pRExC_state, &flags,depth+1);
5520         if (latest == NULL) {
5521             if (flags & TRYAGAIN)
5522                 continue;
5523             return(NULL);
5524         }
5525         else if (ret == NULL)
5526             ret = latest;
5527         *flagp |= flags&HASWIDTH;
5528         if (chain == NULL)      /* First piece. */
5529             *flagp |= flags&SPSTART;
5530         else {
5531             RExC_naughty++;
5532             REGTAIL(pRExC_state, chain, latest);
5533         }
5534         chain = latest;
5535         c++;
5536     }
5537     if (chain == NULL) {        /* Loop ran zero times. */
5538         chain = reg_node(pRExC_state, NOTHING);
5539         if (ret == NULL)
5540             ret = chain;
5541     }
5542     if (c == 1) {
5543         *flagp |= flags&SIMPLE;
5544     }
5545
5546     return ret;
5547 }
5548
5549 /*
5550  - regpiece - something followed by possible [*+?]
5551  *
5552  * Note that the branching code sequences used for ? and the general cases
5553  * of * and + are somewhat optimized:  they use the same NOTHING node as
5554  * both the endmarker for their branch list and the body of the last branch.
5555  * It might seem that this node could be dispensed with entirely, but the
5556  * endmarker role is not redundant.
5557  */
5558 STATIC regnode *
5559 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5560 {
5561     dVAR;
5562     register regnode *ret;
5563     register char op;
5564     register char *next;
5565     I32 flags;
5566     const char * const origparse = RExC_parse;
5567     I32 min;
5568     I32 max = REG_INFTY;
5569     char *parse_start;
5570     const char *maxpos = NULL;
5571     GET_RE_DEBUG_FLAGS_DECL;
5572     DEBUG_PARSE("piec");
5573
5574     ret = regatom(pRExC_state, &flags,depth+1);
5575     if (ret == NULL) {
5576         if (flags & TRYAGAIN)
5577             *flagp |= TRYAGAIN;
5578         return(NULL);
5579     }
5580
5581     op = *RExC_parse;
5582
5583     if (op == '{' && regcurly(RExC_parse)) {
5584         maxpos = NULL;
5585         parse_start = RExC_parse; /* MJD */
5586         next = RExC_parse + 1;
5587         while (isDIGIT(*next) || *next == ',') {
5588             if (*next == ',') {
5589                 if (maxpos)
5590                     break;
5591                 else
5592                     maxpos = next;
5593             }
5594             next++;
5595         }
5596         if (*next == '}') {             /* got one */
5597             if (!maxpos)
5598                 maxpos = next;
5599             RExC_parse++;
5600             min = atoi(RExC_parse);
5601             if (*maxpos == ',')
5602                 maxpos++;
5603             else
5604                 maxpos = RExC_parse;
5605             max = atoi(maxpos);
5606             if (!max && *maxpos != '0')
5607                 max = REG_INFTY;                /* meaning "infinity" */
5608             else if (max >= REG_INFTY)
5609                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5610             RExC_parse = next;
5611             nextchar(pRExC_state);
5612
5613         do_curly:
5614             if ((flags&SIMPLE)) {
5615                 RExC_naughty += 2 + RExC_naughty / 2;
5616                 reginsert(pRExC_state, CURLY, ret, depth+1);
5617                 Set_Node_Offset(ret, parse_start+1); /* MJD */
5618                 Set_Node_Cur_Length(ret);
5619             }
5620             else {
5621                 regnode * const w = reg_node(pRExC_state, WHILEM);
5622
5623                 w->flags = 0;
5624                 REGTAIL(pRExC_state, ret, w);
5625                 if (!SIZE_ONLY && RExC_extralen) {
5626                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
5627                     reginsert(pRExC_state, NOTHING,ret, depth+1);
5628                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
5629                 }
5630                 reginsert(pRExC_state, CURLYX,ret, depth+1);
5631                                 /* MJD hk */
5632                 Set_Node_Offset(ret, parse_start+1);
5633                 Set_Node_Length(ret,
5634                                 op == '{' ? (RExC_parse - parse_start) : 1);
5635
5636                 if (!SIZE_ONLY && RExC_extralen)
5637                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
5638                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5639                 if (SIZE_ONLY)
5640                     RExC_whilem_seen++, RExC_extralen += 3;
5641                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
5642             }
5643             ret->flags = 0;
5644
5645             if (min > 0)
5646                 *flagp = WORST;
5647             if (max > 0)
5648                 *flagp |= HASWIDTH;
5649             if (max && max < min)
5650                 vFAIL("Can't do {n,m} with n > m");
5651             if (!SIZE_ONLY) {
5652                 ARG1_SET(ret, (U16)min);
5653                 ARG2_SET(ret, (U16)max);
5654             }
5655
5656             goto nest_check;
5657         }
5658     }
5659
5660     if (!ISMULT1(op)) {
5661         *flagp = flags;
5662         return(ret);
5663     }
5664
5665 #if 0                           /* Now runtime fix should be reliable. */
5666
5667     /* if this is reinstated, don't forget to put this back into perldiag:
5668
5669             =item Regexp *+ operand could be empty at {#} in regex m/%s/
5670
5671            (F) The part of the regexp subject to either the * or + quantifier
5672            could match an empty string. The {#} shows in the regular
5673            expression about where the problem was discovered.
5674
5675     */
5676
5677     if (!(flags&HASWIDTH) && op != '?')
5678       vFAIL("Regexp *+ operand could be empty");
5679 #endif
5680
5681     parse_start = RExC_parse;
5682     nextchar(pRExC_state);
5683
5684     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5685
5686     if (op == '*' && (flags&SIMPLE)) {
5687         reginsert(pRExC_state, STAR, ret, depth+1);
5688         ret->flags = 0;
5689         RExC_naughty += 4;
5690     }
5691     else if (op == '*') {
5692         min = 0;
5693         goto do_curly;
5694     }
5695     else if (op == '+' && (flags&SIMPLE)) {
5696         reginsert(pRExC_state, PLUS, ret, depth+1);
5697         ret->flags = 0;
5698         RExC_naughty += 3;
5699     }
5700     else if (op == '+') {
5701         min = 1;
5702         goto do_curly;
5703     }
5704     else if (op == '?') {
5705         min = 0; max = 1;
5706         goto do_curly;
5707     }
5708   nest_check:
5709     if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5710         vWARN3(RExC_parse,
5711                "%.*s matches null string many times",
5712                (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5713                origparse);
5714     }
5715
5716     if (RExC_parse < RExC_end && *RExC_parse == '?') {
5717         nextchar(pRExC_state);
5718         reginsert(pRExC_state, MINMOD, ret, depth+1);
5719         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5720     }
5721 #ifndef REG_ALLOW_MINMOD_SUSPEND
5722     else
5723 #endif
5724     if (RExC_parse < RExC_end && *RExC_parse == '+') {
5725         regnode *ender;
5726         nextchar(pRExC_state);
5727         ender = reg_node(pRExC_state, SUCCEED);
5728         REGTAIL(pRExC_state, ret, ender);
5729         reginsert(pRExC_state, SUSPEND, ret, depth+1);
5730         ret->flags = 0;
5731         ender = reg_node(pRExC_state, TAIL);
5732         REGTAIL(pRExC_state, ret, ender);
5733         /*ret= ender;*/
5734     }
5735
5736     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5737         RExC_parse++;
5738         vFAIL("Nested quantifiers");
5739     }
5740
5741     return(ret);
5742 }
5743
5744
5745 /* reg_namedseq(pRExC_state,UVp)
5746
5747    This is expected to be called by a parser routine that has
5748    recognized'\N' and needs to handle the rest. RExC_parse is
5749    expected to point at the first char following the N at the time
5750    of the call.
5751
5752    If valuep is non-null then it is assumed that we are parsing inside
5753    of a charclass definition and the first codepoint in the resolved
5754    string is returned via *valuep and the routine will return NULL.
5755    In this mode if a multichar string is returned from the charnames
5756    handler a warning will be issued, and only the first char in the
5757    sequence will be examined. If the string returned is zero length
5758    then the value of *valuep is undefined and NON-NULL will
5759    be returned to indicate failure. (This will NOT be a valid pointer
5760    to a regnode.)
5761
5762    If value is null then it is assumed that we are parsing normal text
5763    and inserts a new EXACT node into the program containing the resolved
5764    string and returns a pointer to the new node. If the string is
5765    zerolength a NOTHING node is emitted.
5766
5767    On success RExC_parse is set to the char following the endbrace.
5768    Parsing failures will generate a fatal errorvia vFAIL(...)
5769
5770    NOTE: We cache all results from the charnames handler locally in
5771    the RExC_charnames hash (created on first use) to prevent a charnames
5772    handler from playing silly-buggers and returning a short string and
5773    then a long string for a given pattern. Since the regexp program
5774    size is calculated during an initial parse this would result
5775    in a buffer overrun so we cache to prevent the charname result from
5776    changing during the course of the parse.
5777
5778  */
5779 STATIC regnode *
5780 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5781 {
5782     char * name;        /* start of the content of the name */
5783     char * endbrace;    /* endbrace following the name */
5784     SV *sv_str = NULL;
5785     SV *sv_name = NULL;
5786     STRLEN len; /* this has various purposes throughout the code */
5787     bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5788     regnode *ret = NULL;
5789
5790     if (*RExC_parse != '{') {
5791         vFAIL("Missing braces on \\N{}");
5792     }
5793     name = RExC_parse+1;
5794     endbrace = strchr(RExC_parse, '}');
5795     if ( ! endbrace ) {
5796         RExC_parse++;
5797         vFAIL("Missing right brace on \\N{}");
5798     }
5799     RExC_parse = endbrace + 1;
5800
5801
5802     /* RExC_parse points at the beginning brace,
5803        endbrace points at the last */
5804     if ( name[0]=='U' && name[1]=='+' ) {
5805         /* its a "unicode hex" notation {U+89AB} */
5806         I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5807             | PERL_SCAN_DISALLOW_PREFIX
5808             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5809         UV cp;
5810         len = (STRLEN)(endbrace - name - 2);
5811         cp = grok_hex(name + 2, &len, &fl, NULL);
5812         if ( len != (STRLEN)(endbrace - name - 2) ) {
5813             cp = 0xFFFD;
5814         }
5815         if (cp > 0xff)
5816             RExC_utf8 = 1;
5817         if ( valuep ) {
5818             *valuep = cp;
5819             return NULL;
5820         }
5821         sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5822     } else {
5823         /* fetch the charnames handler for this scope */
5824         HV * const table = GvHV(PL_hintgv);
5825         SV **cvp= table ?
5826             hv_fetchs(table, "charnames", FALSE) :
5827             NULL;
5828         SV *cv= cvp ? *cvp : NULL;
5829         HE *he_str;
5830         int count;
5831         /* create an SV with the name as argument */
5832         sv_name = newSVpvn(name, endbrace - name);
5833
5834         if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5835             vFAIL2("Constant(\\N{%s}) unknown: "
5836                   "(possibly a missing \"use charnames ...\")",
5837                   SvPVX(sv_name));
5838         }
5839         if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5840             vFAIL2("Constant(\\N{%s}): "
5841                   "$^H{charnames} is not defined",SvPVX(sv_name));
5842         }
5843
5844
5845
5846         if (!RExC_charnames) {
5847             /* make sure our cache is allocated */
5848             RExC_charnames = newHV();
5849             sv_2mortal((SV*)RExC_charnames);
5850         }
5851             /* see if we have looked this one up before */
5852         he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5853         if ( he_str ) {
5854             sv_str = HeVAL(he_str);
5855             cached = 1;
5856         } else {
5857             dSP ;
5858
5859             ENTER ;
5860             SAVETMPS ;
5861             PUSHMARK(SP) ;
5862
5863             XPUSHs(sv_name);
5864
5865             PUTBACK ;
5866
5867             count= call_sv(cv, G_SCALAR);
5868
5869             if (count == 1) { /* XXXX is this right? dmq */
5870                 sv_str = POPs;
5871                 SvREFCNT_inc_simple_void(sv_str);
5872             }
5873
5874             SPAGAIN ;
5875             PUTBACK ;
5876             FREETMPS ;
5877             LEAVE ;
5878
5879             if ( !sv_str || !SvOK(sv_str) ) {
5880                 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5881                       "did not return a defined value",SvPVX(sv_name));
5882             }
5883             if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5884                 cached = 1;
5885         }
5886     }
5887     if (valuep) {
5888         char *p = SvPV(sv_str, len);
5889         if (len) {
5890             STRLEN numlen = 1;
5891             if ( SvUTF8(sv_str) ) {
5892                 *valuep = utf8_to_uvchr((U8*)p, &numlen);
5893                 if (*valuep > 0x7F)
5894                     RExC_utf8 = 1;
5895                 /* XXXX
5896                   We have to turn on utf8 for high bit chars otherwise
5897                   we get failures with
5898
5899                    "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5900                    "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5901
5902                   This is different from what \x{} would do with the same
5903                   codepoint, where the condition is > 0xFF.
5904                   - dmq
5905                 */
5906
5907
5908             } else {
5909                 *valuep = (UV)*p;
5910                 /* warn if we havent used the whole string? */
5911             }
5912             if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5913                 vWARN2(RExC_parse,
5914                     "Ignoring excess chars from \\N{%s} in character class",
5915                     SvPVX(sv_name)
5916                 );
5917             }
5918         } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5919             vWARN2(RExC_parse,
5920                     "Ignoring zero length \\N{%s} in character class",
5921                     SvPVX(sv_name)
5922                 );
5923         }
5924         if (sv_name)
5925             SvREFCNT_dec(sv_name);
5926         if (!cached)
5927             SvREFCNT_dec(sv_str);
5928         return len ? NULL : (regnode *)&len;
5929     } else if(SvCUR(sv_str)) {
5930
5931         char *s;
5932         char *p, *pend;
5933         STRLEN charlen = 1;
5934         char * parse_start = name-3; /* needed for the offsets */
5935         GET_RE_DEBUG_FLAGS_DECL;     /* needed for the offsets */
5936
5937         ret = reg_node(pRExC_state,
5938             (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5939         s= STRING(ret);
5940
5941         if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5942             sv_utf8_upgrade(sv_str);
5943         } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5944             RExC_utf8= 1;
5945         }
5946
5947         p = SvPV(sv_str, len);
5948         pend = p + len;
5949         /* len is the length written, charlen is the size the char read */
5950         for ( len = 0; p < pend; p += charlen ) {
5951             if (UTF) {
5952                 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
5953                 if (FOLD) {
5954                     STRLEN foldlen,numlen;
5955                     U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5956                     uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5957                     /* Emit all the Unicode characters. */
5958
5959                     for (foldbuf = tmpbuf;
5960                         foldlen;
5961                         foldlen -= numlen)
5962                     {
5963                         uvc = utf8_to_uvchr(foldbuf, &numlen);
5964                         if (numlen > 0) {
5965                             const STRLEN unilen = reguni(pRExC_state, uvc, s);
5966                             s       += unilen;
5967                             len     += unilen;
5968                             /* In EBCDIC the numlen
5969                             * and unilen can differ. */
5970                             foldbuf += numlen;
5971                             if (numlen >= foldlen)
5972                                 break;
5973                         }
5974                         else
5975                             break; /* "Can't happen." */
5976                     }
5977                 } else {
5978                     const STRLEN unilen = reguni(pRExC_state, uvc, s);
5979                     if (unilen > 0) {
5980                        s   += unilen;
5981                        len += unilen;
5982                     }
5983                 }
5984             } else {
5985                 len++;
5986                 REGC(*p, s++);
5987             }
5988         }
5989         if (SIZE_ONLY) {
5990             RExC_size += STR_SZ(len);
5991         } else {
5992             STR_LEN(ret) = len;
5993             RExC_emit += STR_SZ(len);
5994         }
5995         Set_Node_Cur_Length(ret); /* MJD */
5996         RExC_parse--;
5997         nextchar(pRExC_state);
5998     } else {
5999         ret = reg_node(pRExC_state,NOTHING);
6000     }
6001     if (!cached) {
6002         SvREFCNT_dec(sv_str);
6003     }
6004     if (sv_name) {
6005         SvREFCNT_dec(sv_name);
6006     }
6007     return ret;
6008
6009 }
6010
6011
6012 /*
6013  * reg_recode
6014  *
6015  * It returns the code point in utf8 for the value in *encp.
6016  *    value: a code value in the source encoding
6017  *    encp:  a pointer to an Encode object
6018  *
6019  * If the result from Encode is not a single character,
6020  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6021  */
6022 STATIC UV
6023 S_reg_recode(pTHX_ const char value, SV **encp)
6024 {
6025     STRLEN numlen = 1;
6026     SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6027     const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6028                                          : SvPVX(sv);
6029     const STRLEN newlen = SvCUR(sv);
6030     UV uv = UNICODE_REPLACEMENT;
6031
6032     if (newlen)
6033         uv = SvUTF8(sv)
6034              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6035              : *(U8*)s;
6036
6037     if (!newlen || numlen != newlen) {
6038         uv = UNICODE_REPLACEMENT;
6039         if (encp)
6040             *encp = NULL;
6041     }
6042     return uv;
6043 }
6044
6045
6046 /*
6047  - regatom - the lowest level
6048  *
6049  * Optimization:  gobbles an entire sequence of ordinary characters so that
6050  * it can turn them into a single node, which is smaller to store and
6051  * faster to run.  Backslashed characters are exceptions, each becoming a
6052  * separate node; the code is simpler that way and it's not worth fixing.
6053  *
6054  * [Yes, it is worth fixing, some scripts can run twice the speed.]
6055  * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
6056  */
6057 STATIC regnode *
6058 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6059 {
6060     dVAR;
6061     register regnode *ret = NULL;
6062     I32 flags;
6063     char *parse_start = RExC_parse;
6064     GET_RE_DEBUG_FLAGS_DECL;
6065     DEBUG_PARSE("atom");
6066     *flagp = WORST;             /* Tentatively. */
6067
6068 tryagain:
6069     switch (*RExC_parse) {
6070     case '^':
6071         RExC_seen_zerolen++;
6072         nextchar(pRExC_state);
6073         if (RExC_flags & PMf_MULTILINE)
6074             ret = reg_node(pRExC_state, MBOL);
6075         else if (RExC_flags & PMf_SINGLELINE)
6076             ret = reg_node(pRExC_state, SBOL);
6077         else
6078             ret = reg_node(pRExC_state, BOL);
6079         Set_Node_Length(ret, 1); /* MJD */
6080         break;
6081     case '$':
6082         nextchar(pRExC_state);
6083         if (*RExC_parse)
6084             RExC_seen_zerolen++;
6085         if (RExC_flags & PMf_MULTILINE)
6086             ret = reg_node(pRExC_state, MEOL);
6087         else if (RExC_flags & PMf_SINGLELINE)
6088             ret = reg_node(pRExC_state, SEOL);
6089         else
6090             ret = reg_node(pRExC_state, EOL);
6091         Set_Node_Length(ret, 1); /* MJD */
6092         break;
6093     case '.':
6094         nextchar(pRExC_state);
6095         if (RExC_flags & PMf_SINGLELINE)
6096             ret = reg_node(pRExC_state, SANY);
6097         else
6098             ret = reg_node(pRExC_state, REG_ANY);
6099         *flagp |= HASWIDTH|SIMPLE;
6100         RExC_naughty++;
6101         Set_Node_Length(ret, 1); /* MJD */
6102         break;
6103     case '[':
6104     {
6105         char * const oregcomp_parse = ++RExC_parse;
6106         ret = regclass(pRExC_state,depth+1);
6107         if (*RExC_parse != ']') {
6108             RExC_parse = oregcomp_parse;
6109             vFAIL("Unmatched [");
6110         }
6111         nextchar(pRExC_state);
6112         *flagp |= HASWIDTH|SIMPLE;
6113         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6114         break;
6115     }
6116     case '(':
6117         nextchar(pRExC_state);
6118         ret = reg(pRExC_state, 1, &flags,depth+1);
6119         if (ret == NULL) {
6120                 if (flags & TRYAGAIN) {
6121                     if (RExC_parse == RExC_end) {
6122                          /* Make parent create an empty node if needed. */
6123                         *flagp |= TRYAGAIN;
6124                         return(NULL);
6125                     }
6126                     goto tryagain;
6127                 }
6128                 return(NULL);
6129         }
6130         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6131         break;
6132     case '|':
6133     case ')':
6134         if (flags & TRYAGAIN) {
6135             *flagp |= TRYAGAIN;
6136             return NULL;
6137         }
6138         vFAIL("Internal urp");
6139                                 /* Supposed to be caught earlier. */
6140         break;
6141     case '{':
6142         if (!regcurly(RExC_parse)) {
6143             RExC_parse++;
6144             goto defchar;
6145         }
6146         /* FALL THROUGH */
6147     case '?':
6148     case '+':
6149     case '*':
6150         RExC_parse++;
6151         vFAIL("Quantifier follows nothing");
6152         break;
6153     case '\\':
6154         switch (*++RExC_parse) {
6155         case 'A':
6156             RExC_seen_zerolen++;
6157             ret = reg_node(pRExC_state, SBOL);
6158             *flagp |= SIMPLE;
6159             nextchar(pRExC_state);
6160             Set_Node_Length(ret, 2); /* MJD */
6161             break;
6162         case 'G':
6163             ret = reg_node(pRExC_state, GPOS);
6164             RExC_seen |= REG_SEEN_GPOS;
6165             *flagp |= SIMPLE;
6166             nextchar(pRExC_state);
6167             Set_Node_Length(ret, 2); /* MJD */
6168             break;
6169         case 'Z':
6170             ret = reg_node(pRExC_state, SEOL);
6171             *flagp |= SIMPLE;
6172             RExC_seen_zerolen++;                /* Do not optimize RE away */
6173             nextchar(pRExC_state);
6174             break;
6175         case 'z':
6176             ret = reg_node(pRExC_state, EOS);
6177             *flagp |= SIMPLE;
6178             RExC_seen_zerolen++;                /* Do not optimize RE away */
6179             nextchar(pRExC_state);
6180             Set_Node_Length(ret, 2); /* MJD */
6181             break;
6182         case 'C':
6183             ret = reg_node(pRExC_state, CANY);
6184             RExC_seen |= REG_SEEN_CANY;
6185             *flagp |= HASWIDTH|SIMPLE;
6186             nextchar(pRExC_state);
6187             Set_Node_Length(ret, 2); /* MJD */
6188             break;
6189         case 'X':
6190             ret = reg_node(pRExC_state, CLUMP);
6191             *flagp |= HASWIDTH;
6192             nextchar(pRExC_state);
6193             Set_Node_Length(ret, 2); /* MJD */
6194             break;
6195         case 'w':
6196             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
6197             *flagp |= HASWIDTH|SIMPLE;
6198             nextchar(pRExC_state);
6199             Set_Node_Length(ret, 2); /* MJD */
6200             break;
6201         case 'W':
6202             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
6203             *flagp |= HASWIDTH|SIMPLE;
6204             nextchar(pRExC_state);
6205             Set_Node_Length(ret, 2); /* MJD */
6206             break;
6207         case 'b':
6208             RExC_seen_zerolen++;
6209             RExC_seen |= REG_SEEN_LOOKBEHIND;
6210             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
6211             *flagp |= SIMPLE;
6212             nextchar(pRExC_state);
6213             Set_Node_Length(ret, 2); /* MJD */
6214             break;
6215         case 'B':
6216             RExC_seen_zerolen++;
6217             RExC_seen |= REG_SEEN_LOOKBEHIND;
6218             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
6219             *flagp |= SIMPLE;
6220             nextchar(pRExC_state);
6221             Set_Node_Length(ret, 2); /* MJD */
6222             break;
6223         case 's':
6224             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
6225             *flagp |= HASWIDTH|SIMPLE;
6226             nextchar(pRExC_state);
6227             Set_Node_Length(ret, 2); /* MJD */
6228             break;
6229         case 'S':
6230             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
6231             *flagp |= HASWIDTH|SIMPLE;
6232             nextchar(pRExC_state);
6233             Set_Node_Length(ret, 2); /* MJD */
6234             break;
6235         case 'd':
6236             ret = reg_node(pRExC_state, DIGIT);
6237             *flagp |= HASWIDTH|SIMPLE;
6238             nextchar(pRExC_state);
6239             Set_Node_Length(ret, 2); /* MJD */
6240             break;
6241         case 'D':
6242             ret = reg_node(pRExC_state, NDIGIT);
6243             *flagp |= HASWIDTH|SIMPLE;
6244             nextchar(pRExC_state);
6245             Set_Node_Length(ret, 2); /* MJD */
6246             break;
6247         case 'p':
6248         case 'P':
6249             {
6250                 char* const oldregxend = RExC_end;
6251                 char* parse_start = RExC_parse - 2;
6252
6253                 if (RExC_parse[1] == '{') {
6254                   /* a lovely hack--pretend we saw [\pX] instead */
6255                     RExC_end = strchr(RExC_parse, '}');
6256                     if (!RExC_end) {
6257                         const U8 c = (U8)*RExC_parse;
6258                         RExC_parse += 2;
6259                         RExC_end = oldregxend;
6260                         vFAIL2("Missing right brace on \\%c{}", c);
6261                     }
6262                     RExC_end++;
6263                 }
6264                 else {
6265                     RExC_end = RExC_parse + 2;
6266                     if (RExC_end > oldregxend)
6267                         RExC_end = oldregxend;
6268                 }
6269                 RExC_parse--;
6270
6271                 ret = regclass(pRExC_state,depth+1);
6272
6273                 RExC_end = oldregxend;
6274                 RExC_parse--;
6275
6276                 Set_Node_Offset(ret, parse_start + 2);
6277                 Set_Node_Cur_Length(ret);
6278                 nextchar(pRExC_state);
6279                 *flagp |= HASWIDTH|SIMPLE;
6280             }
6281             break;
6282         case 'N':
6283             /* Handle \N{NAME} here and not below because it can be
6284             multicharacter. join_exact() will join them up later on.
6285             Also this makes sure that things like /\N{BLAH}+/ and
6286             \N{BLAH} being multi char Just Happen. dmq*/
6287             ++RExC_parse;
6288             ret= reg_namedseq(pRExC_state, NULL);
6289             break;
6290         case 'k':    /* Handle \k<NAME> and \k'NAME' */
6291         {
6292             char ch= RExC_parse[1];
6293             if (ch != '<' && ch != '\'') {
6294                 if (SIZE_ONLY)
6295                     vWARN( RExC_parse + 1,
6296                         "Possible broken named back reference treated as literal k");
6297                 parse_start--;
6298                 goto defchar;
6299             } else {
6300                 char* name_start = (RExC_parse += 2);
6301                 I32 num = 0;
6302                 SV *sv_dat = reg_scan_name(pRExC_state,
6303                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6304                 ch= (ch == '<') ? '>' : '\'';
6305
6306                 if (RExC_parse == name_start || *RExC_parse != ch)
6307                     vFAIL2("Sequence \\k%c... not terminated",
6308                         (ch == '>' ? '<' : ch));
6309
6310                 RExC_sawback = 1;
6311                 ret = reganode(pRExC_state,
6312                            (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6313                            num);
6314                 *flagp |= HASWIDTH;
6315
6316
6317                 if (!SIZE_ONLY) {
6318                     num = add_data( pRExC_state, 1, "S" );
6319                     ARG_SET(ret,num);
6320                     RExC_rx->data->data[num]=(void*)sv_dat;
6321                     SvREFCNT_inc(sv_dat);
6322                 }
6323                 /* override incorrect value set in reganode MJD */
6324                 Set_Node_Offset(ret, parse_start+1);
6325                 Set_Node_Cur_Length(ret); /* MJD */
6326                 nextchar(pRExC_state);
6327
6328             }
6329             break;
6330         }
6331         case 'n':
6332         case 'r':
6333         case 't':
6334         case 'f':
6335         case 'e':
6336         case 'a':
6337         case 'x':
6338         case 'c':
6339         case '0':
6340             goto defchar;
6341         case 'R':
6342         case '1': case '2': case '3': case '4':
6343         case '5': case '6': case '7': case '8': case '9':
6344             {
6345                 I32 num;
6346                 bool isrel=(*RExC_parse=='R');
6347                 if (isrel)
6348                     RExC_parse++;
6349                 num = atoi(RExC_parse);
6350                 if (isrel) {
6351                     num = RExC_cpar - num;
6352                     if (num < 1)
6353                         vFAIL("Reference to nonexistent or unclosed group");
6354                 }
6355                 if (num > 9 && num >= RExC_npar)
6356                     goto defchar;
6357                 else {
6358                     char * const parse_start = RExC_parse - 1; /* MJD */
6359                     while (isDIGIT(*RExC_parse))
6360                         RExC_parse++;
6361
6362                     if (!SIZE_ONLY) {
6363                         if (num > (I32)RExC_rx->nparens)
6364                             vFAIL("Reference to nonexistent group");
6365                         /* People make this error all the time apparently.
6366                            So we cant fail on it, even though we should
6367
6368                         else if (num >= RExC_cpar)
6369                             vFAIL("Reference to unclosed group will always match");
6370                         */
6371                     }
6372                     RExC_sawback = 1;
6373                     ret = reganode(pRExC_state,
6374                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6375                                    num);
6376                     *flagp |= HASWIDTH;
6377
6378                     /* override incorrect value set in reganode MJD */
6379                     Set_Node_Offset(ret, parse_start+1);
6380                     Set_Node_Cur_Length(ret); /* MJD */
6381                     RExC_parse--;
6382                     nextchar(pRExC_state);
6383                 }
6384             }
6385             break;
6386         case '\0':
6387             if (RExC_parse >= RExC_end)
6388                 FAIL("Trailing \\");
6389             /* FALL THROUGH */
6390         default:
6391             /* Do not generate "unrecognized" warnings here, we fall
6392                back into the quick-grab loop below */
6393             parse_start--;
6394             goto defchar;
6395         }
6396         break;
6397
6398     case '#':
6399         if (RExC_flags & PMf_EXTENDED) {
6400             while (RExC_parse < RExC_end && *RExC_parse != '\n')
6401                 RExC_parse++;
6402             if (RExC_parse < RExC_end)
6403                 goto tryagain;
6404         }
6405         /* FALL THROUGH */
6406
6407     default: {
6408             register STRLEN len;
6409             register UV ender;
6410             register char *p;
6411             char *s;
6412             STRLEN foldlen;
6413             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6414
6415             parse_start = RExC_parse - 1;
6416
6417             RExC_parse++;
6418
6419         defchar:
6420             ender = 0;
6421             ret = reg_node(pRExC_state,
6422                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6423             s = STRING(ret);
6424             for (len = 0, p = RExC_parse - 1;
6425               len < 127 && p < RExC_end;
6426               len++)
6427             {
6428                 char * const oldp = p;
6429
6430                 if (RExC_flags & PMf_EXTENDED)
6431                     p = regwhite(p, RExC_end);
6432                 switch (*p) {
6433                 case '^':
6434                 case '$':
6435                 case '.':
6436                 case '[':
6437                 case '(':
6438                 case ')':
6439                 case '|':
6440                     goto loopdone;
6441                 case '\\':
6442                     switch (*++p) {
6443                     case 'A':
6444                     case 'C':
6445                     case 'X':
6446                     case 'G':
6447                     case 'Z':
6448                     case 'z':
6449                     case 'w':
6450                     case 'W':
6451                     case 'b':
6452                     case 'B':
6453                     case 's':
6454                     case 'S':
6455                     case 'd':
6456                     case 'D':
6457                     case 'p':
6458                     case 'P':
6459                     case 'N':
6460                     case 'R':
6461                         --p;
6462                         goto loopdone;
6463                     case 'n':
6464                         ender = '\n';
6465                         p++;
6466                         break;
6467                     case 'r':
6468                         ender = '\r';
6469                         p++;
6470                         break;
6471                     case 't':
6472                         ender = '\t';
6473                         p++;
6474                         break;
6475                     case 'f':
6476                         ender = '\f';
6477                         p++;
6478                         break;
6479                     case 'e':
6480                           ender = ASCII_TO_NATIVE('\033');
6481                         p++;
6482                         break;
6483                     case 'a':
6484                           ender = ASCII_TO_NATIVE('\007');
6485                         p++;
6486                         break;
6487                     case 'x':
6488                         if (*++p == '{') {
6489                             char* const e = strchr(p, '}');
6490
6491                             if (!e) {
6492                                 RExC_parse = p + 1;
6493                                 vFAIL("Missing right brace on \\x{}");
6494                             }
6495                             else {
6496                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6497                                     | PERL_SCAN_DISALLOW_PREFIX;
6498                                 STRLEN numlen = e - p - 1;
6499                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6500                                 if (ender > 0xff)
6501                                     RExC_utf8 = 1;
6502                                 p = e + 1;
6503                             }
6504                         }
6505                         else {
6506                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6507                             STRLEN numlen = 2;
6508                             ender = grok_hex(p, &numlen, &flags, NULL);
6509                             p += numlen;
6510                         }
6511                         if (PL_encoding && ender < 0x100)
6512                             goto recode_encoding;
6513                         break;
6514                     case 'c':
6515                         p++;
6516                         ender = UCHARAT(p++);
6517                         ender = toCTRL(ender);
6518                         break;
6519                     case '0': case '1': case '2': case '3':case '4':
6520                     case '5': case '6': case '7': case '8':case '9':
6521                         if (*p == '0' ||
6522                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6523                             I32 flags = 0;
6524                             STRLEN numlen = 3;
6525                             ender = grok_oct(p, &numlen, &flags, NULL);
6526                             p += numlen;
6527                         }
6528                         else {
6529                             --p;
6530                             goto loopdone;
6531                         }
6532                         if (PL_encoding && ender < 0x100)
6533                             goto recode_encoding;
6534                         break;
6535                     recode_encoding:
6536                         {
6537                             SV* enc = PL_encoding;
6538                             ender = reg_recode((const char)(U8)ender, &enc);
6539                             if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6540                                 vWARN(p, "Invalid escape in the specified encoding");
6541                             RExC_utf8 = 1;
6542                         }
6543                         break;
6544                     case '\0':
6545                         if (p >= RExC_end)
6546                             FAIL("Trailing \\");
6547                         /* FALL THROUGH */
6548                     default:
6549                         if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6550                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6551                         goto normal_default;
6552                     }
6553                     break;
6554                 default:
6555                   normal_default:
6556                     if (UTF8_IS_START(*p) && UTF) {
6557                         STRLEN numlen;
6558                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6559                                                &numlen, UTF8_ALLOW_DEFAULT);
6560                         p += numlen;
6561                     }
6562                     else
6563                         ender = *p++;
6564                     break;
6565                 }
6566                 if (RExC_flags & PMf_EXTENDED)
6567                     p = regwhite(p, RExC_end);
6568                 if (UTF && FOLD) {
6569                     /* Prime the casefolded buffer. */
6570                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6571                 }
6572                 if (ISMULT2(p)) { /* Back off on ?+*. */
6573                     if (len)
6574                         p = oldp;
6575                     else if (UTF) {
6576                          if (FOLD) {
6577                               /* Emit all the Unicode characters. */
6578                               STRLEN numlen;
6579                               for (foldbuf = tmpbuf;
6580                                    foldlen;
6581                                    foldlen -= numlen) {
6582                                    ender = utf8_to_uvchr(foldbuf, &numlen);
6583                                    if (numlen > 0) {
6584                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
6585                                         s       += unilen;
6586                                         len     += unilen;
6587                                         /* In EBCDIC the numlen
6588                                          * and unilen can differ. */
6589                                         foldbuf += numlen;
6590                                         if (numlen >= foldlen)
6591                                              break;
6592                                    }
6593                                    else
6594                                         break; /* "Can't happen." */
6595                               }
6596                          }
6597                          else {
6598                               const STRLEN unilen = reguni(pRExC_state, ender, s);
6599                               if (unilen > 0) {
6600                                    s   += unilen;
6601                                    len += unilen;
6602                               }
6603                          }
6604                     }
6605                     else {
6606                         len++;
6607                         REGC((char)ender, s++);
6608                     }
6609                     break;
6610                 }
6611                 if (UTF) {
6612                      if (FOLD) {
6613                           /* Emit all the Unicode characters. */
6614                           STRLEN numlen;
6615                           for (foldbuf = tmpbuf;
6616                                foldlen;
6617                                foldlen -= numlen) {
6618                                ender = utf8_to_uvchr(foldbuf, &numlen);
6619                                if (numlen > 0) {
6620                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
6621                                     len     += unilen;
6622                                     s       += unilen;
6623                                     /* In EBCDIC the numlen
6624                                      * and unilen can differ. */
6625                                     foldbuf += numlen;
6626                                     if (numlen >= foldlen)
6627                                          break;
6628                                }
6629                                else
6630                                     break;
6631                           }
6632                      }
6633                      else {
6634                           const STRLEN unilen = reguni(pRExC_state, ender, s);
6635                           if (unilen > 0) {
6636                                s   += unilen;
6637                                len += unilen;
6638                           }
6639                      }
6640                      len--;
6641                 }
6642                 else
6643                     REGC((char)ender, s++);
6644             }
6645         loopdone:
6646             RExC_parse = p - 1;
6647             Set_Node_Cur_Length(ret); /* MJD */
6648             nextchar(pRExC_state);
6649             {
6650                 /* len is STRLEN which is unsigned, need to copy to signed */
6651                 IV iv = len;
6652                 if (iv < 0)
6653                     vFAIL("Internal disaster");
6654             }
6655             if (len > 0)
6656                 *flagp |= HASWIDTH;
6657             if (len == 1 && UNI_IS_INVARIANT(ender))
6658                 *flagp |= SIMPLE;
6659
6660             if (SIZE_ONLY)
6661                 RExC_size += STR_SZ(len);
6662             else {
6663                 STR_LEN(ret) = len;
6664                 RExC_emit += STR_SZ(len);
6665             }
6666         }
6667         break;
6668     }
6669
6670     return(ret);
6671 }
6672
6673 STATIC char *
6674 S_regwhite(char *p, const char *e)
6675 {
6676     while (p < e) {
6677         if (isSPACE(*p))
6678             ++p;
6679         else if (*p == '#') {
6680             do {
6681                 p++;
6682             } while (p < e && *p != '\n');
6683         }
6684         else
6685             break;
6686     }
6687     return p;
6688 }
6689
6690 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6691    Character classes ([:foo:]) can also be negated ([:^foo:]).
6692    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6693    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6694    but trigger failures because they are currently unimplemented. */
6695
6696 #define POSIXCC_DONE(c)   ((c) == ':')
6697 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6698 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6699
6700 STATIC I32
6701 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6702 {
6703     dVAR;
6704     I32 namedclass = OOB_NAMEDCLASS;
6705
6706     if (value == '[' && RExC_parse + 1 < RExC_end &&
6707         /* I smell either [: or [= or [. -- POSIX has been here, right? */
6708         POSIXCC(UCHARAT(RExC_parse))) {
6709         const char c = UCHARAT(RExC_parse);
6710         char* const s = RExC_parse++;
6711
6712         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6713             RExC_parse++;
6714         if (RExC_parse == RExC_end)
6715             /* Grandfather lone [:, [=, [. */
6716             RExC_parse = s;
6717         else {
6718             const char* const t = RExC_parse++; /* skip over the c */
6719             assert(*t == c);
6720
6721             if (UCHARAT(RExC_parse) == ']') {
6722                 const char *posixcc = s + 1;
6723                 RExC_parse++; /* skip over the ending ] */
6724
6725                 if (*s == ':') {
6726                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6727                     const I32 skip = t - posixcc;
6728
6729                     /* Initially switch on the length of the name.  */
6730                     switch (skip) {
6731                     case 4:
6732                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6733                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6734                         break;
6735                     case 5:
6736                         /* Names all of length 5.  */
6737                         /* alnum alpha ascii blank cntrl digit graph lower
6738                            print punct space upper  */
6739                         /* Offset 4 gives the best switch position.  */
6740                         switch (posixcc[4]) {
6741                         case 'a':
6742                             if (memEQ(posixcc, "alph", 4)) /* alpha */
6743                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6744                             break;
6745                         case 'e':
6746                             if (memEQ(posixcc, "spac", 4)) /* space */
6747                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6748                             break;
6749                         case 'h':
6750                             if (memEQ(posixcc, "grap", 4)) /* graph */
6751                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6752                             break;
6753                         case 'i':
6754                             if (memEQ(posixcc, "asci", 4)) /* ascii */
6755                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6756                             break;
6757                         case 'k':
6758                             if (memEQ(posixcc, "blan", 4)) /* blank */
6759                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6760                             break;
6761                         case 'l':
6762                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6763                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6764                             break;
6765                         case 'm':
6766                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
6767                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
6768                             break;
6769                         case 'r':
6770                             if (memEQ(posixcc, "lowe", 4)) /* lower */
6771                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6772                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
6773                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
6774                             break;
6775                         case 't':
6776                             if (memEQ(posixcc, "digi", 4)) /* digit */
6777                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6778                             else if (memEQ(posixcc, "prin", 4)) /* print */
6779                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6780                             else if (memEQ(posixcc, "punc", 4)) /* punct */
6781                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
6782                             break;
6783                         }
6784                         break;
6785                     case 6:
6786                         if (memEQ(posixcc, "xdigit", 6))
6787                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
6788                         break;
6789                     }
6790
6791                     if (namedclass == OOB_NAMEDCLASS)
6792                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6793                                       t - s - 1, s + 1);
6794                     assert (posixcc[skip] == ':');
6795                     assert (posixcc[skip+1] == ']');
6796                 } else if (!SIZE_ONLY) {
6797                     /* [[=foo=]] and [[.foo.]] are still future. */
6798
6799                     /* adjust RExC_parse so the warning shows after
6800                        the class closes */
6801                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
6802                         RExC_parse++;
6803                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6804                 }
6805             } else {
6806                 /* Maternal grandfather:
6807                  * "[:" ending in ":" but not in ":]" */
6808                 RExC_parse = s;
6809             }
6810         }
6811     }
6812
6813     return namedclass;
6814 }
6815
6816 STATIC void
6817 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
6818 {
6819     dVAR;
6820     if (POSIXCC(UCHARAT(RExC_parse))) {
6821         const char *s = RExC_parse;
6822         const char  c = *s++;
6823
6824         while (isALNUM(*s))
6825             s++;
6826         if (*s && c == *s && s[1] == ']') {
6827             if (ckWARN(WARN_REGEXP))
6828                 vWARN3(s+2,
6829                         "POSIX syntax [%c %c] belongs inside character classes",
6830                         c, c);
6831
6832             /* [[=foo=]] and [[.foo.]] are still future. */
6833             if (POSIXCC_NOTYET(c)) {
6834                 /* adjust RExC_parse so the error shows after
6835                    the class closes */
6836                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
6837                     NOOP;
6838                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6839             }
6840         }
6841     }
6842 }
6843
6844
6845 /*
6846    parse a class specification and produce either an ANYOF node that
6847    matches the pattern. If the pattern matches a single char only and
6848    that char is < 256 then we produce an EXACT node instead.
6849 */
6850 STATIC regnode *
6851 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
6852 {
6853     dVAR;
6854     register UV value = 0;
6855     register UV nextvalue;
6856     register IV prevvalue = OOB_UNICODE;
6857     register IV range = 0;
6858     register regnode *ret;
6859     STRLEN numlen;
6860     IV namedclass;
6861     char *rangebegin = NULL;
6862     bool need_class = 0;
6863     SV *listsv = NULL;
6864     UV n;
6865     bool optimize_invert   = TRUE;
6866     AV* unicode_alternate  = NULL;
6867 #ifdef EBCDIC
6868     UV literal_endpoint = 0;
6869 #endif
6870     UV stored = 0;  /* number of chars stored in the class */
6871
6872     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
6873         case we need to change the emitted regop to an EXACT. */
6874     const char * orig_parse = RExC_parse;
6875     GET_RE_DEBUG_FLAGS_DECL;
6876 #ifndef DEBUGGING
6877     PERL_UNUSED_ARG(depth);
6878 #endif
6879
6880     DEBUG_PARSE("clas");
6881
6882     /* Assume we are going to generate an ANYOF node. */
6883     ret = reganode(pRExC_state, ANYOF, 0);
6884
6885     if (!SIZE_ONLY)
6886         ANYOF_FLAGS(ret) = 0;
6887
6888     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
6889         RExC_naughty++;
6890         RExC_parse++;
6891         if (!SIZE_ONLY)
6892             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6893     }
6894
6895     if (SIZE_ONLY) {
6896         RExC_size += ANYOF_SKIP;
6897         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6898     }
6899     else {
6900         RExC_emit += ANYOF_SKIP;
6901         if (FOLD)
6902             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6903         if (LOC)
6904             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
6905         ANYOF_BITMAP_ZERO(ret);
6906         listsv = newSVpvs("# comment\n");
6907     }
6908
6909     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6910
6911     if (!SIZE_ONLY && POSIXCC(nextvalue))
6912         checkposixcc(pRExC_state);
6913
6914     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6915     if (UCHARAT(RExC_parse) == ']')
6916         goto charclassloop;
6917
6918 parseit:
6919     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
6920
6921     charclassloop:
6922
6923         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6924
6925         if (!range)
6926             rangebegin = RExC_parse;
6927         if (UTF) {
6928             value = utf8n_to_uvchr((U8*)RExC_parse,
6929                                    RExC_end - RExC_parse,
6930                                    &numlen, UTF8_ALLOW_DEFAULT);
6931             RExC_parse += numlen;
6932         }
6933         else
6934             value = UCHARAT(RExC_parse++);
6935
6936         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6937         if (value == '[' && POSIXCC(nextvalue))
6938             namedclass = regpposixcc(pRExC_state, value);
6939         else if (value == '\\') {
6940             if (UTF) {
6941                 value = utf8n_to_uvchr((U8*)RExC_parse,
6942                                    RExC_end - RExC_parse,
6943                                    &numlen, UTF8_ALLOW_DEFAULT);
6944                 RExC_parse += numlen;
6945             }
6946             else
6947                 value = UCHARAT(RExC_parse++);
6948             /* Some compilers cannot handle switching on 64-bit integer
6949              * values, therefore value cannot be an UV.  Yes, this will
6950              * be a problem later if we want switch on Unicode.
6951              * A similar issue a little bit later when switching on
6952              * namedclass. --jhi */
6953             switch ((I32)value) {
6954             case 'w':   namedclass = ANYOF_ALNUM;       break;
6955             case 'W':   namedclass = ANYOF_NALNUM;      break;
6956             case 's':   namedclass = ANYOF_SPACE;       break;
6957             case 'S':   namedclass = ANYOF_NSPACE;      break;
6958             case 'd':   namedclass = ANYOF_DIGIT;       break;
6959             case 'D':   namedclass = ANYOF_NDIGIT;      break;
6960             case 'N':  /* Handle \N{NAME} in class */
6961                 {
6962                     /* We only pay attention to the first char of
6963                     multichar strings being returned. I kinda wonder
6964                     if this makes sense as it does change the behaviour
6965                     from earlier versions, OTOH that behaviour was broken
6966                     as well. */
6967                     UV v; /* value is register so we cant & it /grrr */
6968                     if (reg_namedseq(pRExC_state, &v)) {
6969                         goto parseit;
6970                     }
6971                     value= v;
6972                 }
6973                 break;
6974             case 'p':
6975             case 'P':
6976                 {
6977                 char *e;
6978                 if (RExC_parse >= RExC_end)
6979                     vFAIL2("Empty \\%c{}", (U8)value);
6980                 if (*RExC_parse == '{') {
6981                     const U8 c = (U8)value;
6982                     e = strchr(RExC_parse++, '}');
6983                     if (!e)
6984                         vFAIL2("Missing right brace on \\%c{}", c);
6985                     while (isSPACE(UCHARAT(RExC_parse)))
6986                         RExC_parse++;
6987                     if (e == RExC_parse)
6988                         vFAIL2("Empty \\%c{}", c);
6989                     n = e - RExC_parse;
6990                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6991                         n--;
6992                 }
6993                 else {
6994                     e = RExC_parse;
6995                     n = 1;
6996                 }
6997                 if (!SIZE_ONLY) {
6998                     if (UCHARAT(RExC_parse) == '^') {
6999                          RExC_parse++;
7000                          n--;
7001                          value = value == 'p' ? 'P' : 'p'; /* toggle */
7002                          while (isSPACE(UCHARAT(RExC_parse))) {
7003                               RExC_parse++;
7004                               n--;
7005                          }
7006                     }
7007                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7008                         (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7009                 }
7010                 RExC_parse = e + 1;
7011                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7012                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
7013                 }
7014                 break;
7015             case 'n':   value = '\n';                   break;
7016             case 'r':   value = '\r';                   break;
7017             case 't':   value = '\t';                   break;
7018             case 'f':   value = '\f';                   break;
7019             case 'b':   value = '\b';                   break;
7020             case 'e':   value = ASCII_TO_NATIVE('\033');break;
7021             case 'a':   value = ASCII_TO_NATIVE('\007');break;
7022             case 'x':
7023                 if (*RExC_parse == '{') {
7024                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7025                         | PERL_SCAN_DISALLOW_PREFIX;
7026                     char * const e = strchr(RExC_parse++, '}');
7027                     if (!e)
7028                         vFAIL("Missing right brace on \\x{}");
7029
7030                     numlen = e - RExC_parse;
7031                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7032                     RExC_parse = e + 1;
7033                 }
7034                 else {
7035                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7036                     numlen = 2;
7037                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7038                     RExC_parse += numlen;
7039                 }
7040                 if (PL_encoding && value < 0x100)
7041                     goto recode_encoding;
7042                 break;
7043             case 'c':
7044                 value = UCHARAT(RExC_parse++);
7045                 value = toCTRL(value);
7046                 break;
7047             case '0': case '1': case '2': case '3': case '4':
7048             case '5': case '6': case '7': case '8': case '9':
7049                 {
7050                     I32 flags = 0;
7051                     numlen = 3;
7052                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7053                     RExC_parse += numlen;
7054                     if (PL_encoding && value < 0x100)
7055                         goto recode_encoding;
7056                     break;
7057                 }
7058             recode_encoding:
7059                 {
7060                     SV* enc = PL_encoding;
7061                     value = reg_recode((const char)(U8)value, &enc);
7062                     if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7063                         vWARN(RExC_parse,
7064                               "Invalid escape in the specified encoding");
7065                     break;
7066                 }
7067             default:
7068                 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7069                     vWARN2(RExC_parse,
7070                            "Unrecognized escape \\%c in character class passed through",
7071                            (int)value);
7072                 break;
7073             }
7074         } /* end of \blah */
7075 #ifdef EBCDIC
7076         else
7077             literal_endpoint++;
7078 #endif
7079
7080         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7081
7082             if (!SIZE_ONLY && !need_class)
7083                 ANYOF_CLASS_ZERO(ret);
7084
7085             need_class = 1;
7086
7087             /* a bad range like a-\d, a-[:digit:] ? */
7088             if (range) {
7089                 if (!SIZE_ONLY) {
7090                     if (ckWARN(WARN_REGEXP)) {
7091                         const int w =
7092                             RExC_parse >= rangebegin ?
7093                             RExC_parse - rangebegin : 0;
7094                         vWARN4(RExC_parse,
7095                                "False [] range \"%*.*s\"",
7096                                w, w, rangebegin);
7097                     }
7098                     if (prevvalue < 256) {
7099                         ANYOF_BITMAP_SET(ret, prevvalue);
7100                         ANYOF_BITMAP_SET(ret, '-');
7101                     }
7102                     else {
7103                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7104                         Perl_sv_catpvf(aTHX_ listsv,
7105                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7106                     }
7107                 }
7108
7109                 range = 0; /* this was not a true range */
7110             }
7111
7112             if (!SIZE_ONLY) {
7113                 const char *what = NULL;
7114                 char yesno = 0;
7115
7116                 if (namedclass > OOB_NAMEDCLASS)
7117                     optimize_invert = FALSE;
7118                 /* Possible truncation here but in some 64-bit environments
7119                  * the compiler gets heartburn about switch on 64-bit values.
7120                  * A similar issue a little earlier when switching on value.
7121                  * --jhi */
7122                 switch ((I32)namedclass) {
7123                 case ANYOF_ALNUM:
7124                     if (LOC)
7125                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
7126                     else {
7127                         for (value = 0; value < 256; value++)
7128                             if (isALNUM(value))
7129                                 ANYOF_BITMAP_SET(ret, value);
7130                     }
7131                     yesno = '+';
7132                     what = "Word";
7133                     break;
7134                 case ANYOF_NALNUM:
7135                     if (LOC)
7136                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
7137                     else {
7138                         for (value = 0; value < 256; value++)
7139                             if (!isALNUM(value))
7140                                 ANYOF_BITMAP_SET(ret, value);
7141                     }
7142                     yesno = '!';
7143                     what = "Word";
7144                     break;
7145                 case ANYOF_ALNUMC:
7146                     if (LOC)
7147                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
7148                     else {
7149                         for (value = 0; value < 256; value++)
7150                             if (isALNUMC(value))
7151                                 ANYOF_BITMAP_SET(ret, value);
7152                     }
7153                     yesno = '+';
7154                     what = "Alnum";
7155                     break;
7156                 case ANYOF_NALNUMC:
7157                     if (LOC)
7158                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
7159                     else {
7160                         for (value = 0; value < 256; value++)
7161                             if (!isALNUMC(value))
7162                                 ANYOF_BITMAP_SET(ret, value);
7163                     }
7164                     yesno = '!';
7165                     what = "Alnum";
7166                     break;
7167                 case ANYOF_ALPHA:
7168                     if (LOC)
7169                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
7170                     else {
7171                         for (value = 0; value < 256; value++)
7172                             if (isALPHA(value))
7173                                 ANYOF_BITMAP_SET(ret, value);
7174                     }
7175                     yesno = '+';
7176                     what = "Alpha";
7177                     break;
7178                 case ANYOF_NALPHA:
7179                     if (LOC)
7180                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
7181                     else {
7182                         for (value = 0; value < 256; value++)
7183                             if (!isALPHA(value))
7184                                 ANYOF_BITMAP_SET(ret, value);
7185                     }
7186                     yesno = '!';
7187                     what = "Alpha";
7188                     break;
7189                 case ANYOF_ASCII:
7190                     if (LOC)
7191                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7192                     else {
7193 #ifndef EBCDIC
7194                         for (value = 0; value < 128; value++)
7195                             ANYOF_BITMAP_SET(ret, value);
7196 #else  /* EBCDIC */
7197                         for (value = 0; value < 256; value++) {
7198                             if (isASCII(value))
7199                                 ANYOF_BITMAP_SET(ret, value);
7200                         }
7201 #endif /* EBCDIC */
7202                     }
7203                     yesno = '+';
7204                     what = "ASCII";
7205                     break;
7206                 case ANYOF_NASCII:
7207                     if (LOC)
7208                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7209                     else {
7210 #ifndef EBCDIC
7211                         for (value = 128; value < 256; value++)
7212                             ANYOF_BITMAP_SET(ret, value);
7213 #else  /* EBCDIC */
7214                         for (value = 0; value < 256; value++) {
7215                             if (!isASCII(value))
7216                                 ANYOF_BITMAP_SET(ret, value);
7217                         }
7218 #endif /* EBCDIC */
7219                     }
7220                     yesno = '!';
7221                     what = "ASCII";
7222                     break;
7223                 case ANYOF_BLANK:
7224                     if (LOC)
7225                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7226                     else {
7227                         for (value = 0; value < 256; value++)
7228                             if (isBLANK(value))
7229                                 ANYOF_BITMAP_SET(ret, value);
7230                     }
7231                     yesno = '+';
7232                     what = "Blank";
7233                     break;
7234                 case ANYOF_NBLANK:
7235                     if (LOC)
7236                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7237                     else {
7238                         for (value = 0; value < 256; value++)
7239                             if (!isBLANK(value))
7240                                 ANYOF_BITMAP_SET(ret, value);
7241                     }
7242                     yesno = '!';
7243                     what = "Blank";
7244                     break;
7245                 case ANYOF_CNTRL:
7246                     if (LOC)
7247                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
7248                     else {
7249                         for (value = 0; value < 256; value++)
7250                             if (isCNTRL(value))
7251                                 ANYOF_BITMAP_SET(ret, value);
7252                     }
7253                     yesno = '+';
7254                     what = "Cntrl";
7255                     break;
7256                 case ANYOF_NCNTRL:
7257                     if (LOC)
7258                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
7259                     else {
7260                         for (value = 0; value < 256; value++)
7261                             if (!isCNTRL(value))
7262                                 ANYOF_BITMAP_SET(ret, value);
7263                     }
7264                     yesno = '!';
7265                     what = "Cntrl";
7266                     break;
7267                 case ANYOF_DIGIT:
7268                     if (LOC)
7269                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7270                     else {
7271                         /* consecutive digits assumed */
7272                         for (value = '0'; value <= '9'; value++)
7273                             ANYOF_BITMAP_SET(ret, value);
7274                     }
7275                     yesno = '+';
7276                     what = "Digit";
7277                     break;
7278                 case ANYOF_NDIGIT:
7279                     if (LOC)
7280                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7281                     else {
7282                         /* consecutive digits assumed */
7283                         for (value = 0; value < '0'; value++)
7284                             ANYOF_BITMAP_SET(ret, value);
7285                         for (value = '9' + 1; value < 256; value++)
7286                             ANYOF_BITMAP_SET(ret, value);
7287                     }
7288                     yesno = '!';
7289                     what = "Digit";
7290                     break;
7291                 case ANYOF_GRAPH:
7292                     if (LOC)
7293                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
7294                     else {
7295                         for (value = 0; value < 256; value++)
7296                             if (isGRAPH(value))
7297                                 ANYOF_BITMAP_SET(ret, value);
7298                     }
7299                     yesno = '+';
7300                     what = "Graph";
7301                     break;
7302                 case ANYOF_NGRAPH:
7303                     if (LOC)
7304                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
7305                     else {
7306                         for (value = 0; value < 256; value++)
7307                             if (!isGRAPH(value))
7308                                 ANYOF_BITMAP_SET(ret, value);
7309                     }
7310                     yesno = '!';
7311                     what = "Graph";
7312                     break;
7313                 case ANYOF_LOWER:
7314                     if (LOC)
7315                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
7316                     else {
7317                         for (value = 0; value < 256; value++)
7318                             if (isLOWER(value))
7319                                 ANYOF_BITMAP_SET(ret, value);
7320                     }
7321                     yesno = '+';
7322                     what = "Lower";
7323                     break;
7324                 case ANYOF_NLOWER:
7325                     if (LOC)
7326                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
7327                     else {
7328                         for (value = 0; value < 256; value++)
7329                             if (!isLOWER(value))
7330                                 ANYOF_BITMAP_SET(ret, value);
7331                     }
7332                     yesno = '!';
7333                     what = "Lower";
7334                     break;
7335                 case ANYOF_PRINT:
7336                     if (LOC)
7337                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
7338                     else {
7339                         for (value = 0; value < 256; value++)
7340                             if (isPRINT(value))
7341                                 ANYOF_BITMAP_SET(ret, value);
7342                     }
7343                     yesno = '+';
7344                     what = "Print";
7345                     break;
7346                 case ANYOF_NPRINT:
7347                     if (LOC)
7348                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
7349                     else {
7350                         for (value = 0; value < 256; value++)
7351                             if (!isPRINT(value))
7352                                 ANYOF_BITMAP_SET(ret, value);
7353                     }
7354                     yesno = '!';
7355                     what = "Print";
7356                     break;
7357                 case ANYOF_PSXSPC:
7358                     if (LOC)
7359                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7360                     else {
7361                         for (value = 0; value < 256; value++)
7362                             if (isPSXSPC(value))
7363                                 ANYOF_BITMAP_SET(ret, value);
7364                     }
7365                     yesno = '+';
7366                     what = "Space";
7367                     break;
7368                 case ANYOF_NPSXSPC:
7369                     if (LOC)
7370                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7371                     else {
7372                         for (value = 0; value < 256; value++)
7373                             if (!isPSXSPC(value))
7374                                 ANYOF_BITMAP_SET(ret, value);
7375                     }
7376                     yesno = '!';
7377                     what = "Space";
7378                     break;
7379                 case ANYOF_PUNCT:
7380                     if (LOC)
7381                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
7382                     else {
7383                         for (value = 0; value < 256; value++)
7384                             if (isPUNCT(value))
7385                                 ANYOF_BITMAP_SET(ret, value);
7386                     }
7387                     yesno = '+';
7388                     what = "Punct";
7389                     break;
7390                 case ANYOF_NPUNCT:
7391                     if (LOC)
7392                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
7393                     else {
7394                         for (value = 0; value < 256; value++)
7395                             if (!isPUNCT(value))
7396                                 ANYOF_BITMAP_SET(ret, value);
7397                     }
7398                     yesno = '!';
7399                     what = "Punct";
7400                     break;
7401                 case ANYOF_SPACE:
7402                     if (LOC)
7403                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7404                     else {
7405                         for (value = 0; value < 256; value++)
7406                             if (isSPACE(value))
7407                                 ANYOF_BITMAP_SET(ret, value);
7408                     }
7409                     yesno = '+';
7410                     what = "SpacePerl";
7411                     break;
7412                 case ANYOF_NSPACE:
7413                     if (LOC)
7414                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7415                     else {
7416                         for (value = 0; value < 256; value++)
7417                             if (!isSPACE(value))
7418                                 ANYOF_BITMAP_SET(ret, value);
7419                     }
7420                     yesno = '!';
7421                     what = "SpacePerl";
7422                     break;
7423                 case ANYOF_UPPER:
7424                     if (LOC)
7425                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
7426                     else {
7427                         for (value = 0; value < 256; value++)
7428                             if (isUPPER(value))
7429                                 ANYOF_BITMAP_SET(ret, value);
7430                     }
7431                     yesno = '+';
7432                     what = "Upper";
7433                     break;
7434                 case ANYOF_NUPPER:
7435                     if (LOC)
7436                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
7437                     else {
7438                         for (value = 0; value < 256; value++)
7439                             if (!isUPPER(value))
7440                                 ANYOF_BITMAP_SET(ret, value);
7441                     }
7442                     yesno = '!';
7443                     what = "Upper";
7444                     break;
7445                 case ANYOF_XDIGIT:
7446                     if (LOC)
7447                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
7448                     else {
7449                         for (value = 0; value < 256; value++)
7450                             if (isXDIGIT(value))
7451                                 ANYOF_BITMAP_SET(ret, value);
7452                     }
7453                     yesno = '+';
7454                     what = "XDigit";
7455                     break;
7456                 case ANYOF_NXDIGIT:
7457                     if (LOC)
7458                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
7459                     else {
7460                         for (value = 0; value < 256; value++)
7461                             if (!isXDIGIT(value))
7462                                 ANYOF_BITMAP_SET(ret, value);
7463                     }
7464                     yesno = '!';
7465                     what = "XDigit";
7466                     break;
7467                 case ANYOF_MAX:
7468                     /* this is to handle \p and \P */
7469                     break;
7470                 default:
7471                     vFAIL("Invalid [::] class");
7472                     break;
7473                 }
7474                 if (what) {
7475                     /* Strings such as "+utf8::isWord\n" */
7476                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7477                 }
7478                 if (LOC)
7479                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7480                 continue;
7481             }
7482         } /* end of namedclass \blah */
7483
7484         if (range) {
7485             if (prevvalue > (IV)value) /* b-a */ {
7486                 const int w = RExC_parse - rangebegin;
7487                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7488                 range = 0; /* not a valid range */
7489             }
7490         }
7491         else {
7492             prevvalue = value; /* save the beginning of the range */
7493             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7494                 RExC_parse[1] != ']') {
7495                 RExC_parse++;
7496
7497                 /* a bad range like \w-, [:word:]- ? */
7498                 if (namedclass > OOB_NAMEDCLASS) {
7499                     if (ckWARN(WARN_REGEXP)) {
7500                         const int w =
7501                             RExC_parse >= rangebegin ?
7502                             RExC_parse - rangebegin : 0;
7503                         vWARN4(RExC_parse,
7504                                "False [] range \"%*.*s\"",
7505                                w, w, rangebegin);
7506                     }
7507                     if (!SIZE_ONLY)
7508                         ANYOF_BITMAP_SET(ret, '-');
7509                 } else
7510                     range = 1;  /* yeah, it's a range! */
7511                 continue;       /* but do it the next time */
7512             }
7513         }
7514
7515         /* now is the next time */
7516         /*stored += (value - prevvalue + 1);*/
7517         if (!SIZE_ONLY) {
7518             if (prevvalue < 256) {
7519                 const IV ceilvalue = value < 256 ? value : 255;
7520                 IV i;
7521 #ifdef EBCDIC
7522                 /* In EBCDIC [\x89-\x91] should include
7523                  * the \x8e but [i-j] should not. */
7524                 if (literal_endpoint == 2 &&
7525                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7526                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7527                 {
7528                     if (isLOWER(prevvalue)) {
7529                         for (i = prevvalue; i <= ceilvalue; i++)
7530                             if (isLOWER(i))
7531                                 ANYOF_BITMAP_SET(ret, i);
7532                     } else {
7533                         for (i = prevvalue; i <= ceilvalue; i++)
7534                             if (isUPPER(i))
7535                                 ANYOF_BITMAP_SET(ret, i);
7536                     }
7537                 }
7538                 else
7539 #endif
7540                       for (i = prevvalue; i <= ceilvalue; i++) {
7541                         if (!ANYOF_BITMAP_TEST(ret,i)) {
7542                             stored++;
7543                             ANYOF_BITMAP_SET(ret, i);
7544                         }
7545                       }
7546           }
7547           if (value > 255 || UTF) {
7548                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
7549                 const UV natvalue      = NATIVE_TO_UNI(value);
7550                 stored+=2; /* can't optimize this class */
7551                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7552                 if (prevnatvalue < natvalue) { /* what about > ? */
7553                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7554                                    prevnatvalue, natvalue);
7555                 }
7556                 else if (prevnatvalue == natvalue) {
7557                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7558                     if (FOLD) {
7559                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7560                          STRLEN foldlen;
7561                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7562
7563 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7564                          if (RExC_precomp[0] == ':' &&
7565                              RExC_precomp[1] == '[' &&
7566                              (f == 0xDF || f == 0x92)) {
7567                              f = NATIVE_TO_UNI(f);
7568                         }
7569 #endif
7570                          /* If folding and foldable and a single
7571                           * character, insert also the folded version
7572                           * to the charclass. */
7573                          if (f != value) {
7574 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7575                              if ((RExC_precomp[0] == ':' &&
7576                                   RExC_precomp[1] == '[' &&
7577                                   (f == 0xA2 &&
7578                                    (value == 0xFB05 || value == 0xFB06))) ?
7579                                  foldlen == ((STRLEN)UNISKIP(f) - 1) :
7580                                  foldlen == (STRLEN)UNISKIP(f) )
7581 #else
7582                               if (foldlen == (STRLEN)UNISKIP(f))
7583 #endif
7584                                   Perl_sv_catpvf(aTHX_ listsv,
7585                                                  "%04"UVxf"\n", f);
7586                               else {
7587                                   /* Any multicharacter foldings
7588                                    * require the following transform:
7589                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7590                                    * where E folds into "pq" and F folds
7591                                    * into "rst", all other characters
7592                                    * fold to single characters.  We save
7593                                    * away these multicharacter foldings,
7594                                    * to be later saved as part of the
7595                                    * additional "s" data. */
7596                                   SV *sv;
7597
7598                                   if (!unicode_alternate)
7599                                       unicode_alternate = newAV();
7600                                   sv = newSVpvn((char*)foldbuf, foldlen);
7601                                   SvUTF8_on(sv);
7602                                   av_push(unicode_alternate, sv);
7603                               }
7604                          }
7605
7606                          /* If folding and the value is one of the Greek
7607                           * sigmas insert a few more sigmas to make the
7608                           * folding rules of the sigmas to work right.
7609                           * Note that not all the possible combinations
7610                           * are handled here: some of them are handled
7611                           * by the standard folding rules, and some of
7612                           * them (literal or EXACTF cases) are handled
7613                           * during runtime in regexec.c:S_find_byclass(). */
7614                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7615                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7616                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7617                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7618                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7619                          }
7620                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7621                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7622                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7623                     }
7624                 }
7625             }
7626 #ifdef EBCDIC
7627             literal_endpoint = 0;
7628 #endif
7629         }
7630
7631         range = 0; /* this range (if it was one) is done now */
7632     }
7633
7634     if (need_class) {
7635         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7636         if (SIZE_ONLY)
7637             RExC_size += ANYOF_CLASS_ADD_SKIP;
7638         else
7639             RExC_emit += ANYOF_CLASS_ADD_SKIP;
7640     }
7641
7642
7643     if (SIZE_ONLY)
7644         return ret;
7645     /****** !SIZE_ONLY AFTER HERE *********/
7646
7647     if( stored == 1 && value < 256
7648         && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7649     ) {
7650         /* optimize single char class to an EXACT node
7651            but *only* when its not a UTF/high char  */
7652         const char * cur_parse= RExC_parse;
7653         RExC_emit = (regnode *)orig_emit;
7654         RExC_parse = (char *)orig_parse;
7655         ret = reg_node(pRExC_state,
7656                        (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7657         RExC_parse = (char *)cur_parse;
7658         *STRING(ret)= (char)value;
7659         STR_LEN(ret)= 1;
7660         RExC_emit += STR_SZ(1);
7661         return ret;
7662     }
7663     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7664     if ( /* If the only flag is folding (plus possibly inversion). */
7665         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7666        ) {
7667         for (value = 0; value < 256; ++value) {
7668             if (ANYOF_BITMAP_TEST(ret, value)) {
7669                 UV fold = PL_fold[value];
7670
7671                 if (fold != value)
7672                     ANYOF_BITMAP_SET(ret, fold);
7673             }
7674         }
7675         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7676     }
7677
7678     /* optimize inverted simple patterns (e.g. [^a-z]) */
7679     if (optimize_invert &&
7680         /* If the only flag is inversion. */
7681         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7682         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7683             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7684         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7685     }
7686     {
7687         AV * const av = newAV();
7688         SV *rv;
7689         /* The 0th element stores the character class description
7690          * in its textual form: used later (regexec.c:Perl_regclass_swash())
7691          * to initialize the appropriate swash (which gets stored in
7692          * the 1st element), and also useful for dumping the regnode.
7693          * The 2nd element stores the multicharacter foldings,
7694          * used later (regexec.c:S_reginclass()). */
7695         av_store(av, 0, listsv);
7696         av_store(av, 1, NULL);
7697         av_store(av, 2, (SV*)unicode_alternate);
7698         rv = newRV_noinc((SV*)av);
7699         n = add_data(pRExC_state, 1, "s");
7700         RExC_rx->data->data[n] = (void*)rv;
7701         ARG_SET(ret, n);
7702     }
7703     return ret;
7704 }
7705
7706 STATIC char*
7707 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7708 {
7709     char* const retval = RExC_parse++;
7710
7711     for (;;) {
7712         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7713                 RExC_parse[2] == '#') {
7714             while (*RExC_parse != ')') {
7715                 if (RExC_parse == RExC_end)
7716                     FAIL("Sequence (?#... not terminated");
7717                 RExC_parse++;
7718             }
7719             RExC_parse++;
7720             continue;
7721         }
7722         if (RExC_flags & PMf_EXTENDED) {
7723             if (isSPACE(*RExC_parse)) {
7724                 RExC_parse++;
7725                 continue;
7726             }
7727             else if (*RExC_parse == '#') {
7728                 while (RExC_parse < RExC_end)
7729                     if (*RExC_parse++ == '\n') break;
7730                 continue;
7731             }
7732         }
7733         return retval;
7734     }
7735 }
7736
7737 /*
7738 - reg_node - emit a node
7739 */
7740 STATIC regnode *                        /* Location. */
7741 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7742 {
7743     dVAR;
7744     register regnode *ptr;
7745     regnode * const ret = RExC_emit;
7746     GET_RE_DEBUG_FLAGS_DECL;
7747
7748     if (SIZE_ONLY) {
7749         SIZE_ALIGN(RExC_size);
7750         RExC_size += 1;
7751         return(ret);
7752     }
7753 #ifdef DEBUGGING
7754     if (OP(RExC_emit) == 255)
7755         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7756             reg_name[op], OP(RExC_emit));
7757 #endif
7758     NODE_ALIGN_FILL(ret);
7759     ptr = ret;
7760     FILL_ADVANCE_NODE(ptr, op);
7761     if (RExC_offsets) {         /* MJD */
7762         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7763               "reg_node", __LINE__,
7764               reg_name[op],
7765               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7766                 ? "Overwriting end of array!\n" : "OK",
7767               (UV)(RExC_emit - RExC_emit_start),
7768               (UV)(RExC_parse - RExC_start),
7769               (UV)RExC_offsets[0]));
7770         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7771     }
7772
7773     RExC_emit = ptr;
7774     return(ret);
7775 }
7776
7777 /*
7778 - reganode - emit a node with an argument
7779 */
7780 STATIC regnode *                        /* Location. */
7781 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7782 {
7783     dVAR;
7784     register regnode *ptr;
7785     regnode * const ret = RExC_emit;
7786     GET_RE_DEBUG_FLAGS_DECL;
7787
7788     if (SIZE_ONLY) {
7789         SIZE_ALIGN(RExC_size);
7790         RExC_size += 2;
7791         /*
7792            We can't do this:
7793
7794            assert(2==regarglen[op]+1);
7795
7796            Anything larger than this has to allocate the extra amount.
7797            If we changed this to be:
7798
7799            RExC_size += (1 + regarglen[op]);
7800
7801            then it wouldn't matter. Its not clear what side effect
7802            might come from that so its not done so far.
7803            -- dmq
7804         */
7805         return(ret);
7806     }
7807 #ifdef DEBUGGING
7808     if (OP(RExC_emit) == 255)
7809         Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7810 #endif
7811     NODE_ALIGN_FILL(ret);
7812     ptr = ret;
7813     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7814     if (RExC_offsets) {         /* MJD */
7815         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7816               "reganode",
7817               __LINE__,
7818               reg_name[op],
7819               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7820               "Overwriting end of array!\n" : "OK",
7821               (UV)(RExC_emit - RExC_emit_start),
7822               (UV)(RExC_parse - RExC_start),
7823               (UV)RExC_offsets[0]));
7824         Set_Cur_Node_Offset;
7825     }
7826
7827     RExC_emit = ptr;
7828     return(ret);
7829 }
7830
7831 /*
7832 - reguni - emit (if appropriate) a Unicode character
7833 */
7834 STATIC STRLEN
7835 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7836 {
7837     dVAR;
7838     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7839 }
7840
7841 /*
7842 - reginsert - insert an operator in front of already-emitted operand
7843 *
7844 * Means relocating the operand.
7845 */
7846 STATIC void
7847 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7848 {
7849     dVAR;
7850     register regnode *src;
7851     register regnode *dst;
7852     register regnode *place;
7853     const int offset = regarglen[(U8)op];
7854     const int size = NODE_STEP_REGNODE + offset;
7855     GET_RE_DEBUG_FLAGS_DECL;
7856 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7857     DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7858     if (SIZE_ONLY) {
7859         RExC_size += size;
7860         return;
7861     }
7862
7863     src = RExC_emit;
7864     RExC_emit += size;
7865     dst = RExC_emit;
7866     if (RExC_open_parens) {
7867         int paren;
7868         DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7869         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7870             if ( RExC_open_parens[paren] >= opnd ) {
7871                 DEBUG_PARSE_FMT("open"," - %d",size);
7872                 RExC_open_parens[paren] += size;
7873             } else {
7874                 DEBUG_PARSE_FMT("open"," - %s","ok");
7875             }
7876             if ( RExC_close_parens[paren] >= opnd ) {
7877                 DEBUG_PARSE_FMT("close"," - %d",size);
7878                 RExC_close_parens[paren] += size;
7879             } else {
7880                 DEBUG_PARSE_FMT("close"," - %s","ok");
7881             }
7882         }
7883     }
7884
7885     while (src > opnd) {
7886         StructCopy(--src, --dst, regnode);
7887         if (RExC_offsets) {     /* MJD 20010112 */
7888             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7889                   "reg_insert",
7890                   __LINE__,
7891                   reg_name[op],
7892                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7893                     ? "Overwriting end of array!\n" : "OK",
7894                   (UV)(src - RExC_emit_start),
7895                   (UV)(dst - RExC_emit_start),
7896                   (UV)RExC_offsets[0]));
7897             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7898             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7899         }
7900     }
7901
7902
7903     place = opnd;               /* Op node, where operand used to be. */
7904     if (RExC_offsets) {         /* MJD */
7905         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7906               "reginsert",
7907               __LINE__,
7908               reg_name[op],
7909               (UV)(place - RExC_emit_start) > RExC_offsets[0]
7910               ? "Overwriting end of array!\n" : "OK",
7911               (UV)(place - RExC_emit_start),
7912               (UV)(RExC_parse - RExC_start),
7913               (UV)RExC_offsets[0]));
7914         Set_Node_Offset(place, RExC_parse);
7915         Set_Node_Length(place, 1);
7916     }
7917     src = NEXTOPER(place);
7918     FILL_ADVANCE_NODE(place, op);
7919     Zero(src, offset, regnode);
7920 }
7921
7922 /*
7923 - regtail - set the next-pointer at the end of a node chain of p to val.
7924 - SEE ALSO: regtail_study
7925 */
7926 /* TODO: All three parms should be const */
7927 STATIC void
7928 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7929 {
7930     dVAR;
7931     register regnode *scan;
7932     GET_RE_DEBUG_FLAGS_DECL;
7933 #ifndef DEBUGGING
7934     PERL_UNUSED_ARG(depth);
7935 #endif
7936
7937     if (SIZE_ONLY)
7938         return;
7939
7940     /* Find last node. */
7941     scan = p;
7942     for (;;) {
7943         regnode * const temp = regnext(scan);
7944         DEBUG_PARSE_r({
7945             SV * const mysv=sv_newmortal();
7946             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7947             regprop(RExC_rx, mysv, scan);
7948             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7949                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7950                     (temp == NULL ? "->" : ""),
7951                     (temp == NULL ? reg_name[OP(val)] : "")
7952             );
7953         });
7954         if (temp == NULL)
7955             break;
7956         scan = temp;
7957     }
7958
7959     if (reg_off_by_arg[OP(scan)]) {
7960         ARG_SET(scan, val - scan);
7961     }
7962     else {
7963         NEXT_OFF(scan) = val - scan;
7964     }
7965 }
7966
7967 #ifdef DEBUGGING
7968 /*
7969 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7970 - Look for optimizable sequences at the same time.
7971 - currently only looks for EXACT chains.
7972
7973 This is expermental code. The idea is to use this routine to perform
7974 in place optimizations on branches and groups as they are constructed,
7975 with the long term intention of removing optimization from study_chunk so
7976 that it is purely analytical.
7977
7978 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7979 to control which is which.
7980
7981 */
7982 /* TODO: All four parms should be const */
7983
7984 STATIC U8
7985 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7986 {
7987     dVAR;
7988     register regnode *scan;
7989     U8 exact = PSEUDO;
7990 #ifdef EXPERIMENTAL_INPLACESCAN
7991     I32 min = 0;
7992 #endif
7993
7994     GET_RE_DEBUG_FLAGS_DECL;
7995
7996
7997     if (SIZE_ONLY)
7998         return exact;
7999
8000     /* Find last node. */
8001
8002     scan = p;
8003     for (;;) {
8004         regnode * const temp = regnext(scan);
8005 #ifdef EXPERIMENTAL_INPLACESCAN
8006         if (PL_regkind[OP(scan)] == EXACT)
8007             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8008                 return EXACT;
8009 #endif
8010         if ( exact ) {
8011             switch (OP(scan)) {
8012                 case EXACT:
8013                 case EXACTF:
8014                 case EXACTFL:
8015                         if( exact == PSEUDO )
8016                             exact= OP(scan);
8017                         else if ( exact != OP(scan) )
8018                             exact= 0;
8019                 case NOTHING:
8020                     break;
8021                 default:
8022                     exact= 0;
8023             }
8024         }
8025         DEBUG_PARSE_r({
8026             SV * const mysv=sv_newmortal();
8027             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8028             regprop(RExC_rx, mysv, scan);
8029             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8030                 SvPV_nolen_const(mysv),
8031                 REG_NODE_NUM(scan),
8032                 reg_name[exact]);
8033         });
8034         if (temp == NULL)
8035             break;
8036         scan = temp;
8037     }
8038     DEBUG_PARSE_r({
8039         SV * const mysv_val=sv_newmortal();
8040         DEBUG_PARSE_MSG("");
8041         regprop(RExC_rx, mysv_val, val);
8042         PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
8043             SvPV_nolen_const(mysv_val),
8044             REG_NODE_NUM(val),
8045             val - scan
8046         );
8047     });
8048     if (reg_off_by_arg[OP(scan)]) {
8049         ARG_SET(scan, val - scan);
8050     }
8051     else {
8052         NEXT_OFF(scan) = val - scan;
8053     }
8054
8055     return exact;
8056 }
8057 #endif
8058
8059 /*
8060  - regcurly - a little FSA that accepts {\d+,?\d*}
8061  */
8062 STATIC I32
8063 S_regcurly(register const char *s)
8064 {
8065     if (*s++ != '{')
8066         return FALSE;
8067     if (!isDIGIT(*s))
8068         return FALSE;
8069     while (isDIGIT(*s))
8070         s++;
8071     if (*s == ',')
8072         s++;
8073     while (isDIGIT(*s))
8074         s++;
8075     if (*s != '}')
8076         return FALSE;
8077     return TRUE;
8078 }
8079
8080
8081 /*
8082  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8083  */
8084 void
8085 Perl_regdump(pTHX_ const regexp *r)
8086 {
8087 #ifdef DEBUGGING
8088     dVAR;
8089     SV * const sv = sv_newmortal();
8090     SV *dsv= sv_newmortal();
8091
8092     (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
8093
8094     /* Header fields of interest. */
8095     if (r->anchored_substr) {
8096         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8097             RE_SV_DUMPLEN(r->anchored_substr), 30);
8098         PerlIO_printf(Perl_debug_log,
8099                       "anchored %s%s at %"IVdf" ",
8100                       s, RE_SV_TAIL(r->anchored_substr),
8101                       (IV)r->anchored_offset);
8102     } else if (r->anchored_utf8) {
8103         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8104             RE_SV_DUMPLEN(r->anchored_utf8), 30);
8105         PerlIO_printf(Perl_debug_log,
8106                       "anchored utf8 %s%s at %"IVdf" ",
8107                       s, RE_SV_TAIL(r->anchored_utf8),
8108                       (IV)r->anchored_offset);
8109     }
8110     if (r->float_substr) {
8111         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8112             RE_SV_DUMPLEN(r->float_substr), 30);
8113         PerlIO_printf(Perl_debug_log,
8114                       "floating %s%s at %"IVdf"..%"UVuf" ",
8115                       s, RE_SV_TAIL(r->float_substr),
8116                       (IV)r->float_min_offset, (UV)r->float_max_offset);
8117     } else if (r->float_utf8) {
8118         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8119             RE_SV_DUMPLEN(r->float_utf8), 30);
8120         PerlIO_printf(Perl_debug_log,
8121                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8122                       s, RE_SV_TAIL(r->float_utf8),
8123                       (IV)r->float_min_offset, (UV)r->float_max_offset);
8124     }
8125     if (r->check_substr || r->check_utf8)
8126         PerlIO_printf(Perl_debug_log,
8127                       (const char *)
8128                       (r->check_substr == r->float_substr
8129                        && r->check_utf8 == r->float_utf8
8130                        ? "(checking floating" : "(checking anchored"));
8131     if (r->reganch & ROPT_NOSCAN)
8132         PerlIO_printf(Perl_debug_log, " noscan");
8133     if (r->reganch & ROPT_CHECK_ALL)
8134         PerlIO_printf(Perl_debug_log, " isall");
8135     if (r->check_substr || r->check_utf8)
8136         PerlIO_printf(Perl_debug_log, ") ");
8137
8138     if (r->regstclass) {
8139         regprop(r, sv, r->regstclass);
8140         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8141     }
8142     if (r->reganch & ROPT_ANCH) {
8143         PerlIO_printf(Perl_debug_log, "anchored");
8144         if (r->reganch & ROPT_ANCH_BOL)
8145             PerlIO_printf(Perl_debug_log, "(BOL)");
8146         if (r->reganch & ROPT_ANCH_MBOL)
8147             PerlIO_printf(Perl_debug_log, "(MBOL)");
8148         if (r->reganch & ROPT_ANCH_SBOL)
8149             PerlIO_printf(Perl_debug_log, "(SBOL)");
8150         if (r->reganch & ROPT_ANCH_GPOS)
8151             PerlIO_printf(Perl_debug_log, "(GPOS)");
8152         PerlIO_putc(Perl_debug_log, ' ');
8153     }
8154     if (r->reganch & ROPT_GPOS_SEEN)
8155         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", r->gofs);
8156     if (r->reganch & ROPT_SKIP)
8157         PerlIO_printf(Perl_debug_log, "plus ");
8158     if (r->reganch & ROPT_IMPLICIT)
8159         PerlIO_printf(Perl_debug_log, "implicit ");
8160     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
8161     if (r->reganch & ROPT_EVAL_SEEN)
8162         PerlIO_printf(Perl_debug_log, "with eval ");
8163     PerlIO_printf(Perl_debug_log, "\n");
8164 #else
8165     PERL_UNUSED_CONTEXT;
8166     PERL_UNUSED_ARG(r);
8167 #endif  /* DEBUGGING */
8168 }
8169
8170 /*
8171 - regprop - printable representation of opcode
8172 */
8173 void
8174 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8175 {
8176 #ifdef DEBUGGING
8177     dVAR;
8178     register int k;
8179     GET_RE_DEBUG_FLAGS_DECL;
8180
8181     sv_setpvn(sv, "", 0);
8182
8183     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
8184         /* It would be nice to FAIL() here, but this may be called from
8185            regexec.c, and it would be hard to supply pRExC_state. */
8186         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8187     sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8188
8189     k = PL_regkind[OP(o)];
8190
8191     if (k == EXACT) {
8192         SV * const dsv = sv_2mortal(newSVpvs(""));
8193         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8194          * is a crude hack but it may be the best for now since
8195          * we have no flag "this EXACTish node was UTF-8"
8196          * --jhi */
8197         const char * const s =
8198             pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8199                 PL_colors[0], PL_colors[1],
8200                 PERL_PV_ESCAPE_UNI_DETECT |
8201                 PERL_PV_PRETTY_ELIPSES    |
8202                 PERL_PV_PRETTY_LTGT
8203             );
8204         Perl_sv_catpvf(aTHX_ sv, " %s", s );
8205     } else if (k == TRIE) {
8206         /* print the details of the trie in dumpuntil instead, as
8207          * prog->data isn't available here */
8208         const char op = OP(o);
8209         const I32 n = ARG(o);
8210         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8211                (reg_ac_data *)prog->data->data[n] :
8212                NULL;
8213         const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8214             (reg_trie_data*)prog->data->data[n] :
8215             ac->trie;
8216
8217         Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8218         DEBUG_TRIE_COMPILE_r(
8219             Perl_sv_catpvf(aTHX_ sv,
8220                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8221                 (UV)trie->startstate,
8222                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8223                 (UV)trie->wordcount,
8224                 (UV)trie->minlen,
8225                 (UV)trie->maxlen,
8226                 (UV)TRIE_CHARCOUNT(trie),
8227                 (UV)trie->uniquecharcount
8228             )
8229         );
8230         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8231             int i;
8232             int rangestart = -1;
8233             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8234             Perl_sv_catpvf(aTHX_ sv, "[");
8235             for (i = 0; i <= 256; i++) {
8236                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8237                     if (rangestart == -1)
8238                         rangestart = i;
8239                 } else if (rangestart != -1) {
8240                     if (i <= rangestart + 3)
8241                         for (; rangestart < i; rangestart++)
8242                             put_byte(sv, rangestart);
8243                     else {
8244                         put_byte(sv, rangestart);
8245                         sv_catpvs(sv, "-");
8246                         put_byte(sv, i - 1);
8247                     }
8248                     rangestart = -1;
8249                 }
8250             }
8251             Perl_sv_catpvf(aTHX_ sv, "]");
8252         }
8253
8254     } else if (k == CURLY) {
8255         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8256             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8257         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8258     }
8259     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
8260         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8261     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
8262         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
8263     else if (k == GOSUB)
8264         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8265     else if (k == VERB) {
8266         if (!o->flags)
8267             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8268                 (SV*)prog->data->data[ ARG( o ) ]);
8269     } else if (k == LOGICAL)
8270         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
8271     else if (k == ANYOF) {
8272         int i, rangestart = -1;
8273         const U8 flags = ANYOF_FLAGS(o);
8274
8275         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8276         static const char * const anyofs[] = {
8277             "\\w",
8278             "\\W",
8279             "\\s",
8280             "\\S",
8281             "\\d",
8282             "\\D",
8283             "[:alnum:]",
8284             "[:^alnum:]",
8285             "[:alpha:]",
8286             "[:^alpha:]",
8287             "[:ascii:]",
8288             "[:^ascii:]",
8289             "[:ctrl:]",
8290             "[:^ctrl:]",
8291             "[:graph:]",
8292             "[:^graph:]",
8293             "[:lower:]",
8294             "[:^lower:]",
8295             "[:print:]",
8296             "[:^print:]",
8297             "[:punct:]",
8298             "[:^punct:]",
8299             "[:upper:]",
8300             "[:^upper:]",
8301             "[:xdigit:]",
8302             "[:^xdigit:]",
8303             "[:space:]",
8304             "[:^space:]",
8305             "[:blank:]",
8306             "[:^blank:]"
8307         };
8308
8309         if (flags & ANYOF_LOCALE)
8310             sv_catpvs(sv, "{loc}");
8311         if (flags & ANYOF_FOLD)
8312             sv_catpvs(sv, "{i}");
8313         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8314         if (flags & ANYOF_INVERT)
8315             sv_catpvs(sv, "^");
8316         for (i = 0; i <= 256; i++) {
8317             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8318                 if (rangestart == -1)
8319                     rangestart = i;
8320             } else if (rangestart != -1) {
8321                 if (i <= rangestart + 3)
8322                     for (; rangestart < i; rangestart++)
8323                         put_byte(sv, rangestart);
8324                 else {
8325                     put_byte(sv, rangestart);
8326                     sv_catpvs(sv, "-");
8327                     put_byte(sv, i - 1);
8328                 }
8329                 rangestart = -1;
8330             }
8331         }
8332
8333         if (o->flags & ANYOF_CLASS)
8334             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8335                 if (ANYOF_CLASS_TEST(o,i))
8336                     sv_catpv(sv, anyofs[i]);
8337
8338         if (flags & ANYOF_UNICODE)
8339             sv_catpvs(sv, "{unicode}");
8340         else if (flags & ANYOF_UNICODE_ALL)
8341             sv_catpvs(sv, "{unicode_all}");
8342
8343         {
8344             SV *lv;
8345             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8346
8347             if (lv) {
8348                 if (sw) {
8349                     U8 s[UTF8_MAXBYTES_CASE+1];
8350
8351                     for (i = 0; i <= 256; i++) { /* just the first 256 */
8352                         uvchr_to_utf8(s, i);
8353
8354                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
8355                             if (rangestart == -1)
8356                                 rangestart = i;
8357                         } else if (rangestart != -1) {
8358                             if (i <= rangestart + 3)
8359                                 for (; rangestart < i; rangestart++) {
8360                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
8361                                     U8 *p;
8362                                     for(p = s; p < e; p++)
8363                                         put_byte(sv, *p);
8364                                 }
8365                             else {
8366                                 const U8 *e = uvchr_to_utf8(s,rangestart);
8367                                 U8 *p;
8368                                 for (p = s; p < e; p++)
8369                                     put_byte(sv, *p);
8370                                 sv_catpvs(sv, "-");
8371                                 e = uvchr_to_utf8(s, i-1);
8372                                 for (p = s; p < e; p++)
8373                                     put_byte(sv, *p);
8374                                 }
8375                                 rangestart = -1;
8376                             }
8377                         }
8378
8379                     sv_catpvs(sv, "..."); /* et cetera */
8380                 }
8381
8382                 {
8383                     char *s = savesvpv(lv);
8384                     char * const origs = s;
8385
8386                     while (*s && *s != '\n')
8387                         s++;
8388
8389                     if (*s == '\n') {
8390                         const char * const t = ++s;
8391
8392                         while (*s) {
8393                             if (*s == '\n')
8394                                 *s = ' ';
8395                             s++;
8396                         }
8397                         if (s[-1] == ' ')
8398                             s[-1] = 0;
8399
8400                         sv_catpv(sv, t);
8401                     }
8402
8403                     Safefree(origs);
8404                 }
8405             }
8406         }
8407
8408         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8409     }
8410     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8411         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8412 #else
8413     PERL_UNUSED_CONTEXT;
8414     PERL_UNUSED_ARG(sv);
8415     PERL_UNUSED_ARG(o);
8416     PERL_UNUSED_ARG(prog);
8417 #endif  /* DEBUGGING */
8418 }
8419
8420 SV *
8421 Perl_re_intuit_string(pTHX_ regexp *prog)
8422 {                               /* Assume that RE_INTUIT is set */
8423     dVAR;
8424     GET_RE_DEBUG_FLAGS_DECL;
8425     PERL_UNUSED_CONTEXT;
8426
8427     DEBUG_COMPILE_r(
8428         {
8429             const char * const s = SvPV_nolen_const(prog->check_substr
8430                       ? prog->check_substr : prog->check_utf8);
8431
8432             if (!PL_colorset) reginitcolors();
8433             PerlIO_printf(Perl_debug_log,
8434                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8435                       PL_colors[4],
8436                       prog->check_substr ? "" : "utf8 ",
8437                       PL_colors[5],PL_colors[0],
8438                       s,
8439                       PL_colors[1],
8440                       (strlen(s) > 60 ? "..." : ""));
8441         } );
8442
8443     return prog->check_substr ? prog->check_substr : prog->check_utf8;
8444 }
8445
8446 /*
8447    pregfree - free a regexp
8448
8449    See regdupe below if you change anything here.
8450 */
8451
8452 void
8453 Perl_pregfree(pTHX_ struct regexp *r)
8454 {
8455     dVAR;
8456
8457     GET_RE_DEBUG_FLAGS_DECL;
8458
8459     if (!r || (--r->refcnt > 0))
8460         return;
8461     DEBUG_COMPILE_r({
8462         if (!PL_colorset)
8463             reginitcolors();
8464         {
8465             SV *dsv= sv_newmortal();
8466             RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8467                 dsv, r->precomp, r->prelen, 60);
8468             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8469                 PL_colors[4],PL_colors[5],s);
8470         }
8471     });
8472
8473     /* gcov results gave these as non-null 100% of the time, so there's no
8474        optimisation in checking them before calling Safefree  */
8475     Safefree(r->precomp);
8476     Safefree(r->offsets);             /* 20010421 MJD */
8477     RX_MATCH_COPY_FREE(r);
8478 #ifdef PERL_OLD_COPY_ON_WRITE
8479     if (r->saved_copy)
8480         SvREFCNT_dec(r->saved_copy);
8481 #endif
8482     if (r->substrs) {
8483         if (r->anchored_substr)
8484             SvREFCNT_dec(r->anchored_substr);
8485         if (r->anchored_utf8)
8486             SvREFCNT_dec(r->anchored_utf8);
8487         if (r->float_substr)
8488             SvREFCNT_dec(r->float_substr);
8489         if (r->float_utf8)
8490             SvREFCNT_dec(r->float_utf8);
8491         Safefree(r->substrs);
8492     }
8493     if (r->paren_names)
8494             SvREFCNT_dec(r->paren_names);
8495     if (r->data) {
8496         int n = r->data->count;
8497         PAD* new_comppad = NULL;
8498         PAD* old_comppad;
8499         PADOFFSET refcnt;
8500
8501         while (--n >= 0) {
8502           /* If you add a ->what type here, update the comment in regcomp.h */
8503             switch (r->data->what[n]) {
8504             case 's':
8505             case 'S':
8506                 SvREFCNT_dec((SV*)r->data->data[n]);
8507                 break;
8508             case 'f':
8509                 Safefree(r->data->data[n]);
8510                 break;
8511             case 'p':
8512                 new_comppad = (AV*)r->data->data[n];
8513                 break;
8514             case 'o':
8515                 if (new_comppad == NULL)
8516                     Perl_croak(aTHX_ "panic: pregfree comppad");
8517                 PAD_SAVE_LOCAL(old_comppad,
8518                     /* Watch out for global destruction's random ordering. */
8519                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8520                 );
8521                 OP_REFCNT_LOCK;
8522                 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8523                 OP_REFCNT_UNLOCK;
8524                 if (!refcnt)
8525                     op_free((OP_4tree*)r->data->data[n]);
8526
8527                 PAD_RESTORE_LOCAL(old_comppad);
8528                 SvREFCNT_dec((SV*)new_comppad);
8529                 new_comppad = NULL;
8530                 break;
8531             case 'n':
8532                 break;
8533             case 'T':
8534                 { /* Aho Corasick add-on structure for a trie node.
8535                      Used in stclass optimization only */
8536                     U32 refcount;
8537                     reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8538                     OP_REFCNT_LOCK;
8539                     refcount = --aho->refcount;
8540                     OP_REFCNT_UNLOCK;
8541                     if ( !refcount ) {
8542                         Safefree(aho->states);
8543                         Safefree(aho->fail);
8544                         aho->trie=NULL; /* not necessary to free this as it is
8545                                            handled by the 't' case */
8546                         Safefree(r->data->data[n]); /* do this last!!!! */
8547                         Safefree(r->regstclass);
8548                     }
8549                 }
8550                 break;
8551             case 't':
8552                 {
8553                     /* trie structure. */
8554                     U32 refcount;
8555                     reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8556                     OP_REFCNT_LOCK;
8557                     refcount = --trie->refcount;
8558                     OP_REFCNT_UNLOCK;
8559                     if ( !refcount ) {
8560                         Safefree(trie->charmap);
8561                         if (trie->widecharmap)
8562                             SvREFCNT_dec((SV*)trie->widecharmap);
8563                         Safefree(trie->states);
8564                         Safefree(trie->trans);
8565                         if (trie->bitmap)
8566                             Safefree(trie->bitmap);
8567                         if (trie->wordlen)
8568                             Safefree(trie->wordlen);
8569                         if (trie->jump)
8570                             Safefree(trie->jump);
8571                         if (trie->nextword)
8572                             Safefree(trie->nextword);
8573 #ifdef DEBUGGING
8574                         if (trie->words)
8575                             SvREFCNT_dec((SV*)trie->words);
8576                         if (trie->revcharmap)
8577                             SvREFCNT_dec((SV*)trie->revcharmap);
8578 #endif
8579                         Safefree(r->data->data[n]); /* do this last!!!! */
8580                     }
8581                 }
8582                 break;
8583             default:
8584                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
8585             }
8586         }
8587         Safefree(r->data->what);
8588         Safefree(r->data);
8589     }
8590     Safefree(r->startp);
8591     Safefree(r->endp);
8592     if (r->swap) {
8593         Safefree(r->swap->startp);
8594         Safefree(r->swap->endp);
8595         Safefree(r->swap);
8596     }
8597     Safefree(r);
8598 }
8599
8600 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8601 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8602 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8603 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
8604
8605 /*
8606    regdupe - duplicate a regexp.
8607
8608    This routine is called by sv.c's re_dup and is expected to clone a
8609    given regexp structure. It is a no-op when not under USE_ITHREADS.
8610    (Originally this *was* re_dup() for change history see sv.c)
8611
8612    See pregfree() above if you change anything here.
8613 */
8614 #if defined(USE_ITHREADS)
8615 regexp *
8616 Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8617 {
8618     dVAR;
8619     REGEXP *ret;
8620     int i, len, npar;
8621     struct reg_substr_datum *s;
8622
8623     if (!r)
8624         return (REGEXP *)NULL;
8625
8626     if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8627         return ret;
8628
8629     len = r->offsets[0];
8630     npar = r->nparens+1;
8631
8632     Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8633     Copy(r->program, ret->program, len+1, regnode);
8634
8635     Newx(ret->startp, npar, I32);
8636     Copy(r->startp, ret->startp, npar, I32);
8637     Newx(ret->endp, npar, I32);
8638     Copy(r->startp, ret->startp, npar, I32);
8639     if(r->swap) {
8640         Newx(ret->swap, 1, regexp_paren_ofs);
8641         /* no need to copy these */
8642         Newx(ret->swap->startp, npar, I32);
8643         Newx(ret->swap->endp, npar, I32);
8644     } else {
8645         ret->swap = NULL;
8646     }
8647
8648     Newx(ret->substrs, 1, struct reg_substr_data);
8649     for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8650         s->min_offset = r->substrs->data[i].min_offset;
8651         s->max_offset = r->substrs->data[i].max_offset;
8652         s->end_shift  = r->substrs->data[i].end_shift;
8653         s->substr     = sv_dup_inc(r->substrs->data[i].substr, param);
8654         s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8655     }
8656
8657     ret->regstclass = NULL;
8658     if (r->data) {
8659         struct reg_data *d;
8660         const int count = r->data->count;
8661         int i;
8662
8663         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8664                 char, struct reg_data);
8665         Newx(d->what, count, U8);
8666
8667         d->count = count;
8668         for (i = 0; i < count; i++) {
8669             d->what[i] = r->data->what[i];
8670             switch (d->what[i]) {
8671                 /* legal options are one of: sSfpont
8672                    see also regcomp.h and pregfree() */
8673             case 's':
8674             case 'S':
8675                 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8676                 break;
8677             case 'p':
8678                 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8679                 break;
8680             case 'f':
8681                 /* This is cheating. */
8682                 Newx(d->data[i], 1, struct regnode_charclass_class);
8683                 StructCopy(r->data->data[i], d->data[i],
8684                             struct regnode_charclass_class);
8685                 ret->regstclass = (regnode*)d->data[i];
8686                 break;
8687             case 'o':
8688                 /* Compiled op trees are readonly, and can thus be
8689                    shared without duplication. */
8690                 OP_REFCNT_LOCK;
8691                 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8692                 OP_REFCNT_UNLOCK;
8693                 break;
8694             case 'n':
8695                 d->data[i] = r->data->data[i];
8696                 break;
8697             case 't':
8698                 d->data[i] = r->data->data[i];
8699                 OP_REFCNT_LOCK;
8700                 ((reg_trie_data*)d->data[i])->refcount++;
8701                 OP_REFCNT_UNLOCK;
8702                 break;
8703             case 'T':
8704                 d->data[i] = r->data->data[i];
8705                 OP_REFCNT_LOCK;
8706                 ((reg_ac_data*)d->data[i])->refcount++;
8707                 OP_REFCNT_UNLOCK;
8708                 /* Trie stclasses are readonly and can thus be shared
8709                  * without duplication. We free the stclass in pregfree
8710                  * when the corresponding reg_ac_data struct is freed.
8711                  */
8712                 ret->regstclass= r->regstclass;
8713                 break;
8714             default:
8715                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8716             }
8717         }
8718
8719         ret->data = d;
8720     }
8721     else
8722         ret->data = NULL;
8723
8724     Newx(ret->offsets, 2*len+1, U32);
8725     Copy(r->offsets, ret->offsets, 2*len+1, U32);
8726
8727     ret->precomp        = SAVEPVN(r->precomp, r->prelen);
8728     ret->refcnt         = r->refcnt;
8729     ret->minlen         = r->minlen;
8730     ret->minlenret      = r->minlenret;
8731     ret->prelen         = r->prelen;
8732     ret->nparens        = r->nparens;
8733     ret->lastparen      = r->lastparen;
8734     ret->lastcloseparen = r->lastcloseparen;
8735     ret->reganch        = r->reganch;
8736
8737     ret->sublen         = r->sublen;
8738
8739     ret->engine         = r->engine;
8740
8741     ret->paren_names    = hv_dup_inc(r->paren_names, param);
8742
8743     if (RX_MATCH_COPIED(ret))
8744         ret->subbeg  = SAVEPVN(r->subbeg, r->sublen);
8745     else
8746         ret->subbeg = NULL;
8747 #ifdef PERL_OLD_COPY_ON_WRITE
8748     ret->saved_copy = NULL;
8749 #endif
8750
8751     ptr_table_store(PL_ptr_table, r, ret);
8752     return ret;
8753 }
8754 #endif
8755
8756 /*
8757    reg_stringify()
8758
8759    converts a regexp embedded in a MAGIC struct to its stringified form,
8760    caching the converted form in the struct and returns the cached
8761    string.
8762
8763    If lp is nonnull then it is used to return the length of the
8764    resulting string
8765
8766    If flags is nonnull and the returned string contains UTF8 then
8767    (flags & 1) will be true.
8768
8769    If haseval is nonnull then it is used to return whether the pattern
8770    contains evals.
8771
8772    Normally called via macro:
8773
8774         CALLREG_STRINGIFY(mg,0,0);
8775
8776    And internally with
8777
8778         CALLREG_AS_STR(mg,lp,flags,haseval)
8779
8780    See sv_2pv_flags() in sv.c for an example of internal usage.
8781
8782  */
8783
8784 char *
8785 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8786     dVAR;
8787     const regexp * const re = (regexp *)mg->mg_obj;
8788
8789     if (!mg->mg_ptr) {
8790         const char *fptr = "msix";
8791         char reflags[6];
8792         char ch;
8793         int left = 0;
8794         int right = 4;
8795         bool need_newline = 0;
8796         U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8797
8798         while((ch = *fptr++)) {
8799             if(reganch & 1) {
8800                 reflags[left++] = ch;
8801             }
8802             else {
8803                 reflags[right--] = ch;
8804             }
8805             reganch >>= 1;
8806         }
8807         if(left != 4) {
8808             reflags[left] = '-';
8809             left = 5;
8810         }
8811
8812         mg->mg_len = re->prelen + 4 + left;
8813         /*
8814          * If /x was used, we have to worry about a regex ending with a
8815          * comment later being embedded within another regex. If so, we don't
8816          * want this regex's "commentization" to leak out to the right part of
8817          * the enclosing regex, we must cap it with a newline.
8818          *
8819          * So, if /x was used, we scan backwards from the end of the regex. If
8820          * we find a '#' before we find a newline, we need to add a newline
8821          * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8822          * we don't need to add anything.  -jfriedl
8823          */
8824         if (PMf_EXTENDED & re->reganch) {
8825             const char *endptr = re->precomp + re->prelen;
8826             while (endptr >= re->precomp) {
8827                 const char c = *(endptr--);
8828                 if (c == '\n')
8829                     break; /* don't need another */
8830                 if (c == '#') {
8831                     /* we end while in a comment, so we need a newline */
8832                     mg->mg_len++; /* save space for it */
8833                     need_newline = 1; /* note to add it */
8834                     break;
8835                 }
8836             }
8837         }
8838
8839         Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8840         mg->mg_ptr[0] = '(';
8841         mg->mg_ptr[1] = '?';
8842         Copy(reflags, mg->mg_ptr+2, left, char);
8843         *(mg->mg_ptr+left+2) = ':';
8844         Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8845         if (need_newline)
8846             mg->mg_ptr[mg->mg_len - 2] = '\n';
8847         mg->mg_ptr[mg->mg_len - 1] = ')';
8848         mg->mg_ptr[mg->mg_len] = 0;
8849     }
8850     if (haseval)
8851         *haseval = re->program[0].next_off;
8852     if (flags)
8853         *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8854
8855     if (lp)
8856         *lp = mg->mg_len;
8857     return mg->mg_ptr;
8858 }
8859
8860
8861 #ifndef PERL_IN_XSUB_RE
8862 /*
8863  - regnext - dig the "next" pointer out of a node
8864  */
8865 regnode *
8866 Perl_regnext(pTHX_ register regnode *p)
8867 {
8868     dVAR;
8869     register I32 offset;
8870
8871     if (p == &PL_regdummy)
8872         return(NULL);
8873
8874     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8875     if (offset == 0)
8876         return(NULL);
8877
8878     return(p+offset);
8879 }
8880 #endif
8881
8882 STATIC void
8883 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8884 {
8885     va_list args;
8886     STRLEN l1 = strlen(pat1);
8887     STRLEN l2 = strlen(pat2);
8888     char buf[512];
8889     SV *msv;
8890     const char *message;
8891
8892     if (l1 > 510)
8893         l1 = 510;
8894     if (l1 + l2 > 510)
8895         l2 = 510 - l1;
8896     Copy(pat1, buf, l1 , char);
8897     Copy(pat2, buf + l1, l2 , char);
8898     buf[l1 + l2] = '\n';
8899     buf[l1 + l2 + 1] = '\0';
8900 #ifdef I_STDARG
8901     /* ANSI variant takes additional second argument */
8902     va_start(args, pat2);
8903 #else
8904     va_start(args);
8905 #endif
8906     msv = vmess(buf, &args);
8907     va_end(args);
8908     message = SvPV_const(msv,l1);
8909     if (l1 > 512)
8910         l1 = 512;
8911     Copy(message, buf, l1 , char);
8912     buf[l1-1] = '\0';                   /* Overwrite \n */
8913     Perl_croak(aTHX_ "%s", buf);
8914 }
8915
8916 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
8917
8918 #ifndef PERL_IN_XSUB_RE
8919 void
8920 Perl_save_re_context(pTHX)
8921 {
8922     dVAR;
8923
8924     struct re_save_state *state;
8925
8926     SAVEVPTR(PL_curcop);
8927     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8928
8929     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8930     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8931     SSPUSHINT(SAVEt_RE_STATE);
8932
8933     Copy(&PL_reg_state, state, 1, struct re_save_state);
8934
8935     PL_reg_start_tmp = 0;
8936     PL_reg_start_tmpl = 0;
8937     PL_reg_oldsaved = NULL;
8938     PL_reg_oldsavedlen = 0;
8939     PL_reg_maxiter = 0;
8940     PL_reg_leftiter = 0;
8941     PL_reg_poscache = NULL;
8942     PL_reg_poscache_size = 0;
8943 #ifdef PERL_OLD_COPY_ON_WRITE
8944     PL_nrs = NULL;
8945 #endif
8946
8947     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8948     if (PL_curpm) {
8949         const REGEXP * const rx = PM_GETRE(PL_curpm);
8950         if (rx) {
8951             U32 i;
8952             for (i = 1; i <= rx->nparens; i++) {
8953                 char digits[TYPE_CHARS(long)];
8954                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
8955                 GV *const *const gvp
8956                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8957
8958                 if (gvp) {
8959                     GV * const gv = *gvp;
8960                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8961                         save_scalar(gv);
8962                 }
8963             }
8964         }
8965     }
8966 }
8967 #endif
8968
8969 static void
8970 clear_re(pTHX_ void *r)
8971 {
8972     dVAR;
8973     ReREFCNT_dec((regexp *)r);
8974 }
8975
8976 #ifdef DEBUGGING
8977
8978 STATIC void
8979 S_put_byte(pTHX_ SV *sv, int c)
8980 {
8981     if (isCNTRL(c) || c == 255 || !isPRINT(c))
8982         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8983     else if (c == '-' || c == ']' || c == '\\' || c == '^')
8984         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8985     else
8986         Perl_sv_catpvf(aTHX_ sv, "%c", c);
8987 }
8988
8989
8990 #define CLEAR_OPTSTART \
8991     if (optstart) STMT_START { \
8992             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
8993             optstart=NULL; \
8994     } STMT_END
8995
8996 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
8997
8998 STATIC const regnode *
8999 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9000             const regnode *last, const regnode *plast,
9001             SV* sv, I32 indent, U32 depth)
9002 {
9003     dVAR;
9004     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
9005     register const regnode *next;
9006     const regnode *optstart= NULL;
9007     GET_RE_DEBUG_FLAGS_DECL;
9008
9009 #ifdef DEBUG_DUMPUNTIL
9010     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9011         last ? last-start : 0,plast ? plast-start : 0);
9012 #endif
9013
9014     if (plast && plast < last)
9015         last= plast;
9016
9017     while (PL_regkind[op] != END && (!last || node < last)) {
9018         /* While that wasn't END last time... */
9019
9020         NODE_ALIGN(node);
9021         op = OP(node);
9022         if (op == CLOSE || op == WHILEM)
9023             indent--;
9024         next = regnext((regnode *)node);
9025
9026         /* Where, what. */
9027         if (OP(node) == OPTIMIZED) {
9028             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9029                 optstart = node;
9030             else
9031                 goto after_print;
9032         } else
9033             CLEAR_OPTSTART;
9034
9035         regprop(r, sv, node);
9036         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9037                       (int)(2*indent + 1), "", SvPVX_const(sv));
9038
9039         if (OP(node) != OPTIMIZED) {
9040             if (next == NULL)           /* Next ptr. */
9041                 PerlIO_printf(Perl_debug_log, "(0)");
9042             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9043                 PerlIO_printf(Perl_debug_log, "(FAIL)");
9044             else
9045                 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
9046
9047             /*if (PL_regkind[(U8)op]  != TRIE)*/
9048                 (void)PerlIO_putc(Perl_debug_log, '\n');
9049         }
9050
9051       after_print:
9052         if (PL_regkind[(U8)op] == BRANCHJ) {
9053             assert(next);
9054             {
9055                 register const regnode *nnode = (OP(next) == LONGJMP
9056                                              ? regnext((regnode *)next)
9057                                              : next);
9058                 if (last && nnode > last)
9059                     nnode = last;
9060                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9061             }
9062         }
9063         else if (PL_regkind[(U8)op] == BRANCH) {
9064             assert(next);
9065             DUMPUNTIL(NEXTOPER(node), next);
9066         }
9067         else if ( PL_regkind[(U8)op]  == TRIE ) {
9068             const regnode *this_trie = node;
9069             const char op = OP(node);
9070             const I32 n = ARG(node);
9071             const reg_ac_data * const ac = op>=AHOCORASICK ?
9072                (reg_ac_data *)r->data->data[n] :
9073                NULL;
9074             const reg_trie_data * const trie = op<AHOCORASICK ?
9075                 (reg_trie_data*)r->data->data[n] :
9076                 ac->trie;
9077             const regnode *nextbranch= NULL;
9078             I32 word_idx;
9079             sv_setpvn(sv, "", 0);
9080             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9081                 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
9082
9083                 PerlIO_printf(Perl_debug_log, "%*s%s ",
9084                    (int)(2*(indent+3)), "",
9085                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9086                             PL_colors[0], PL_colors[1],
9087                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9088                             PERL_PV_PRETTY_ELIPSES    |
9089                             PERL_PV_PRETTY_LTGT
9090                             )
9091                             : "???"
9092                 );
9093                 if (trie->jump) {
9094                     U16 dist= trie->jump[word_idx+1];
9095                     PerlIO_printf(Perl_debug_log, "(%u)\n",
9096                         (dist ? this_trie + dist : next) - start);
9097                     if (dist) {
9098                         if (!nextbranch)
9099                             nextbranch= this_trie + trie->jump[0];
9100                         DUMPUNTIL(this_trie + dist, nextbranch);
9101                     }
9102                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9103                         nextbranch= regnext((regnode *)nextbranch);
9104                 } else {
9105                     PerlIO_printf(Perl_debug_log, "\n");
9106                 }
9107             }
9108             if (last && next > last)
9109                 node= last;
9110             else
9111                 node= next;
9112         }
9113         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
9114             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9115                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9116         }
9117         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9118             assert(next);
9119             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9120         }
9121         else if ( op == PLUS || op == STAR) {
9122             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9123         }
9124         else if (op == ANYOF) {
9125             /* arglen 1 + class block */
9126             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9127                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9128             node = NEXTOPER(node);
9129         }
9130         else if (PL_regkind[(U8)op] == EXACT) {
9131             /* Literal string, where present. */
9132             node += NODE_SZ_STR(node) - 1;
9133             node = NEXTOPER(node);
9134         }
9135         else {
9136             node = NEXTOPER(node);
9137             node += regarglen[(U8)op];
9138         }
9139         if (op == CURLYX || op == OPEN)
9140             indent++;
9141     }
9142     CLEAR_OPTSTART;
9143 #ifdef DEBUG_DUMPUNTIL
9144     PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9145 #endif
9146     return node;
9147 }
9148
9149 #endif  /* DEBUGGING */
9150
9151 /*
9152  * Local variables:
9153  * c-indentation-style: bsd
9154  * c-basic-offset: 4
9155  * indent-tabs-mode: t
9156  * End:
9157  *
9158  * ex: set ts=8 sts=4 sw=4 noet:
9159  */