EBCDIC: this seems to calm the last of the
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U16         flags16;                /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags16    (pRExC_state->flags16)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED;  \
396         Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
398     } STMT_END                                                               \
399
400
401 #define vWARN2(loc, m, a1)                                                   \
402     STMT_START {                                                             \
403         IV offset = loc - RExC_precomp;          \
404         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405                  a1,                                                         \
406                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
407     } STMT_END
408
409 #define vWARN3(loc, m, a1, a2)                                               \
410     STMT_START {                                                             \
411       IV offset = loc - RExC_precomp;        \
412         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,                    \
413                  a1, a2,                                                     \
414                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
415     } STMT_END
416
417 #define vWARN4(loc, m, a1, a2, a3)                                           \
418     STMT_START {                                                             \
419       IV offset = loc - RExC_precomp;            \
420         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421                  a1, a2, a3,                                                 \
422                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
423     } STMT_END
424
425 /* used for the parse_flags section for (?c) -- japhy */
426 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
427   STMT_START {                                                   \
428       IV offset = loc - RExC_precomp;   \
429         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,      \
430                  a1, a2, a3, a4,                                 \
431                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
432     } STMT_END
433
434
435 /* Allow for side effects in s */
436 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437
438 /* Macros for recording node offsets.   20001227 mjd@plover.com 
439  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
440  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
441  * Element 0 holds the number n.
442  */
443
444 #define MJD_OFFSET_DEBUG(x)
445 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448 #  define Set_Node_Offset_To_R(node,byte)                           \
449    STMT_START {                                        \
450      if (! SIZE_ONLY) {                                  \
451        if((node) < 0) {                   \
452          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453        } else {                                                        \
454          RExC_offsets[2*(node)-1] = (byte);                               \
455        }                                                               \
456      }                                                                 \
457    } STMT_END
458
459 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462 #  define Set_Node_Length_To_R(node,len)                            \
463    STMT_START {                                        \
464      if (! SIZE_ONLY) {                                  \
465        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466        if((node) < 0) {                   \
467          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468        } else {                                                        \
469          RExC_offsets[2*(node)] = (len);                               \
470        }                                                               \
471      }                                                                 \
472    } STMT_END
473
474 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
476 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
477
478 /* Get offsets and lengths */
479 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
482 static void clear_re(pTHX_ void *r);
483
484 /* Mark that we cannot extend a found fixed substring at this point.
485    Updata the longest found anchored substring and the longest found
486    floating substrings if needed. */
487
488 STATIC void
489 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490 {
491     STRLEN l = CHR_SVLEN(data->last_found);
492     STRLEN old_l = CHR_SVLEN(*data->longest);
493
494     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495         sv_setsv(*data->longest, data->last_found);
496         if (*data->longest == data->longest_fixed) {
497             data->offset_fixed = l ? data->last_start_min : data->pos_min;
498             if (data->flags & SF_BEFORE_EOL)
499                 data->flags
500                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501             else
502                 data->flags &= ~SF_FIX_BEFORE_EOL;
503         }
504         else {
505             data->offset_float_min = l ? data->last_start_min : data->pos_min;
506             data->offset_float_max = (l
507                                       ? data->last_start_max
508                                       : data->pos_min + data->pos_delta);
509             if (data->flags & SF_BEFORE_EOL)
510                 data->flags
511                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512             else
513                 data->flags &= ~SF_FL_BEFORE_EOL;
514         }
515     }
516     SvCUR_set(data->last_found, 0);
517     data->last_end = -1;
518     data->flags &= ~SF_BEFORE_EOL;
519 }
520
521 /* Can match anything (initialization) */
522 STATIC void
523 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 {
525     ANYOF_CLASS_ZERO(cl);
526     ANYOF_BITMAP_SETALL(cl);
527     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528     if (LOC)
529         cl->flags |= ANYOF_LOCALE;
530 }
531
532 /* Can match anything (initialization) */
533 STATIC int
534 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535 {
536     int value;
537
538     for (value = 0; value <= ANYOF_MAX; value += 2)
539         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540             return 1;
541     if (!(cl->flags & ANYOF_UNICODE_ALL))
542         return 0;
543     if (!ANYOF_BITMAP_TESTALLSET(cl))
544         return 0;
545     return 1;
546 }
547
548 /* Can match anything (initialization) */
549 STATIC void
550 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 {
552     Zero(cl, 1, struct regnode_charclass_class);
553     cl->type = ANYOF;
554     cl_anything(pRExC_state, cl);
555 }
556
557 STATIC void
558 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 {
560     Zero(cl, 1, struct regnode_charclass_class);
561     cl->type = ANYOF;
562     cl_anything(pRExC_state, cl);
563     if (LOC)
564         cl->flags |= ANYOF_LOCALE;
565 }
566
567 /* 'And' a given class with another one.  Can create false positives */
568 /* We assume that cl is not inverted */
569 STATIC void
570 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571          struct regnode_charclass_class *and_with)
572 {
573     if (!(and_with->flags & ANYOF_CLASS)
574         && !(cl->flags & ANYOF_CLASS)
575         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576         && !(and_with->flags & ANYOF_FOLD)
577         && !(cl->flags & ANYOF_FOLD)) {
578         int i;
579
580         if (and_with->flags & ANYOF_INVERT)
581             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582                 cl->bitmap[i] &= ~and_with->bitmap[i];
583         else
584             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585                 cl->bitmap[i] &= and_with->bitmap[i];
586     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587     if (!(and_with->flags & ANYOF_EOS))
588         cl->flags &= ~ANYOF_EOS;
589
590     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591         cl->flags &= ~ANYOF_UNICODE_ALL;
592         cl->flags |= ANYOF_UNICODE;
593         ARG_SET(cl, ARG(and_with));
594     }
595     if (!(and_with->flags & ANYOF_UNICODE_ALL))
596         cl->flags &= ~ANYOF_UNICODE_ALL;
597     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598         cl->flags &= ~ANYOF_UNICODE;
599 }
600
601 /* 'OR' a given class with another one.  Can create false positives */
602 /* We assume that cl is not inverted */
603 STATIC void
604 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
605 {
606     if (or_with->flags & ANYOF_INVERT) {
607         /* We do not use
608          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609          *   <= (B1 | !B2) | (CL1 | !CL2)
610          * which is wasteful if CL2 is small, but we ignore CL2:
611          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612          * XXXX Can we handle case-fold?  Unclear:
613          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615          */
616         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617              && !(or_with->flags & ANYOF_FOLD)
618              && !(cl->flags & ANYOF_FOLD) ) {
619             int i;
620
621             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622                 cl->bitmap[i] |= ~or_with->bitmap[i];
623         } /* XXXX: logic is complicated otherwise */
624         else {
625             cl_anything(pRExC_state, cl);
626         }
627     } else {
628         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
630              && (!(or_with->flags & ANYOF_FOLD)
631                  || (cl->flags & ANYOF_FOLD)) ) {
632             int i;
633
634             /* OR char bitmap and class bitmap separately */
635             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636                 cl->bitmap[i] |= or_with->bitmap[i];
637             if (or_with->flags & ANYOF_CLASS) {
638                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639                     cl->classflags[i] |= or_with->classflags[i];
640                 cl->flags |= ANYOF_CLASS;
641             }
642         }
643         else { /* XXXX: logic is complicated, leave it along for a moment. */
644             cl_anything(pRExC_state, cl);
645         }
646     }
647     if (or_with->flags & ANYOF_EOS)
648         cl->flags |= ANYOF_EOS;
649
650     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651         ARG(cl) != ARG(or_with)) {
652         cl->flags |= ANYOF_UNICODE_ALL;
653         cl->flags &= ~ANYOF_UNICODE;
654     }
655     if (or_with->flags & ANYOF_UNICODE_ALL) {
656         cl->flags |= ANYOF_UNICODE_ALL;
657         cl->flags &= ~ANYOF_UNICODE;
658     }
659 }
660
661 /*
662  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663  * These need to be revisited when a newer toolchain becomes available.
664  */
665 #if defined(__sparc64__) && defined(__GNUC__)
666 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667 #       undef  SPARC64_GCC_WORKAROUND
668 #       define SPARC64_GCC_WORKAROUND 1
669 #   endif
670 #endif
671
672 /* REx optimizer.  Converts nodes into quickier variants "in place".
673    Finds fixed substrings.  */
674
675 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676    to the position after last scanned or to NULL. */
677
678 STATIC I32
679 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
680                         /* scanp: Start here (read-write). */
681                         /* deltap: Write maxlen-minlen here. */
682                         /* last: Stop before this one. */
683 {
684     I32 min = 0, pars = 0, code;
685     regnode *scan = *scanp, *next;
686     I32 delta = 0;
687     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
688     int is_inf_internal = 0;            /* The studied chunk is infinite */
689     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690     scan_data_t data_fake;
691     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
692
693     while (scan && OP(scan) != END && scan < last) {
694         /* Peephole optimizer: */
695
696         if (PL_regkind[(U8)OP(scan)] == EXACT) {
697             /* Merge several consecutive EXACTish nodes into one. */
698             regnode *n = regnext(scan);
699             U32 stringok = 1;
700 #ifdef DEBUGGING
701             regnode *stop = scan;
702 #endif
703
704             next = scan + NODE_SZ_STR(scan);
705             /* Skip NOTHING, merge EXACT*. */
706             while (n &&
707                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
708                      (stringok && (OP(n) == OP(scan))))
709                    && NEXT_OFF(n)
710                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711                 if (OP(n) == TAIL || n > next)
712                     stringok = 0;
713                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
714                     NEXT_OFF(scan) += NEXT_OFF(n);
715                     next = n + NODE_STEP_REGNODE;
716 #ifdef DEBUGGING
717                     if (stringok)
718                         stop = n;
719 #endif
720                     n = regnext(n);
721                 }
722                 else if (stringok) {
723                     int oldl = STR_LEN(scan);
724                     regnode *nnext = regnext(n);
725
726                     if (oldl + STR_LEN(n) > U8_MAX)
727                         break;
728                     NEXT_OFF(scan) += NEXT_OFF(n);
729                     STR_LEN(scan) += STR_LEN(n);
730                     next = n + NODE_SZ_STR(n);
731                     /* Now we can overwrite *n : */
732                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
733 #ifdef DEBUGGING
734                     stop = next - 1;
735 #endif
736                     n = nnext;
737                 }
738             }
739
740             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
741 /*
742   Two problematic code points in Unicode casefolding of EXACT nodes:
743
744    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
745    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
746
747    which casefold to
748
749    Unicode                      UTF-8
750
751    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
752    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
753
754    This means that in case-insensitive matching (or "loose matching",
755    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
756    length of the above casefolded versions) can match a target string
757    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
758    This would rather mess up the minimum length computation.
759
760    What we'll do is to look for the tail four bytes, and then peek
761    at the preceding two bytes to see whether we need to decrease
762    the minimum length by four (six minus two).
763
764    Thanks to the design of UTF-8, there cannot be false matches:
765    A sequence of valid UTF-8 bytes cannot be a subsequence of
766    another valid sequence of UTF-8 bytes.
767
768 */
769                  char *s0 = STRING(scan), *s, *t;
770                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
771                  char *t0 = "\xcc\x88\xcc\x81";
772                  char *t1 = t0 + 3;
773                  
774                  for (s = s0 + 2;
775                       s < s2 && (t = ninstr(s, s1, t0, t1));
776                       s = t + 4) {
777                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
778                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
779                            min -= 4;
780                  }
781             }
782
783 #ifdef DEBUGGING
784             /* Allow dumping */
785             n = scan + NODE_SZ_STR(scan);
786             while (n <= stop) {
787                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
788                     OP(n) = OPTIMIZED;
789                     NEXT_OFF(n) = 0;
790                 }
791                 n++;
792             }
793 #endif
794         }
795         /* Follow the next-chain of the current node and optimize
796            away all the NOTHINGs from it.  */
797         if (OP(scan) != CURLYX) {
798             int max = (reg_off_by_arg[OP(scan)]
799                        ? I32_MAX
800                        /* I32 may be smaller than U16 on CRAYs! */
801                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
802             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
803             int noff;
804             regnode *n = scan;
805         
806             /* Skip NOTHING and LONGJMP. */
807             while ((n = regnext(n))
808                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
809                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
810                    && off + noff < max)
811                 off += noff;
812             if (reg_off_by_arg[OP(scan)])
813                 ARG(scan) = off;
814             else
815                 NEXT_OFF(scan) = off;
816         }
817         /* The principal pseudo-switch.  Cannot be a switch, since we
818            look into several different things.  */
819         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
820                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
821             next = regnext(scan);
822             code = OP(scan);
823         
824             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
825                 I32 max1 = 0, min1 = I32_MAX, num = 0;
826                 struct regnode_charclass_class accum;
827                 
828                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
829                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
830                 if (flags & SCF_DO_STCLASS)
831                     cl_init_zero(pRExC_state, &accum);
832                 while (OP(scan) == code) {
833                     I32 deltanext, minnext, f = 0, fake;
834                     struct regnode_charclass_class this_class;
835
836                     num++;
837                     data_fake.flags = 0;
838                     if (data) {         
839                         data_fake.whilem_c = data->whilem_c;
840                         data_fake.last_closep = data->last_closep;
841                     }
842                     else
843                         data_fake.last_closep = &fake;
844                     next = regnext(scan);
845                     scan = NEXTOPER(scan);
846                     if (code != BRANCH)
847                         scan = NEXTOPER(scan);
848                     if (flags & SCF_DO_STCLASS) {
849                         cl_init(pRExC_state, &this_class);
850                         data_fake.start_class = &this_class;
851                         f = SCF_DO_STCLASS_AND;
852                     }           
853                     if (flags & SCF_WHILEM_VISITED_POS)
854                         f |= SCF_WHILEM_VISITED_POS;
855                     /* we suppose the run is continuous, last=next...*/
856                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
857                                           next, &data_fake, f);
858                     if (min1 > minnext)
859                         min1 = minnext;
860                     if (max1 < minnext + deltanext)
861                         max1 = minnext + deltanext;
862                     if (deltanext == I32_MAX)
863                         is_inf = is_inf_internal = 1;
864                     scan = next;
865                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866                         pars++;
867                     if (data && (data_fake.flags & SF_HAS_EVAL))
868                         data->flags |= SF_HAS_EVAL;
869                     if (data)
870                         data->whilem_c = data_fake.whilem_c;
871                     if (flags & SCF_DO_STCLASS)
872                         cl_or(pRExC_state, &accum, &this_class);
873                     if (code == SUSPEND)
874                         break;
875                 }
876                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877                     min1 = 0;
878                 if (flags & SCF_DO_SUBSTR) {
879                     data->pos_min += min1;
880                     data->pos_delta += max1 - min1;
881                     if (max1 != min1 || is_inf)
882                         data->longest = &(data->longest_float);
883                 }
884                 min += min1;
885                 delta += max1 - min1;
886                 if (flags & SCF_DO_STCLASS_OR) {
887                     cl_or(pRExC_state, data->start_class, &accum);
888                     if (min1) {
889                         cl_and(data->start_class, &and_with);
890                         flags &= ~SCF_DO_STCLASS;
891                     }
892                 }
893                 else if (flags & SCF_DO_STCLASS_AND) {
894                     if (min1) {
895                         cl_and(data->start_class, &accum);
896                         flags &= ~SCF_DO_STCLASS;
897                     }
898                     else {
899                         /* Switch to OR mode: cache the old value of
900                          * data->start_class */
901                         StructCopy(data->start_class, &and_with,
902                                    struct regnode_charclass_class);
903                         flags &= ~SCF_DO_STCLASS_AND;
904                         StructCopy(&accum, data->start_class,
905                                    struct regnode_charclass_class);
906                         flags |= SCF_DO_STCLASS_OR;
907                         data->start_class->flags |= ANYOF_EOS;
908                     }
909                 }
910             }
911             else if (code == BRANCHJ)   /* single branch is optimized. */
912                 scan = NEXTOPER(NEXTOPER(scan));
913             else                        /* single branch is optimized. */
914                 scan = NEXTOPER(scan);
915             continue;
916         }
917         else if (OP(scan) == EXACT) {
918             I32 l = STR_LEN(scan);
919             UV uc = *((U8*)STRING(scan));
920             if (UTF) {
921                 U8 *s = (U8*)STRING(scan);
922                 l = utf8_length(s, s + l);
923                 uc = utf8_to_uvchr(s, NULL);
924             }
925             min += l;
926             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
927                 /* The code below prefers earlier match for fixed
928                    offset, later match for variable offset.  */
929                 if (data->last_end == -1) { /* Update the start info. */
930                     data->last_start_min = data->pos_min;
931                     data->last_start_max = is_inf
932                         ? I32_MAX : data->pos_min + data->pos_delta;
933                 }
934                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
935                 data->last_end = data->pos_min + l;
936                 data->pos_min += l; /* As in the first entry. */
937                 data->flags &= ~SF_BEFORE_EOL;
938             }
939             if (flags & SCF_DO_STCLASS_AND) {
940                 /* Check whether it is compatible with what we know already! */
941                 int compat = 1;
942
943                 if (uc >= 0x100 ||
944                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
945                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
946                     && (!(data->start_class->flags & ANYOF_FOLD)
947                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
948                     )
949                     compat = 0;
950                 ANYOF_CLASS_ZERO(data->start_class);
951                 ANYOF_BITMAP_ZERO(data->start_class);
952                 if (compat)
953                     ANYOF_BITMAP_SET(data->start_class, uc);
954                 data->start_class->flags &= ~ANYOF_EOS;
955                 if (uc < 0x100)
956                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
957             }
958             else if (flags & SCF_DO_STCLASS_OR) {
959                 /* false positive possible if the class is case-folded */
960                 if (uc < 0x100)
961                     ANYOF_BITMAP_SET(data->start_class, uc);
962                 else
963                     data->start_class->flags |= ANYOF_UNICODE_ALL;
964                 data->start_class->flags &= ~ANYOF_EOS;
965                 cl_and(data->start_class, &and_with);
966             }
967             flags &= ~SCF_DO_STCLASS;
968         }
969         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
970             I32 l = STR_LEN(scan);
971             UV uc = *((U8*)STRING(scan));
972
973             /* Search for fixed substrings supports EXACT only. */
974             if (flags & SCF_DO_SUBSTR)
975                 scan_commit(pRExC_state, data);
976             if (UTF) {
977                 U8 *s = (U8 *)STRING(scan);
978                 l = utf8_length(s, s + l);
979                 uc = utf8_to_uvchr(s, NULL);
980             }
981             min += l;
982             if (data && (flags & SCF_DO_SUBSTR))
983                 data->pos_min += l;
984             if (flags & SCF_DO_STCLASS_AND) {
985                 /* Check whether it is compatible with what we know already! */
986                 int compat = 1;
987
988                 if (uc >= 0x100 ||
989                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
990                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
991                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
992                     compat = 0;
993                 ANYOF_CLASS_ZERO(data->start_class);
994                 ANYOF_BITMAP_ZERO(data->start_class);
995                 if (compat) {
996                     ANYOF_BITMAP_SET(data->start_class, uc);
997                     data->start_class->flags &= ~ANYOF_EOS;
998                     data->start_class->flags |= ANYOF_FOLD;
999                     if (OP(scan) == EXACTFL)
1000                         data->start_class->flags |= ANYOF_LOCALE;
1001                 }
1002             }
1003             else if (flags & SCF_DO_STCLASS_OR) {
1004                 if (data->start_class->flags & ANYOF_FOLD) {
1005                     /* false positive possible if the class is case-folded.
1006                        Assume that the locale settings are the same... */
1007                     if (uc < 0x100)
1008                         ANYOF_BITMAP_SET(data->start_class, uc);
1009                     data->start_class->flags &= ~ANYOF_EOS;
1010                 }
1011                 cl_and(data->start_class, &and_with);
1012             }
1013             flags &= ~SCF_DO_STCLASS;
1014         }
1015         else if (strchr((char*)PL_varies,OP(scan))) {
1016             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1017             I32 f = flags, pos_before = 0;
1018             regnode *oscan = scan;
1019             struct regnode_charclass_class this_class;
1020             struct regnode_charclass_class *oclass = NULL;
1021             I32 next_is_eval = 0;
1022
1023             switch (PL_regkind[(U8)OP(scan)]) {
1024             case WHILEM:                /* End of (?:...)* . */
1025                 scan = NEXTOPER(scan);
1026                 goto finish;
1027             case PLUS:
1028                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1029                     next = NEXTOPER(scan);
1030                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1031                         mincount = 1;
1032                         maxcount = REG_INFTY;
1033                         next = regnext(scan);
1034                         scan = NEXTOPER(scan);
1035                         goto do_curly;
1036                     }
1037                 }
1038                 if (flags & SCF_DO_SUBSTR)
1039                     data->pos_min++;
1040                 min++;
1041                 /* Fall through. */
1042             case STAR:
1043                 if (flags & SCF_DO_STCLASS) {
1044                     mincount = 0;
1045                     maxcount = REG_INFTY;
1046                     next = regnext(scan);
1047                     scan = NEXTOPER(scan);
1048                     goto do_curly;
1049                 }
1050                 is_inf = is_inf_internal = 1;
1051                 scan = regnext(scan);
1052                 if (flags & SCF_DO_SUBSTR) {
1053                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1054                     data->longest = &(data->longest_float);
1055                 }
1056                 goto optimize_curly_tail;
1057             case CURLY:
1058                 mincount = ARG1(scan);
1059                 maxcount = ARG2(scan);
1060                 next = regnext(scan);
1061                 if (OP(scan) == CURLYX) {
1062                     I32 lp = (data ? *(data->last_closep) : 0);
1063
1064                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065                 }
1066                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1067                 next_is_eval = (OP(scan) == EVAL);
1068               do_curly:
1069                 if (flags & SCF_DO_SUBSTR) {
1070                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1071                     pos_before = data->pos_min;
1072                 }
1073                 if (data) {
1074                     fl = data->flags;
1075                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076                     if (is_inf)
1077                         data->flags |= SF_IS_INF;
1078                 }
1079                 if (flags & SCF_DO_STCLASS) {
1080                     cl_init(pRExC_state, &this_class);
1081                     oclass = data->start_class;
1082                     data->start_class = &this_class;
1083                     f |= SCF_DO_STCLASS_AND;
1084                     f &= ~SCF_DO_STCLASS_OR;
1085                 }
1086                 /* These are the cases when once a subexpression
1087                    fails at a particular position, it cannot succeed
1088                    even after backtracking at the enclosing scope.
1089                 
1090                    XXXX what if minimal match and we are at the
1091                         initial run of {n,m}? */
1092                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1093                     f &= ~SCF_WHILEM_VISITED_POS;
1094
1095                 /* This will finish on WHILEM, setting scan, or on NULL: */
1096                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097                                       mincount == 0
1098                                         ? (f & ~SCF_DO_SUBSTR) : f);
1099
1100                 if (flags & SCF_DO_STCLASS)
1101                     data->start_class = oclass;
1102                 if (mincount == 0 || minnext == 0) {
1103                     if (flags & SCF_DO_STCLASS_OR) {
1104                         cl_or(pRExC_state, data->start_class, &this_class);
1105                     }
1106                     else if (flags & SCF_DO_STCLASS_AND) {
1107                         /* Switch to OR mode: cache the old value of
1108                          * data->start_class */
1109                         StructCopy(data->start_class, &and_with,
1110                                    struct regnode_charclass_class);
1111                         flags &= ~SCF_DO_STCLASS_AND;
1112                         StructCopy(&this_class, data->start_class,
1113                                    struct regnode_charclass_class);
1114                         flags |= SCF_DO_STCLASS_OR;
1115                         data->start_class->flags |= ANYOF_EOS;
1116                     }
1117                 } else {                /* Non-zero len */
1118                     if (flags & SCF_DO_STCLASS_OR) {
1119                         cl_or(pRExC_state, data->start_class, &this_class);
1120                         cl_and(data->start_class, &and_with);
1121                     }
1122                     else if (flags & SCF_DO_STCLASS_AND)
1123                         cl_and(data->start_class, &this_class);
1124                     flags &= ~SCF_DO_STCLASS;
1125                 }
1126                 if (!scan)              /* It was not CURLYX, but CURLY. */
1127                     scan = next;
1128                 if (ckWARN(WARN_REGEXP)
1129                        /* ? quantifier ok, except for (?{ ... }) */
1130                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1131                     && (minnext == 0) && (deltanext == 0)
1132                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1133                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1134                 {
1135                     vWARN(RExC_parse,
1136                           "Quantifier unexpected on zero-length expression");
1137                 }
1138
1139                 min += minnext * mincount;
1140                 is_inf_internal |= ((maxcount == REG_INFTY
1141                                      && (minnext + deltanext) > 0)
1142                                     || deltanext == I32_MAX);
1143                 is_inf |= is_inf_internal;
1144                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145
1146                 /* Try powerful optimization CURLYX => CURLYN. */
1147                 if (  OP(oscan) == CURLYX && data
1148                       && data->flags & SF_IN_PAR
1149                       && !(data->flags & SF_HAS_EVAL)
1150                       && !deltanext && minnext == 1 ) {
1151                     /* Try to optimize to CURLYN.  */
1152                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1153                     regnode *nxt1 = nxt;
1154 #ifdef DEBUGGING
1155                     regnode *nxt2;
1156 #endif
1157
1158                     /* Skip open. */
1159                     nxt = regnext(nxt);
1160                     if (!strchr((char*)PL_simple,OP(nxt))
1161                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1162                              && STR_LEN(nxt) == 1))
1163                         goto nogo;
1164 #ifdef DEBUGGING
1165                     nxt2 = nxt;
1166 #endif
1167                     nxt = regnext(nxt);
1168                     if (OP(nxt) != CLOSE)
1169                         goto nogo;
1170                     /* Now we know that nxt2 is the only contents: */
1171                     oscan->flags = ARG(nxt);
1172                     OP(oscan) = CURLYN;
1173                     OP(nxt1) = NOTHING; /* was OPEN. */
1174 #ifdef DEBUGGING
1175                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1176                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1177                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1178                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1179                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1180                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1181 #endif
1182                 }
1183               nogo:
1184
1185                 /* Try optimization CURLYX => CURLYM. */
1186                 if (  OP(oscan) == CURLYX && data
1187                       && !(data->flags & SF_HAS_PAR)
1188                       && !(data->flags & SF_HAS_EVAL)
1189                       && !deltanext  ) {
1190                     /* XXXX How to optimize if data == 0? */
1191                     /* Optimize to a simpler form.  */
1192                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1193                     regnode *nxt2;
1194
1195                     OP(oscan) = CURLYM;
1196                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1197                             && (OP(nxt2) != WHILEM))
1198                         nxt = nxt2;
1199                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1200                     /* Need to optimize away parenths. */
1201                     if (data->flags & SF_IN_PAR) {
1202                         /* Set the parenth number.  */
1203                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204
1205                         if (OP(nxt) != CLOSE)
1206                             FAIL("Panic opt close");
1207                         oscan->flags = ARG(nxt);
1208                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1209                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1210 #ifdef DEBUGGING
1211                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1212                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1213                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1214                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1215 #endif
1216 #if 0
1217                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1218                             regnode *nnxt = regnext(nxt1);
1219                         
1220                             if (nnxt == nxt) {
1221                                 if (reg_off_by_arg[OP(nxt1)])
1222                                     ARG_SET(nxt1, nxt2 - nxt1);
1223                                 else if (nxt2 - nxt1 < U16_MAX)
1224                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1225                                 else
1226                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1227                             }
1228                             nxt1 = nnxt;
1229                         }
1230 #endif
1231                         /* Optimize again: */
1232                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1233                                     NULL, 0);
1234                     }
1235                     else
1236                         oscan->flags = 0;
1237                 }
1238                 else if ((OP(oscan) == CURLYX)
1239                          && (flags & SCF_WHILEM_VISITED_POS)
1240                          /* See the comment on a similar expression above.
1241                             However, this time it not a subexpression
1242                             we care about, but the expression itself. */
1243                          && (maxcount == REG_INFTY)
1244                          && data && ++data->whilem_c < 16) {
1245                     /* This stays as CURLYX, we can put the count/of pair. */
1246                     /* Find WHILEM (as in regexec.c) */
1247                     regnode *nxt = oscan + NEXT_OFF(oscan);
1248
1249                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250                         nxt += ARG(nxt);
1251                     PREVOPER(nxt)->flags = data->whilem_c
1252                         | (RExC_whilem_seen << 4); /* On WHILEM */
1253                 }
1254                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1255                     pars++;
1256                 if (flags & SCF_DO_SUBSTR) {
1257                     SV *last_str = Nullsv;
1258                     int counted = mincount != 0;
1259
1260                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1261 #if defined(SPARC64_GCC_WORKAROUND)
1262                         I32 b = 0;
1263                         STRLEN l = 0;
1264                         char *s = NULL;
1265                         I32 old = 0;
1266
1267                         if (pos_before >= data->last_start_min)
1268                             b = pos_before;
1269                         else
1270                             b = data->last_start_min;
1271
1272                         l = 0;
1273                         s = SvPV(data->last_found, l);
1274                         old = b - data->last_start_min;
1275
1276 #else
1277                         I32 b = pos_before >= data->last_start_min
1278                             ? pos_before : data->last_start_min;
1279                         STRLEN l;
1280                         char *s = SvPV(data->last_found, l);
1281                         I32 old = b - data->last_start_min;
1282 #endif
1283
1284                         if (UTF)
1285                             old = utf8_hop((U8*)s, old) - (U8*)s;
1286                         
1287                         l -= old;
1288                         /* Get the added string: */
1289                         last_str = newSVpvn(s  + old, l);
1290                         if (deltanext == 0 && pos_before == b) {
1291                             /* What was added is a constant string */
1292                             if (mincount > 1) {
1293                                 SvGROW(last_str, (mincount * l) + 1);
1294                                 repeatcpy(SvPVX(last_str) + l,
1295                                           SvPVX(last_str), l, mincount - 1);
1296                                 SvCUR(last_str) *= mincount;
1297                                 /* Add additional parts. */
1298                                 SvCUR_set(data->last_found,
1299                                           SvCUR(data->last_found) - l);
1300                                 sv_catsv(data->last_found, last_str);
1301                                 data->last_end += l * (mincount - 1);
1302                             }
1303                         } else {
1304                             /* start offset must point into the last copy */
1305                             data->last_start_min += minnext * (mincount - 1);
1306                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1307                                 * (minnext + data->pos_delta);
1308                         }
1309                     }
1310                     /* It is counted once already... */
1311                     data->pos_min += minnext * (mincount - counted);
1312                     data->pos_delta += - counted * deltanext +
1313                         (minnext + deltanext) * maxcount - minnext * mincount;
1314                     if (mincount != maxcount) {
1315                          /* Cannot extend fixed substrings found inside
1316                             the group.  */
1317                         scan_commit(pRExC_state,data);
1318                         if (mincount && last_str) {
1319                             sv_setsv(data->last_found, last_str);
1320                             data->last_end = data->pos_min;
1321                             data->last_start_min =
1322                                 data->pos_min - CHR_SVLEN(last_str);
1323                             data->last_start_max = is_inf
1324                                 ? I32_MAX
1325                                 : data->pos_min + data->pos_delta
1326                                 - CHR_SVLEN(last_str);
1327                         }
1328                         data->longest = &(data->longest_float);
1329                     }
1330                     SvREFCNT_dec(last_str);
1331                 }
1332                 if (data && (fl & SF_HAS_EVAL))
1333                     data->flags |= SF_HAS_EVAL;
1334               optimize_curly_tail:
1335                 if (OP(oscan) != CURLYX) {
1336                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1337                            && NEXT_OFF(next))
1338                         NEXT_OFF(oscan) += NEXT_OFF(next);
1339                 }
1340                 continue;
1341             default:                    /* REF and CLUMP only? */
1342                 if (flags & SCF_DO_SUBSTR) {
1343                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1344                     data->longest = &(data->longest_float);
1345                 }
1346                 is_inf = is_inf_internal = 1;
1347                 if (flags & SCF_DO_STCLASS_OR)
1348                     cl_anything(pRExC_state, data->start_class);
1349                 flags &= ~SCF_DO_STCLASS;
1350                 break;
1351             }
1352         }
1353         else if (strchr((char*)PL_simple,OP(scan))) {
1354             int value = 0;
1355
1356             if (flags & SCF_DO_SUBSTR) {
1357                 scan_commit(pRExC_state,data);
1358                 data->pos_min++;
1359             }
1360             min++;
1361             if (flags & SCF_DO_STCLASS) {
1362                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1363
1364                 /* Some of the logic below assumes that switching
1365                    locale on will only add false positives. */
1366                 switch (PL_regkind[(U8)OP(scan)]) {
1367                 case SANY:
1368                 default:
1369                   do_default:
1370                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1371                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1372                         cl_anything(pRExC_state, data->start_class);
1373                     break;
1374                 case REG_ANY:
1375                     if (OP(scan) == SANY)
1376                         goto do_default;
1377                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1378                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1379                                  || (data->start_class->flags & ANYOF_CLASS));
1380                         cl_anything(pRExC_state, data->start_class);
1381                     }
1382                     if (flags & SCF_DO_STCLASS_AND || !value)
1383                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1384                     break;
1385                 case ANYOF:
1386                     if (flags & SCF_DO_STCLASS_AND)
1387                         cl_and(data->start_class,
1388                                (struct regnode_charclass_class*)scan);
1389                     else
1390                         cl_or(pRExC_state, data->start_class,
1391                               (struct regnode_charclass_class*)scan);
1392                     break;
1393                 case ALNUM:
1394                     if (flags & SCF_DO_STCLASS_AND) {
1395                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1396                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1397                             for (value = 0; value < 256; value++)
1398                                 if (!isALNUM(value))
1399                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1400                         }
1401                     }
1402                     else {
1403                         if (data->start_class->flags & ANYOF_LOCALE)
1404                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1405                         else {
1406                             for (value = 0; value < 256; value++)
1407                                 if (isALNUM(value))
1408                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1409                         }
1410                     }
1411                     break;
1412                 case ALNUML:
1413                     if (flags & SCF_DO_STCLASS_AND) {
1414                         if (data->start_class->flags & ANYOF_LOCALE)
1415                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416                     }
1417                     else {
1418                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419                         data->start_class->flags |= ANYOF_LOCALE;
1420                     }
1421                     break;
1422                 case NALNUM:
1423                     if (flags & SCF_DO_STCLASS_AND) {
1424                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1425                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1426                             for (value = 0; value < 256; value++)
1427                                 if (isALNUM(value))
1428                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1429                         }
1430                     }
1431                     else {
1432                         if (data->start_class->flags & ANYOF_LOCALE)
1433                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434                         else {
1435                             for (value = 0; value < 256; value++)
1436                                 if (!isALNUM(value))
1437                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1438                         }
1439                     }
1440                     break;
1441                 case NALNUML:
1442                     if (flags & SCF_DO_STCLASS_AND) {
1443                         if (data->start_class->flags & ANYOF_LOCALE)
1444                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445                     }
1446                     else {
1447                         data->start_class->flags |= ANYOF_LOCALE;
1448                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1449                     }
1450                     break;
1451                 case SPACE:
1452                     if (flags & SCF_DO_STCLASS_AND) {
1453                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1454                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1455                             for (value = 0; value < 256; value++)
1456                                 if (!isSPACE(value))
1457                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1458                         }
1459                     }
1460                     else {
1461                         if (data->start_class->flags & ANYOF_LOCALE)
1462                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463                         else {
1464                             for (value = 0; value < 256; value++)
1465                                 if (isSPACE(value))
1466                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1467                         }
1468                     }
1469                     break;
1470                 case SPACEL:
1471                     if (flags & SCF_DO_STCLASS_AND) {
1472                         if (data->start_class->flags & ANYOF_LOCALE)
1473                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474                     }
1475                     else {
1476                         data->start_class->flags |= ANYOF_LOCALE;
1477                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1478                     }
1479                     break;
1480                 case NSPACE:
1481                     if (flags & SCF_DO_STCLASS_AND) {
1482                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1483                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1484                             for (value = 0; value < 256; value++)
1485                                 if (isSPACE(value))
1486                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1487                         }
1488                     }
1489                     else {
1490                         if (data->start_class->flags & ANYOF_LOCALE)
1491                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1492                         else {
1493                             for (value = 0; value < 256; value++)
1494                                 if (!isSPACE(value))
1495                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1496                         }
1497                     }
1498                     break;
1499                 case NSPACEL:
1500                     if (flags & SCF_DO_STCLASS_AND) {
1501                         if (data->start_class->flags & ANYOF_LOCALE) {
1502                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503                             for (value = 0; value < 256; value++)
1504                                 if (!isSPACE(value))
1505                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1506                         }
1507                     }
1508                     else {
1509                         data->start_class->flags |= ANYOF_LOCALE;
1510                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511                     }
1512                     break;
1513                 case DIGIT:
1514                     if (flags & SCF_DO_STCLASS_AND) {
1515                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1516                         for (value = 0; value < 256; value++)
1517                             if (!isDIGIT(value))
1518                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1519                     }
1520                     else {
1521                         if (data->start_class->flags & ANYOF_LOCALE)
1522                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1523                         else {
1524                             for (value = 0; value < 256; value++)
1525                                 if (isDIGIT(value))
1526                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1527                         }
1528                     }
1529                     break;
1530                 case NDIGIT:
1531                     if (flags & SCF_DO_STCLASS_AND) {
1532                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1533                         for (value = 0; value < 256; value++)
1534                             if (isDIGIT(value))
1535                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1536                     }
1537                     else {
1538                         if (data->start_class->flags & ANYOF_LOCALE)
1539                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1540                         else {
1541                             for (value = 0; value < 256; value++)
1542                                 if (!isDIGIT(value))
1543                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1544                         }
1545                     }
1546                     break;
1547                 }
1548                 if (flags & SCF_DO_STCLASS_OR)
1549                     cl_and(data->start_class, &and_with);
1550                 flags &= ~SCF_DO_STCLASS;
1551             }
1552         }
1553         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1554             data->flags |= (OP(scan) == MEOL
1555                             ? SF_BEFORE_MEOL
1556                             : SF_BEFORE_SEOL);
1557         }
1558         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1559                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1560                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1561                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1562             /* Lookahead/lookbehind */
1563             I32 deltanext, minnext, fake = 0;
1564             regnode *nscan;
1565             struct regnode_charclass_class intrnl;
1566             int f = 0;
1567
1568             data_fake.flags = 0;
1569             if (data) {         
1570                 data_fake.whilem_c = data->whilem_c;
1571                 data_fake.last_closep = data->last_closep;
1572             }
1573             else
1574                 data_fake.last_closep = &fake;
1575             if ( flags & SCF_DO_STCLASS && !scan->flags
1576                  && OP(scan) == IFMATCH ) { /* Lookahead */
1577                 cl_init(pRExC_state, &intrnl);
1578                 data_fake.start_class = &intrnl;
1579                 f |= SCF_DO_STCLASS_AND;
1580             }
1581             if (flags & SCF_WHILEM_VISITED_POS)
1582                 f |= SCF_WHILEM_VISITED_POS;
1583             next = regnext(scan);
1584             nscan = NEXTOPER(NEXTOPER(scan));
1585             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1586             if (scan->flags) {
1587                 if (deltanext) {
1588                     vFAIL("Variable length lookbehind not implemented");
1589                 }
1590                 else if (minnext > U8_MAX) {
1591                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1592                 }
1593                 scan->flags = minnext;
1594             }
1595             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1596                 pars++;
1597             if (data && (data_fake.flags & SF_HAS_EVAL))
1598                 data->flags |= SF_HAS_EVAL;
1599             if (data)
1600                 data->whilem_c = data_fake.whilem_c;
1601             if (f & SCF_DO_STCLASS_AND) {
1602                 int was = (data->start_class->flags & ANYOF_EOS);
1603
1604                 cl_and(data->start_class, &intrnl);
1605                 if (was)
1606                     data->start_class->flags |= ANYOF_EOS;
1607             }
1608         }
1609         else if (OP(scan) == OPEN) {
1610             pars++;
1611         }
1612         else if (OP(scan) == CLOSE) {
1613             if (ARG(scan) == is_par) {
1614                 next = regnext(scan);
1615
1616                 if ( next && (OP(next) != WHILEM) && next < last)
1617                     is_par = 0;         /* Disable optimization */
1618             }
1619             if (data)
1620                 *(data->last_closep) = ARG(scan);
1621         }
1622         else if (OP(scan) == EVAL) {
1623                 if (data)
1624                     data->flags |= SF_HAS_EVAL;
1625         }
1626         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1627                 if (flags & SCF_DO_SUBSTR) {
1628                     scan_commit(pRExC_state,data);
1629                     data->longest = &(data->longest_float);
1630                 }
1631                 is_inf = is_inf_internal = 1;
1632                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1633                     cl_anything(pRExC_state, data->start_class);
1634                 flags &= ~SCF_DO_STCLASS;
1635         }
1636         /* Else: zero-length, ignore. */
1637         scan = regnext(scan);
1638     }
1639
1640   finish:
1641     *scanp = scan;
1642     *deltap = is_inf_internal ? I32_MAX : delta;
1643     if (flags & SCF_DO_SUBSTR && is_inf)
1644         data->pos_delta = I32_MAX - data->pos_min;
1645     if (is_par > U8_MAX)
1646         is_par = 0;
1647     if (is_par && pars==1 && data) {
1648         data->flags |= SF_IN_PAR;
1649         data->flags &= ~SF_HAS_PAR;
1650     }
1651     else if (pars && data) {
1652         data->flags |= SF_HAS_PAR;
1653         data->flags &= ~SF_IN_PAR;
1654     }
1655     if (flags & SCF_DO_STCLASS_OR)
1656         cl_and(data->start_class, &and_with);
1657     return min;
1658 }
1659
1660 STATIC I32
1661 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1662 {
1663     if (RExC_rx->data) {
1664         Renewc(RExC_rx->data,
1665                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1666                char, struct reg_data);
1667         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1668         RExC_rx->data->count += n;
1669     }
1670     else {
1671         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1672              char, struct reg_data);
1673         New(1208, RExC_rx->data->what, n, U8);
1674         RExC_rx->data->count = n;
1675     }
1676     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1677     return RExC_rx->data->count - n;
1678 }
1679
1680 void
1681 Perl_reginitcolors(pTHX)
1682 {
1683     int i = 0;
1684     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1685         
1686     if (s) {
1687         PL_colors[0] = s = savepv(s);
1688         while (++i < 6) {
1689             s = strchr(s, '\t');
1690             if (s) {
1691                 *s = '\0';
1692                 PL_colors[i] = ++s;
1693             }
1694             else
1695                 PL_colors[i] = s = "";
1696         }
1697     } else {
1698         while (i < 6)
1699             PL_colors[i++] = "";
1700     }
1701     PL_colorset = 1;
1702 }
1703
1704
1705 /*
1706  - pregcomp - compile a regular expression into internal code
1707  *
1708  * We can't allocate space until we know how big the compiled form will be,
1709  * but we can't compile it (and thus know how big it is) until we've got a
1710  * place to put the code.  So we cheat:  we compile it twice, once with code
1711  * generation turned off and size counting turned on, and once "for real".
1712  * This also means that we don't allocate space until we are sure that the
1713  * thing really will compile successfully, and we never have to move the
1714  * code and thus invalidate pointers into it.  (Note that it has to be in
1715  * one piece because free() must be able to free it all.) [NB: not true in perl]
1716  *
1717  * Beware that the optimization-preparation code in here knows about some
1718  * of the structure of the compiled regexp.  [I'll say.]
1719  */
1720 regexp *
1721 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1722 {
1723     register regexp *r;
1724     regnode *scan;
1725     regnode *first;
1726     I32 flags;
1727     I32 minlen = 0;
1728     I32 sawplus = 0;
1729     I32 sawopen = 0;
1730     scan_data_t data;
1731     RExC_state_t RExC_state;
1732     RExC_state_t *pRExC_state = &RExC_state;
1733
1734     if (exp == NULL)
1735         FAIL("NULL regexp argument");
1736
1737     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1738
1739     RExC_precomp = exp;
1740     DEBUG_r({
1741          if (!PL_colorset) reginitcolors();
1742          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1743                        PL_colors[4],PL_colors[5],PL_colors[0],
1744                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1745     });
1746     RExC_flags16 = pm->op_pmflags;
1747     RExC_sawback = 0;
1748
1749     RExC_seen = 0;
1750     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1751     RExC_seen_evals = 0;
1752     RExC_extralen = 0;
1753
1754     /* First pass: determine size, legality. */
1755     RExC_parse = exp;
1756     RExC_start = exp;
1757     RExC_end = xend;
1758     RExC_naughty = 0;
1759     RExC_npar = 1;
1760     RExC_size = 0L;
1761     RExC_emit = &PL_regdummy;
1762     RExC_whilem_seen = 0;
1763 #if 0 /* REGC() is (currently) a NOP at the first pass.
1764        * Clever compilers notice this and complain. --jhi */
1765     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1766 #endif
1767     if (reg(pRExC_state, 0, &flags) == NULL) {
1768         RExC_precomp = Nullch;
1769         return(NULL);
1770     }
1771     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1772
1773     /* Small enough for pointer-storage convention?
1774        If extralen==0, this means that we will not need long jumps. */
1775     if (RExC_size >= 0x10000L && RExC_extralen)
1776         RExC_size += RExC_extralen;
1777     else
1778         RExC_extralen = 0;
1779     if (RExC_whilem_seen > 15)
1780         RExC_whilem_seen = 15;
1781
1782     /* Allocate space and initialize. */
1783     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1784          char, regexp);
1785     if (r == NULL)
1786         FAIL("Regexp out of space");
1787
1788 #ifdef DEBUGGING
1789     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1790     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1791 #endif
1792     r->refcnt = 1;
1793     r->prelen = xend - exp;
1794     r->precomp = savepvn(RExC_precomp, r->prelen);
1795     r->subbeg = NULL;
1796     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1797     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1798
1799     r->substrs = 0;                     /* Useful during FAIL. */
1800     r->startp = 0;                      /* Useful during FAIL. */
1801     r->endp = 0;                        /* Useful during FAIL. */
1802
1803     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1804     if (r->offsets) {
1805       r->offsets[0] = RExC_size; 
1806     }
1807     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1808                           "%s %"UVuf" bytes for offset annotations.\n", 
1809                           r->offsets ? "Got" : "Couldn't get", 
1810                           (UV)((2*RExC_size+1) * sizeof(U32))));
1811
1812     RExC_rx = r;
1813
1814     /* Second pass: emit code. */
1815     RExC_flags16 = pm->op_pmflags;      /* don't let top level (?i) bleed */
1816     RExC_parse = exp;
1817     RExC_end = xend;
1818     RExC_naughty = 0;
1819     RExC_npar = 1;
1820     RExC_emit_start = r->program;
1821     RExC_emit = r->program;
1822     /* Store the count of eval-groups for security checks: */
1823     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1824     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1825     r->data = 0;
1826     if (reg(pRExC_state, 0, &flags) == NULL)
1827         return(NULL);
1828
1829     /* Dig out information for optimizations. */
1830     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1831     pm->op_pmflags = RExC_flags16;
1832     if (UTF)
1833         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1834     r->regstclass = NULL;
1835     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1836         r->reganch |= ROPT_NAUGHTY;
1837     scan = r->program + 1;              /* First BRANCH. */
1838
1839     /* XXXX To minimize changes to RE engine we always allocate
1840        3-units-long substrs field. */
1841     Newz(1004, r->substrs, 1, struct reg_substr_data);
1842
1843     StructCopy(&zero_scan_data, &data, scan_data_t);
1844     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1845     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1846         I32 fake;
1847         STRLEN longest_float_length, longest_fixed_length;
1848         struct regnode_charclass_class ch_class;
1849         int stclass_flag;
1850         I32 last_close = 0;
1851
1852         first = scan;
1853         /* Skip introductions and multiplicators >= 1. */
1854         while ((OP(first) == OPEN && (sawopen = 1)) ||
1855                /* An OR of *one* alternative - should not happen now. */
1856             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1857             (OP(first) == PLUS) ||
1858             (OP(first) == MINMOD) ||
1859                /* An {n,m} with n>0 */
1860             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1861                 if (OP(first) == PLUS)
1862                     sawplus = 1;
1863                 else
1864                     first += regarglen[(U8)OP(first)];
1865                 first = NEXTOPER(first);
1866         }
1867
1868         /* Starting-point info. */
1869       again:
1870         if (PL_regkind[(U8)OP(first)] == EXACT) {
1871             if (OP(first) == EXACT)
1872                 ;       /* Empty, get anchored substr later. */
1873             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1874                 r->regstclass = first;
1875         }
1876         else if (strchr((char*)PL_simple,OP(first)))
1877             r->regstclass = first;
1878         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1879                  PL_regkind[(U8)OP(first)] == NBOUND)
1880             r->regstclass = first;
1881         else if (PL_regkind[(U8)OP(first)] == BOL) {
1882             r->reganch |= (OP(first) == MBOL
1883                            ? ROPT_ANCH_MBOL
1884                            : (OP(first) == SBOL
1885                               ? ROPT_ANCH_SBOL
1886                               : ROPT_ANCH_BOL));
1887             first = NEXTOPER(first);
1888             goto again;
1889         }
1890         else if (OP(first) == GPOS) {
1891             r->reganch |= ROPT_ANCH_GPOS;
1892             first = NEXTOPER(first);
1893             goto again;
1894         }
1895         else if (!sawopen && (OP(first) == STAR &&
1896             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1897             !(r->reganch & ROPT_ANCH) )
1898         {
1899             /* turn .* into ^.* with an implied $*=1 */
1900             int type = OP(NEXTOPER(first));
1901
1902             if (type == REG_ANY)
1903                 type = ROPT_ANCH_MBOL;
1904             else
1905                 type = ROPT_ANCH_SBOL;
1906
1907             r->reganch |= type | ROPT_IMPLICIT;
1908             first = NEXTOPER(first);
1909             goto again;
1910         }
1911         if (sawplus && (!sawopen || !RExC_sawback)
1912             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1913             /* x+ must match at the 1st pos of run of x's */
1914             r->reganch |= ROPT_SKIP;
1915
1916         /* Scan is after the zeroth branch, first is atomic matcher. */
1917         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1918                               (IV)(first - scan + 1)));
1919         /*
1920         * If there's something expensive in the r.e., find the
1921         * longest literal string that must appear and make it the
1922         * regmust.  Resolve ties in favor of later strings, since
1923         * the regstart check works with the beginning of the r.e.
1924         * and avoiding duplication strengthens checking.  Not a
1925         * strong reason, but sufficient in the absence of others.
1926         * [Now we resolve ties in favor of the earlier string if
1927         * it happens that c_offset_min has been invalidated, since the
1928         * earlier string may buy us something the later one won't.]
1929         */
1930         minlen = 0;
1931
1932         data.longest_fixed = newSVpvn("",0);
1933         data.longest_float = newSVpvn("",0);
1934         data.last_found = newSVpvn("",0);
1935         data.longest = &(data.longest_fixed);
1936         first = scan;
1937         if (!r->regstclass) {
1938             cl_init(pRExC_state, &ch_class);
1939             data.start_class = &ch_class;
1940             stclass_flag = SCF_DO_STCLASS_AND;
1941         } else                          /* XXXX Check for BOUND? */
1942             stclass_flag = 0;
1943         data.last_closep = &last_close;
1944
1945         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1946                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1947         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1948              && data.last_start_min == 0 && data.last_end > 0
1949              && !RExC_seen_zerolen
1950              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1951             r->reganch |= ROPT_CHECK_ALL;
1952         scan_commit(pRExC_state, &data);
1953         SvREFCNT_dec(data.last_found);
1954
1955         longest_float_length = CHR_SVLEN(data.longest_float);
1956         if (longest_float_length
1957             || (data.flags & SF_FL_BEFORE_EOL
1958                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1959                     || (RExC_flags16 & PMf_MULTILINE)))) {
1960             int t;
1961
1962             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1963                 && data.offset_fixed == data.offset_float_min
1964                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1965                     goto remove_float;          /* As in (a)+. */
1966
1967             r->float_substr = data.longest_float;
1968             r->float_min_offset = data.offset_float_min;
1969             r->float_max_offset = data.offset_float_max;
1970             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1971                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1972                            || (RExC_flags16 & PMf_MULTILINE)));
1973             fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1974         }
1975         else {
1976           remove_float:
1977             r->float_substr = Nullsv;
1978             SvREFCNT_dec(data.longest_float);
1979             longest_float_length = 0;
1980         }
1981
1982         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1983         if (longest_fixed_length
1984             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1985                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1986                     || (RExC_flags16 & PMf_MULTILINE)))) {
1987             int t;
1988
1989             r->anchored_substr = data.longest_fixed;
1990             r->anchored_offset = data.offset_fixed;
1991             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993                      || (RExC_flags16 & PMf_MULTILINE)));
1994             fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1995         }
1996         else {
1997             r->anchored_substr = Nullsv;
1998             SvREFCNT_dec(data.longest_fixed);
1999             longest_fixed_length = 0;
2000         }
2001         if (r->regstclass
2002             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2003             r->regstclass = NULL;
2004         if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2005             && !(data.start_class->flags & ANYOF_EOS)
2006             && !cl_is_anything(data.start_class)) {
2007             I32 n = add_data(pRExC_state, 1, "f");
2008
2009             New(1006, RExC_rx->data->data[n], 1,
2010                 struct regnode_charclass_class);
2011             StructCopy(data.start_class,
2012                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2013                        struct regnode_charclass_class);
2014             r->regstclass = (regnode*)RExC_rx->data->data[n];
2015             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2016             PL_regdata = r->data; /* for regprop() */
2017             DEBUG_r({ SV *sv = sv_newmortal();
2018                       regprop(sv, (regnode*)data.start_class);
2019                       PerlIO_printf(Perl_debug_log,
2020                                     "synthetic stclass `%s'.\n",
2021                                     SvPVX(sv));});
2022         }
2023
2024         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2025         if (longest_fixed_length > longest_float_length) {
2026             r->check_substr = r->anchored_substr;
2027             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028             if (r->reganch & ROPT_ANCH_SINGLE)
2029                 r->reganch |= ROPT_NOSCAN;
2030         }
2031         else {
2032             r->check_substr = r->float_substr;
2033             r->check_offset_min = data.offset_float_min;
2034             r->check_offset_max = data.offset_float_max;
2035         }
2036         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2037            This should be changed ASAP!  */
2038         if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
2039             r->reganch |= RE_USE_INTUIT;
2040             if (SvTAIL(r->check_substr))
2041                 r->reganch |= RE_INTUIT_TAIL;
2042         }
2043     }
2044     else {
2045         /* Several toplevels. Best we can is to set minlen. */
2046         I32 fake;
2047         struct regnode_charclass_class ch_class;
2048         I32 last_close = 0;
2049         
2050         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2051         scan = r->program + 1;
2052         cl_init(pRExC_state, &ch_class);
2053         data.start_class = &ch_class;
2054         data.last_closep = &last_close;
2055         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2056         r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
2057         if (!(data.start_class->flags & ANYOF_EOS)
2058             && !cl_is_anything(data.start_class)) {
2059             I32 n = add_data(pRExC_state, 1, "f");
2060
2061             New(1006, RExC_rx->data->data[n], 1,
2062                 struct regnode_charclass_class);
2063             StructCopy(data.start_class,
2064                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2065                        struct regnode_charclass_class);
2066             r->regstclass = (regnode*)RExC_rx->data->data[n];
2067             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2068             DEBUG_r({ SV* sv = sv_newmortal();
2069                       regprop(sv, (regnode*)data.start_class);
2070                       PerlIO_printf(Perl_debug_log,
2071                                     "synthetic stclass `%s'.\n",
2072                                     SvPVX(sv));});
2073         }
2074     }
2075
2076     r->minlen = minlen;
2077     if (RExC_seen & REG_SEEN_GPOS)
2078         r->reganch |= ROPT_GPOS_SEEN;
2079     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2080         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2081     if (RExC_seen & REG_SEEN_EVAL)
2082         r->reganch |= ROPT_EVAL_SEEN;
2083     if (RExC_seen & REG_SEEN_CANY)
2084         r->reganch |= ROPT_CANY_SEEN;
2085     Newz(1002, r->startp, RExC_npar, I32);
2086     Newz(1002, r->endp, RExC_npar, I32);
2087     PL_regdata = r->data; /* for regprop() */
2088     DEBUG_r(regdump(r));
2089     return(r);
2090 }
2091
2092 /*
2093  - reg - regular expression, i.e. main body or parenthesized thing
2094  *
2095  * Caller must absorb opening parenthesis.
2096  *
2097  * Combining parenthesis handling with the base level of regular expression
2098  * is a trifle forced, but the need to tie the tails of the branches to what
2099  * follows makes it hard to avoid.
2100  */
2101 STATIC regnode *
2102 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2103     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2104 {
2105     register regnode *ret;              /* Will be the head of the group. */
2106     register regnode *br;
2107     register regnode *lastbr;
2108     register regnode *ender = 0;
2109     register I32 parno = 0;
2110     I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2111
2112     /* for (?g), (?gc), and (?o) warnings; warning
2113        about (?c) will warn about (?g) -- japhy    */
2114
2115     I32 wastedflags = 0x00,
2116         wasted_o    = 0x01,
2117         wasted_g    = 0x02,
2118         wasted_gc   = 0x02 | 0x04,
2119         wasted_c    = 0x04;
2120
2121     char * parse_start = RExC_parse; /* MJD */
2122     char *oregcomp_parse = RExC_parse;
2123     char c;
2124
2125     *flagp = 0;                         /* Tentatively. */
2126
2127
2128     /* Make an OPEN node, if parenthesized. */
2129     if (paren) {
2130         if (*RExC_parse == '?') { /* (?...) */
2131             U16 posflags = 0, negflags = 0;
2132             U16 *flagsp = &posflags;
2133             int logical = 0;
2134             char *seqstart = RExC_parse;
2135
2136             RExC_parse++;
2137             paren = *RExC_parse++;
2138             ret = NULL;                 /* For look-ahead/behind. */
2139             switch (paren) {
2140             case '<':           /* (?<...) */
2141                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2142                 if (*RExC_parse == '!')
2143                     paren = ',';
2144                 if (*RExC_parse != '=' && *RExC_parse != '!')
2145                     goto unknown;
2146                 RExC_parse++;
2147             case '=':           /* (?=...) */
2148             case '!':           /* (?!...) */
2149                 RExC_seen_zerolen++;
2150             case ':':           /* (?:...) */
2151             case '>':           /* (?>...) */
2152                 break;
2153             case '$':           /* (?$...) */
2154             case '@':           /* (?@...) */
2155                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2156                 break;
2157             case '#':           /* (?#...) */
2158                 while (*RExC_parse && *RExC_parse != ')')
2159                     RExC_parse++;
2160                 if (*RExC_parse != ')')
2161                     FAIL("Sequence (?#... not terminated");
2162                 nextchar(pRExC_state);
2163                 *flagp = TRYAGAIN;
2164                 return NULL;
2165             case 'p':           /* (?p...) */
2166                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2167                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2168                 /* FALL THROUGH*/
2169             case '?':           /* (??...) */
2170                 logical = 1;
2171                 if (*RExC_parse != '{')
2172                     goto unknown;
2173                 paren = *RExC_parse++;
2174                 /* FALL THROUGH */
2175             case '{':           /* (?{...}) */
2176             {
2177                 I32 count = 1, n = 0;
2178                 char c;
2179                 char *s = RExC_parse;
2180                 SV *sv;
2181                 OP_4tree *sop, *rop;
2182
2183                 RExC_seen_zerolen++;
2184                 RExC_seen |= REG_SEEN_EVAL;
2185                 while (count && (c = *RExC_parse)) {
2186                     if (c == '\\' && RExC_parse[1])
2187                         RExC_parse++;
2188                     else if (c == '{')
2189                         count++;
2190                     else if (c == '}')
2191                         count--;
2192                     RExC_parse++;
2193                 }
2194                 if (*RExC_parse != ')')
2195                 {
2196                     RExC_parse = s;             
2197                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2198                 }
2199                 if (!SIZE_ONLY) {
2200                     AV *av;
2201                 
2202                     if (RExC_parse - 1 - s)
2203                         sv = newSVpvn(s, RExC_parse - 1 - s);
2204                     else
2205                         sv = newSVpvn("", 0);
2206
2207                     ENTER;
2208                     Perl_save_re_context(aTHX);
2209                     rop = sv_compile_2op(sv, &sop, "re", &av);
2210                     sop->op_private |= OPpREFCOUNTED;
2211                     /* re_dup will OpREFCNT_inc */
2212                     OpREFCNT_set(sop, 1);
2213                     LEAVE;
2214
2215                     n = add_data(pRExC_state, 3, "nop");
2216                     RExC_rx->data->data[n] = (void*)rop;
2217                     RExC_rx->data->data[n+1] = (void*)sop;
2218                     RExC_rx->data->data[n+2] = (void*)av;
2219                     SvREFCNT_dec(sv);
2220                 }
2221                 else {                                          /* First pass */
2222                     if (PL_reginterp_cnt < ++RExC_seen_evals
2223                         && PL_curcop != &PL_compiling)
2224                         /* No compiled RE interpolated, has runtime
2225                            components ===> unsafe.  */
2226                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2227                     if (PL_tainting && PL_tainted)
2228                         FAIL("Eval-group in insecure regular expression");
2229                 }
2230                 
2231                 nextchar(pRExC_state);
2232                 if (logical) {
2233                     ret = reg_node(pRExC_state, LOGICAL);
2234                     if (!SIZE_ONLY)
2235                         ret->flags = 2;
2236                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2237                     /* deal with the length of this later - MJD */
2238                     return ret;
2239                 }
2240                 return reganode(pRExC_state, EVAL, n);
2241             }
2242             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2243             {
2244                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2245                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2246                         || RExC_parse[1] == '<'
2247                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2248                         I32 flag;
2249                         
2250                         ret = reg_node(pRExC_state, LOGICAL);
2251                         if (!SIZE_ONLY)
2252                             ret->flags = 1;
2253                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2254                         goto insert_if;
2255                     }
2256                 }
2257                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2258                     /* (?(1)...) */
2259                     parno = atoi(RExC_parse++);
2260
2261                     while (isDIGIT(*RExC_parse))
2262                         RExC_parse++;
2263                     ret = reganode(pRExC_state, GROUPP, parno);
2264                     
2265                     if ((c = *nextchar(pRExC_state)) != ')')
2266                         vFAIL("Switch condition not recognized");
2267                   insert_if:
2268                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2269                     br = regbranch(pRExC_state, &flags, 1);
2270                     if (br == NULL)
2271                         br = reganode(pRExC_state, LONGJMP, 0);
2272                     else
2273                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2274                     c = *nextchar(pRExC_state);
2275                     if (flags&HASWIDTH)
2276                         *flagp |= HASWIDTH;
2277                     if (c == '|') {
2278                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2279                         regbranch(pRExC_state, &flags, 1);
2280                         regtail(pRExC_state, ret, lastbr);
2281                         if (flags&HASWIDTH)
2282                             *flagp |= HASWIDTH;
2283                         c = *nextchar(pRExC_state);
2284                     }
2285                     else
2286                         lastbr = NULL;
2287                     if (c != ')')
2288                         vFAIL("Switch (?(condition)... contains too many branches");
2289                     ender = reg_node(pRExC_state, TAIL);
2290                     regtail(pRExC_state, br, ender);
2291                     if (lastbr) {
2292                         regtail(pRExC_state, lastbr, ender);
2293                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2294                     }
2295                     else
2296                         regtail(pRExC_state, ret, ender);
2297                     return ret;
2298                 }
2299                 else {
2300                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2301                 }
2302             }
2303             case 0:
2304                 RExC_parse--; /* for vFAIL to print correctly */
2305                 vFAIL("Sequence (? incomplete");
2306                 break;
2307             default:
2308                 --RExC_parse;
2309               parse_flags:      /* (?i) */
2310                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2311                     /* (?g), (?gc) and (?o) are useless here
2312                        and must be globally applied -- japhy */
2313
2314                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2315                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2316                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2317                             if (! (wastedflags & wflagbit) ) {
2318                                 wastedflags |= wflagbit;
2319                                 vWARN5(
2320                                     RExC_parse + 1,
2321                                     "Useless (%s%c) - %suse /%c modifier",
2322                                     flagsp == &negflags ? "?-" : "?",
2323                                     *RExC_parse,
2324                                     flagsp == &negflags ? "don't " : "",
2325                                     *RExC_parse
2326                                 );
2327                             }
2328                         }
2329                     }
2330                     else if (*RExC_parse == 'c') {
2331                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2332                             if (! (wastedflags & wasted_c) ) {
2333                                 wastedflags |= wasted_gc;
2334                                 vWARN3(
2335                                     RExC_parse + 1,
2336                                     "Useless (%sc) - %suse /gc modifier",
2337                                     flagsp == &negflags ? "?-" : "?",
2338                                     flagsp == &negflags ? "don't " : ""
2339                                 );
2340                             }
2341                         }
2342                     }
2343                     else { pmflag(flagsp, *RExC_parse); }
2344
2345                     ++RExC_parse;
2346                 }
2347                 if (*RExC_parse == '-') {
2348                     flagsp = &negflags;
2349                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2350                     ++RExC_parse;
2351                     goto parse_flags;
2352                 }
2353                 RExC_flags16 |= posflags;
2354                 RExC_flags16 &= ~negflags;
2355                 if (*RExC_parse == ':') {
2356                     RExC_parse++;
2357                     paren = ':';
2358                     break;
2359                 }               
2360               unknown:
2361                 if (*RExC_parse != ')') {
2362                     RExC_parse++;
2363                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2364                 }
2365                 nextchar(pRExC_state);
2366                 *flagp = TRYAGAIN;
2367                 return NULL;
2368             }
2369         }
2370         else {                  /* (...) */
2371             parno = RExC_npar;
2372             RExC_npar++;
2373             ret = reganode(pRExC_state, OPEN, parno);
2374             Set_Node_Length(ret, 1); /* MJD */
2375             Set_Node_Offset(ret, RExC_parse); /* MJD */
2376             open = 1;
2377         }
2378     }
2379     else                        /* ! paren */
2380         ret = NULL;
2381
2382     /* Pick up the branches, linking them together. */
2383     parse_start = RExC_parse;   /* MJD */
2384     br = regbranch(pRExC_state, &flags, 1);
2385     /*     branch_len = (paren != 0); */
2386     
2387     if (br == NULL)
2388         return(NULL);
2389     if (*RExC_parse == '|') {
2390         if (!SIZE_ONLY && RExC_extralen) {
2391             reginsert(pRExC_state, BRANCHJ, br);
2392         }
2393         else {                  /* MJD */
2394             reginsert(pRExC_state, BRANCH, br);
2395             Set_Node_Length(br, paren != 0);
2396             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2397         }
2398         have_branch = 1;
2399         if (SIZE_ONLY)
2400             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2401     }
2402     else if (paren == ':') {
2403         *flagp |= flags&SIMPLE;
2404     }
2405     if (open) {                         /* Starts with OPEN. */
2406         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2407     }
2408     else if (paren != '?')              /* Not Conditional */
2409         ret = br;
2410     *flagp |= flags & (SPSTART | HASWIDTH);
2411     lastbr = br;
2412     while (*RExC_parse == '|') {
2413         if (!SIZE_ONLY && RExC_extralen) {
2414             ender = reganode(pRExC_state, LONGJMP,0);
2415             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2416         }
2417         if (SIZE_ONLY)
2418             RExC_extralen += 2;         /* Account for LONGJMP. */
2419         nextchar(pRExC_state);
2420         br = regbranch(pRExC_state, &flags, 0);
2421         
2422         if (br == NULL)
2423             return(NULL);
2424         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2425         lastbr = br;
2426         if (flags&HASWIDTH)
2427             *flagp |= HASWIDTH;
2428         *flagp |= flags&SPSTART;
2429     }
2430
2431     if (have_branch || paren != ':') {
2432         /* Make a closing node, and hook it on the end. */
2433         switch (paren) {
2434         case ':':
2435             ender = reg_node(pRExC_state, TAIL);
2436             break;
2437         case 1:
2438             ender = reganode(pRExC_state, CLOSE, parno);
2439             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2440             Set_Node_Length(ender,1); /* MJD */
2441             break;
2442         case '<':
2443         case ',':
2444         case '=':
2445         case '!':
2446             *flagp &= ~HASWIDTH;
2447             /* FALL THROUGH */
2448         case '>':
2449             ender = reg_node(pRExC_state, SUCCEED);
2450             break;
2451         case 0:
2452             ender = reg_node(pRExC_state, END);
2453             break;
2454         }
2455         regtail(pRExC_state, lastbr, ender);
2456
2457         if (have_branch) {
2458             /* Hook the tails of the branches to the closing node. */
2459             for (br = ret; br != NULL; br = regnext(br)) {
2460                 regoptail(pRExC_state, br, ender);
2461             }
2462         }
2463     }
2464
2465     {
2466         char *p;
2467         static char parens[] = "=!<,>";
2468
2469         if (paren && (p = strchr(parens, paren))) {
2470             int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2471             int flag = (p - parens) > 1;
2472
2473             if (paren == '>')
2474                 node = SUSPEND, flag = 0;
2475             reginsert(pRExC_state, node,ret);
2476             ret->flags = flag;
2477             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2478         }
2479     }
2480
2481     /* Check for proper termination. */
2482     if (paren) {
2483         RExC_flags16 = oregflags;
2484         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2485             RExC_parse = oregcomp_parse;
2486             vFAIL("Unmatched (");
2487         }
2488     }
2489     else if (!paren && RExC_parse < RExC_end) {
2490         if (*RExC_parse == ')') {
2491             RExC_parse++;
2492             vFAIL("Unmatched )");
2493         }
2494         else
2495             FAIL("Junk on end of regexp");      /* "Can't happen". */
2496         /* NOTREACHED */
2497     }
2498
2499     return(ret);
2500 }
2501
2502 /*
2503  - regbranch - one alternative of an | operator
2504  *
2505  * Implements the concatenation operator.
2506  */
2507 STATIC regnode *
2508 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2509 {
2510     register regnode *ret;
2511     register regnode *chain = NULL;
2512     register regnode *latest;
2513     I32 flags = 0, c = 0;
2514
2515     if (first)
2516         ret = NULL;
2517     else {
2518         if (!SIZE_ONLY && RExC_extralen)
2519             ret = reganode(pRExC_state, BRANCHJ,0);
2520         else {
2521             ret = reg_node(pRExC_state, BRANCH);
2522             Set_Node_Length(ret, 1);
2523         }
2524     }
2525         
2526     if (!first && SIZE_ONLY)
2527         RExC_extralen += 1;                     /* BRANCHJ */
2528
2529     *flagp = WORST;                     /* Tentatively. */
2530
2531     RExC_parse--;
2532     nextchar(pRExC_state);
2533     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2534         flags &= ~TRYAGAIN;
2535         latest = regpiece(pRExC_state, &flags);
2536         if (latest == NULL) {
2537             if (flags & TRYAGAIN)
2538                 continue;
2539             return(NULL);
2540         }
2541         else if (ret == NULL)
2542             ret = latest;
2543         *flagp |= flags&HASWIDTH;
2544         if (chain == NULL)      /* First piece. */
2545             *flagp |= flags&SPSTART;
2546         else {
2547             RExC_naughty++;
2548             regtail(pRExC_state, chain, latest);
2549         }
2550         chain = latest;
2551         c++;
2552     }
2553     if (chain == NULL) {        /* Loop ran zero times. */
2554         chain = reg_node(pRExC_state, NOTHING);
2555         if (ret == NULL)
2556             ret = chain;
2557     }
2558     if (c == 1) {
2559         *flagp |= flags&SIMPLE;
2560     }
2561
2562     return(ret);
2563 }
2564
2565 /*
2566  - regpiece - something followed by possible [*+?]
2567  *
2568  * Note that the branching code sequences used for ? and the general cases
2569  * of * and + are somewhat optimized:  they use the same NOTHING node as
2570  * both the endmarker for their branch list and the body of the last branch.
2571  * It might seem that this node could be dispensed with entirely, but the
2572  * endmarker role is not redundant.
2573  */
2574 STATIC regnode *
2575 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2576 {
2577     register regnode *ret;
2578     register char op;
2579     register char *next;
2580     I32 flags;
2581     char *origparse = RExC_parse;
2582     char *maxpos;
2583     I32 min;
2584     I32 max = REG_INFTY;
2585     char *parse_start;
2586
2587     ret = regatom(pRExC_state, &flags);
2588     if (ret == NULL) {
2589         if (flags & TRYAGAIN)
2590             *flagp |= TRYAGAIN;
2591         return(NULL);
2592     }
2593
2594     op = *RExC_parse;
2595
2596     if (op == '{' && regcurly(RExC_parse)) {
2597         parse_start = RExC_parse; /* MJD */
2598         next = RExC_parse + 1;
2599         maxpos = Nullch;
2600         while (isDIGIT(*next) || *next == ',') {
2601             if (*next == ',') {
2602                 if (maxpos)
2603                     break;
2604                 else
2605                     maxpos = next;
2606             }
2607             next++;
2608         }
2609         if (*next == '}') {             /* got one */
2610             if (!maxpos)
2611                 maxpos = next;
2612             RExC_parse++;
2613             min = atoi(RExC_parse);
2614             if (*maxpos == ',')
2615                 maxpos++;
2616             else
2617                 maxpos = RExC_parse;
2618             max = atoi(maxpos);
2619             if (!max && *maxpos != '0')
2620                 max = REG_INFTY;                /* meaning "infinity" */
2621             else if (max >= REG_INFTY)
2622                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2623             RExC_parse = next;
2624             nextchar(pRExC_state);
2625
2626         do_curly:
2627             if ((flags&SIMPLE)) {
2628                 RExC_naughty += 2 + RExC_naughty / 2;
2629                 reginsert(pRExC_state, CURLY, ret);
2630                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2631                 Set_Node_Cur_Length(ret);
2632             }
2633             else {
2634                 regnode *w = reg_node(pRExC_state, WHILEM);
2635
2636                 w->flags = 0;
2637                 regtail(pRExC_state, ret, w);
2638                 if (!SIZE_ONLY && RExC_extralen) {
2639                     reginsert(pRExC_state, LONGJMP,ret);
2640                     reginsert(pRExC_state, NOTHING,ret);
2641                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2642                 }
2643                 reginsert(pRExC_state, CURLYX,ret);
2644                                 /* MJD hk */
2645                 Set_Node_Offset(ret, parse_start+1);
2646                 Set_Node_Length(ret, 
2647                                 op == '{' ? (RExC_parse - parse_start) : 1);
2648                 
2649                 if (!SIZE_ONLY && RExC_extralen)
2650                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2651                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2652                 if (SIZE_ONLY)
2653                     RExC_whilem_seen++, RExC_extralen += 3;
2654                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2655             }
2656             ret->flags = 0;
2657
2658             if (min > 0)
2659                 *flagp = WORST;
2660             if (max > 0)
2661                 *flagp |= HASWIDTH;
2662             if (max && max < min)
2663                 vFAIL("Can't do {n,m} with n > m");
2664             if (!SIZE_ONLY) {
2665                 ARG1_SET(ret, min);
2666                 ARG2_SET(ret, max);
2667             }
2668
2669             goto nest_check;
2670         }
2671     }
2672
2673     if (!ISMULT1(op)) {
2674         *flagp = flags;
2675         return(ret);
2676     }
2677
2678 #if 0                           /* Now runtime fix should be reliable. */
2679
2680     /* if this is reinstated, don't forget to put this back into perldiag:
2681
2682             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2683
2684            (F) The part of the regexp subject to either the * or + quantifier
2685            could match an empty string. The {#} shows in the regular
2686            expression about where the problem was discovered.
2687
2688     */
2689
2690     if (!(flags&HASWIDTH) && op != '?')
2691       vFAIL("Regexp *+ operand could be empty");
2692 #endif
2693
2694     parse_start = RExC_parse;
2695     nextchar(pRExC_state);
2696
2697     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2698
2699     if (op == '*' && (flags&SIMPLE)) {
2700         reginsert(pRExC_state, STAR, ret);
2701         ret->flags = 0;
2702         RExC_naughty += 4;
2703     }
2704     else if (op == '*') {
2705         min = 0;
2706         goto do_curly;
2707     }
2708     else if (op == '+' && (flags&SIMPLE)) {
2709         reginsert(pRExC_state, PLUS, ret);
2710         ret->flags = 0;
2711         RExC_naughty += 3;
2712     }
2713     else if (op == '+') {
2714         min = 1;
2715         goto do_curly;
2716     }
2717     else if (op == '?') {
2718         min = 0; max = 1;
2719         goto do_curly;
2720     }
2721   nest_check:
2722     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2723         vWARN3(RExC_parse,
2724                "%.*s matches null string many times",
2725                RExC_parse - origparse,
2726                origparse);
2727     }
2728
2729     if (*RExC_parse == '?') {
2730         nextchar(pRExC_state);
2731         reginsert(pRExC_state, MINMOD, ret);
2732         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2733     }
2734     if (ISMULT2(RExC_parse)) {
2735         RExC_parse++;
2736         vFAIL("Nested quantifiers");
2737     }
2738
2739     return(ret);
2740 }
2741
2742 /*
2743  - regatom - the lowest level
2744  *
2745  * Optimization:  gobbles an entire sequence of ordinary characters so that
2746  * it can turn them into a single node, which is smaller to store and
2747  * faster to run.  Backslashed characters are exceptions, each becoming a
2748  * separate node; the code is simpler that way and it's not worth fixing.
2749  *
2750  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2751 STATIC regnode *
2752 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2753 {
2754     register regnode *ret = 0;
2755     I32 flags;
2756     char *parse_start = 0;
2757
2758     *flagp = WORST;             /* Tentatively. */
2759
2760 tryagain:
2761     switch (*RExC_parse) {
2762     case '^':
2763         RExC_seen_zerolen++;
2764         nextchar(pRExC_state);
2765         if (RExC_flags16 & PMf_MULTILINE)
2766             ret = reg_node(pRExC_state, MBOL);
2767         else if (RExC_flags16 & PMf_SINGLELINE)
2768             ret = reg_node(pRExC_state, SBOL);
2769         else
2770             ret = reg_node(pRExC_state, BOL);
2771         Set_Node_Length(ret, 1); /* MJD */
2772         break;
2773     case '$':
2774         nextchar(pRExC_state);
2775         if (*RExC_parse)
2776             RExC_seen_zerolen++;
2777         if (RExC_flags16 & PMf_MULTILINE)
2778             ret = reg_node(pRExC_state, MEOL);
2779         else if (RExC_flags16 & PMf_SINGLELINE)
2780             ret = reg_node(pRExC_state, SEOL);
2781         else
2782             ret = reg_node(pRExC_state, EOL);
2783         Set_Node_Length(ret, 1); /* MJD */
2784         break;
2785     case '.':
2786         nextchar(pRExC_state);
2787         if (RExC_flags16 & PMf_SINGLELINE)
2788             ret = reg_node(pRExC_state, SANY);
2789         else
2790             ret = reg_node(pRExC_state, REG_ANY);
2791         *flagp |= HASWIDTH|SIMPLE;
2792         RExC_naughty++;
2793         Set_Node_Length(ret, 1); /* MJD */
2794         break;
2795     case '[':
2796     {
2797         char *oregcomp_parse = ++RExC_parse;
2798         ret = regclass(pRExC_state);
2799         if (*RExC_parse != ']') {
2800             RExC_parse = oregcomp_parse;
2801             vFAIL("Unmatched [");
2802         }
2803         nextchar(pRExC_state);
2804         *flagp |= HASWIDTH|SIMPLE;
2805         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2806         break;
2807     }
2808     case '(':
2809         nextchar(pRExC_state);
2810         ret = reg(pRExC_state, 1, &flags);
2811         if (ret == NULL) {
2812                 if (flags & TRYAGAIN) {
2813                     if (RExC_parse == RExC_end) {
2814                          /* Make parent create an empty node if needed. */
2815                         *flagp |= TRYAGAIN;
2816                         return(NULL);
2817                     }
2818                     goto tryagain;
2819                 }
2820                 return(NULL);
2821         }
2822         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2823         break;
2824     case '|':
2825     case ')':
2826         if (flags & TRYAGAIN) {
2827             *flagp |= TRYAGAIN;
2828             return NULL;
2829         }
2830         vFAIL("Internal urp");
2831                                 /* Supposed to be caught earlier. */
2832         break;
2833     case '{':
2834         if (!regcurly(RExC_parse)) {
2835             RExC_parse++;
2836             goto defchar;
2837         }
2838         /* FALL THROUGH */
2839     case '?':
2840     case '+':
2841     case '*':
2842         RExC_parse++;
2843         vFAIL("Quantifier follows nothing");
2844         break;
2845     case '\\':
2846         switch (*++RExC_parse) {
2847         case 'A':
2848             RExC_seen_zerolen++;
2849             ret = reg_node(pRExC_state, SBOL);
2850             *flagp |= SIMPLE;
2851             nextchar(pRExC_state);
2852             Set_Node_Length(ret, 2); /* MJD */
2853             break;
2854         case 'G':
2855             ret = reg_node(pRExC_state, GPOS);
2856             RExC_seen |= REG_SEEN_GPOS;
2857             *flagp |= SIMPLE;
2858             nextchar(pRExC_state);
2859             Set_Node_Length(ret, 2); /* MJD */
2860             break;
2861         case 'Z':
2862             ret = reg_node(pRExC_state, SEOL);
2863             *flagp |= SIMPLE;
2864             RExC_seen_zerolen++;                /* Do not optimize RE away */
2865             nextchar(pRExC_state);
2866             break;
2867         case 'z':
2868             ret = reg_node(pRExC_state, EOS);
2869             *flagp |= SIMPLE;
2870             RExC_seen_zerolen++;                /* Do not optimize RE away */
2871             nextchar(pRExC_state);
2872             Set_Node_Length(ret, 2); /* MJD */
2873             break;
2874         case 'C':
2875             ret = reg_node(pRExC_state, CANY);
2876             RExC_seen |= REG_SEEN_CANY;
2877             *flagp |= HASWIDTH|SIMPLE;
2878             nextchar(pRExC_state);
2879             Set_Node_Length(ret, 2); /* MJD */
2880             break;
2881         case 'X':
2882             ret = reg_node(pRExC_state, CLUMP);
2883             *flagp |= HASWIDTH;
2884             nextchar(pRExC_state);
2885             Set_Node_Length(ret, 2); /* MJD */
2886             break;
2887         case 'w':
2888             ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
2889             *flagp |= HASWIDTH|SIMPLE;
2890             nextchar(pRExC_state);
2891             Set_Node_Length(ret, 2); /* MJD */
2892             break;
2893         case 'W':
2894             ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
2895             *flagp |= HASWIDTH|SIMPLE;
2896             nextchar(pRExC_state);
2897             Set_Node_Length(ret, 2); /* MJD */
2898             break;
2899         case 'b':
2900             RExC_seen_zerolen++;
2901             RExC_seen |= REG_SEEN_LOOKBEHIND;
2902             ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
2903             *flagp |= SIMPLE;
2904             nextchar(pRExC_state);
2905             Set_Node_Length(ret, 2); /* MJD */
2906             break;
2907         case 'B':
2908             RExC_seen_zerolen++;
2909             RExC_seen |= REG_SEEN_LOOKBEHIND;
2910             ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
2911             *flagp |= SIMPLE;
2912             nextchar(pRExC_state);
2913             Set_Node_Length(ret, 2); /* MJD */
2914             break;
2915         case 's':
2916             ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
2917             *flagp |= HASWIDTH|SIMPLE;
2918             nextchar(pRExC_state);
2919             Set_Node_Length(ret, 2); /* MJD */
2920             break;
2921         case 'S':
2922             ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
2923             *flagp |= HASWIDTH|SIMPLE;
2924             nextchar(pRExC_state);
2925             Set_Node_Length(ret, 2); /* MJD */
2926             break;
2927         case 'd':
2928             ret = reg_node(pRExC_state, DIGIT);
2929             *flagp |= HASWIDTH|SIMPLE;
2930             nextchar(pRExC_state);
2931             Set_Node_Length(ret, 2); /* MJD */
2932             break;
2933         case 'D':
2934             ret = reg_node(pRExC_state, NDIGIT);
2935             *flagp |= HASWIDTH|SIMPLE;
2936             nextchar(pRExC_state);
2937             Set_Node_Length(ret, 2); /* MJD */
2938             break;
2939         case 'p':
2940         case 'P':
2941             {   
2942                 char* oldregxend = RExC_end;
2943                 char* parse_start = RExC_parse;
2944
2945                 if (RExC_parse[1] == '{') {
2946                   /* a lovely hack--pretend we saw [\pX] instead */
2947                     RExC_end = strchr(RExC_parse, '}');
2948                     if (!RExC_end) {
2949                         U8 c = (U8)*RExC_parse;
2950                         RExC_parse += 2;
2951                         RExC_end = oldregxend;
2952                         vFAIL2("Missing right brace on \\%c{}", c);
2953                     }
2954                     RExC_end++;
2955                 }
2956                 else
2957                     RExC_end = RExC_parse + 2;
2958                 RExC_parse--;
2959
2960                 ret = regclass(pRExC_state);
2961
2962                 RExC_end = oldregxend;
2963                 RExC_parse--;
2964                 Set_Node_Cur_Length(ret); /* MJD */
2965                 nextchar(pRExC_state);
2966                 *flagp |= HASWIDTH|SIMPLE;
2967             }
2968             break;
2969         case 'n':
2970         case 'r':
2971         case 't':
2972         case 'f':
2973         case 'e':
2974         case 'a':
2975         case 'x':
2976         case 'c':
2977         case '0':
2978             goto defchar;
2979         case '1': case '2': case '3': case '4':
2980         case '5': case '6': case '7': case '8': case '9':
2981             {
2982                 I32 num = atoi(RExC_parse);
2983
2984                 if (num > 9 && num >= RExC_npar)
2985                     goto defchar;
2986                 else {
2987                     char * parse_start = RExC_parse - 1; /* MJD */
2988                     while (isDIGIT(*RExC_parse))
2989                         RExC_parse++;
2990
2991                     if (!SIZE_ONLY && num > RExC_rx->nparens)
2992                         vFAIL("Reference to nonexistent group");
2993                     RExC_sawback = 1;
2994                     ret = reganode(pRExC_state, FOLD
2995                                    ? (LOC ? REFFL : REFF)
2996                                    : REF, num);
2997                     *flagp |= HASWIDTH;
2998                     
2999                     /* override incorrect value set in reganode MJD */
3000                     Set_Node_Offset(ret, parse_start+1); 
3001                     Set_Node_Cur_Length(ret); /* MJD */
3002                     RExC_parse--;
3003                     nextchar(pRExC_state);
3004                 }
3005             }
3006             break;
3007         case '\0':
3008             if (RExC_parse >= RExC_end)
3009                 FAIL("Trailing \\");
3010             /* FALL THROUGH */
3011         default:
3012             /* Do not generate `unrecognized' warnings here, we fall
3013                back into the quick-grab loop below */
3014             goto defchar;
3015         }
3016         break;
3017
3018     case '#':
3019         if (RExC_flags16 & PMf_EXTENDED) {
3020             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3021             if (RExC_parse < RExC_end)
3022                 goto tryagain;
3023         }
3024         /* FALL THROUGH */
3025
3026     default: {
3027             register STRLEN len;
3028             register UV ender;
3029             register char *p;
3030             char *oldp, *s;
3031             STRLEN numlen;
3032             STRLEN foldlen;
3033             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3034
3035             parse_start = RExC_parse - 1;
3036
3037             RExC_parse++;
3038
3039         defchar:
3040             ret = reg_node(pRExC_state, FOLD
3041                           ? (LOC ? EXACTFL : EXACTF)
3042                           : EXACT);
3043             s = STRING(ret);
3044             for (len = 0, p = RExC_parse - 1;
3045               len < 127 && p < RExC_end;
3046               len++)
3047             {
3048                 oldp = p;
3049
3050                 if (RExC_flags16 & PMf_EXTENDED)
3051                     p = regwhite(p, RExC_end);
3052                 switch (*p) {
3053                 case '^':
3054                 case '$':
3055                 case '.':
3056                 case '[':
3057                 case '(':
3058                 case ')':
3059                 case '|':
3060                     goto loopdone;
3061                 case '\\':
3062                     switch (*++p) {
3063                     case 'A':
3064                     case 'C':
3065                     case 'X':
3066                     case 'G':
3067                     case 'Z':
3068                     case 'z':
3069                     case 'w':
3070                     case 'W':
3071                     case 'b':
3072                     case 'B':
3073                     case 's':
3074                     case 'S':
3075                     case 'd':
3076                     case 'D':
3077                     case 'p':
3078                     case 'P':
3079                         --p;
3080                         goto loopdone;
3081                     case 'n':
3082                         ender = '\n';
3083                         p++;
3084                         break;
3085                     case 'r':
3086                         ender = '\r';
3087                         p++;
3088                         break;
3089                     case 't':
3090                         ender = '\t';
3091                         p++;
3092                         break;
3093                     case 'f':
3094                         ender = '\f';
3095                         p++;
3096                         break;
3097                     case 'e':
3098                           ender = ASCII_TO_NATIVE('\033');
3099                         p++;
3100                         break;
3101                     case 'a':
3102                           ender = ASCII_TO_NATIVE('\007');
3103                         p++;
3104                         break;
3105                     case 'x':
3106                         if (*++p == '{') {
3107                             char* e = strchr(p, '}');
3108         
3109                             if (!e) {
3110                                 RExC_parse = p + 1;
3111                                 vFAIL("Missing right brace on \\x{}");
3112                             }
3113                             else {
3114                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3115                                     | PERL_SCAN_DISALLOW_PREFIX;
3116                                 numlen = e - p - 1;
3117                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3118                                 if (ender > 0xff)
3119                                     RExC_utf8 = 1;
3120                                 /* numlen is generous */
3121                                 if (numlen + len >= 127) {
3122                                     p--;
3123                                     goto loopdone;
3124                                 }
3125                                 p = e + 1;
3126                             }
3127                         }
3128                         else {
3129                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3130                             numlen = 2;
3131                             ender = grok_hex(p, &numlen, &flags, NULL);
3132                             p += numlen;
3133                         }
3134                         break;
3135                     case 'c':
3136                         p++;
3137                         ender = UCHARAT(p++);
3138                         ender = toCTRL(ender);
3139                         break;
3140                     case '0': case '1': case '2': case '3':case '4':
3141                     case '5': case '6': case '7': case '8':case '9':
3142                         if (*p == '0' ||
3143                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3144                             I32 flags = 0;
3145                             numlen = 3;
3146                             ender = grok_oct(p, &numlen, &flags, NULL);
3147                             p += numlen;
3148                         }
3149                         else {
3150                             --p;
3151                             goto loopdone;
3152                         }
3153                         break;
3154                     case '\0':
3155                         if (p >= RExC_end)
3156                             FAIL("Trailing \\");
3157                         /* FALL THROUGH */
3158                     default:
3159                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3160                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3161                         goto normal_default;
3162                     }
3163                     break;
3164                 default:
3165                   normal_default:
3166                     if (UTF8_IS_START(*p) && UTF) {
3167                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3168                                                &numlen, 0);
3169                         p += numlen;
3170                     }
3171                     else
3172                         ender = *p++;
3173                     break;
3174                 }
3175                 if (RExC_flags16 & PMf_EXTENDED)
3176                     p = regwhite(p, RExC_end);
3177                 if (UTF && FOLD) {
3178                     /* Prime the casefolded buffer. */
3179                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3180                 }
3181                 if (ISMULT2(p)) { /* Back off on ?+*. */
3182                     if (len)
3183                         p = oldp;
3184                     else if (UTF) {
3185                          STRLEN unilen;
3186
3187                          if (FOLD) {
3188                               /* Emit all the Unicode characters. */
3189                               for (foldbuf = tmpbuf;
3190                                    foldlen;
3191                                    foldlen -= numlen) {
3192                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3193                                    if (numlen > 0) {
3194                                         reguni(pRExC_state, ender, s, &unilen);
3195                                         s       += unilen;
3196                                         len     += unilen;
3197                                         /* In EBCDIC the numlen
3198                                          * and unilen can differ. */
3199                                         foldbuf += numlen;
3200                                         if (numlen >= foldlen)
3201                                              break;
3202                                    }
3203                                    else
3204                                         break; /* "Can't happen." */
3205                               }
3206                          }
3207                          else {
3208                               reguni(pRExC_state, ender, s, &unilen);
3209                               if (numlen > 0) {
3210                                    s   += unilen;
3211                                    len += unilen;
3212                               }
3213                          }
3214                     }
3215                     else {
3216                         len++;
3217                         REGC(ender, s++);
3218                     }
3219                     break;
3220                 }
3221                 if (UTF) {
3222                      STRLEN unilen;
3223
3224                      if (FOLD) {
3225                           /* Emit all the Unicode characters. */
3226                           for (foldbuf = tmpbuf;
3227                                foldlen;
3228                                foldlen -= numlen) {
3229                                ender = utf8_to_uvchr(foldbuf, &numlen);
3230                                if (numlen > 0) {
3231                                     reguni(pRExC_state, ender, s, &unilen);
3232                                     len     += unilen;
3233                                     s       += unilen;
3234                                     /* In EBCDIC the numlen
3235                                      * and unilen can differ. */
3236                                     foldbuf += numlen;
3237                                     if (numlen >= foldlen)
3238                                          break;
3239                                }
3240                                else
3241                                     break;
3242                           }
3243                      }
3244                      else {
3245                           reguni(pRExC_state, ender, s, &unilen);
3246                           if (numlen > 0) {
3247                                s   += unilen;
3248                                len += unilen;
3249                           }
3250                      }
3251                      len--;
3252                 }
3253                 else
3254                     REGC(ender, s++);
3255             }
3256         loopdone:
3257             RExC_parse = p - 1;
3258             Set_Node_Cur_Length(ret); /* MJD */
3259             nextchar(pRExC_state);
3260             {
3261                 /* len is STRLEN which is unsigned, need to copy to signed */
3262                 IV iv = len;
3263                 if (iv < 0)
3264                     vFAIL("Internal disaster");
3265             }
3266             if (len > 0)
3267                 *flagp |= HASWIDTH;
3268             if (len == 1)
3269                 *flagp |= SIMPLE;
3270             if (!SIZE_ONLY)
3271                 STR_LEN(ret) = len;
3272             if (SIZE_ONLY)
3273                 RExC_size += STR_SZ(len);
3274             else
3275                 RExC_emit += STR_SZ(len);
3276         }
3277         break;
3278     }
3279
3280     /* If the encoding pragma is in effect recode the text of
3281      * any EXACT-kind nodes. */
3282     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3283          STRLEN oldlen = STR_LEN(ret);
3284          SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3285
3286          if (RExC_utf8)
3287               SvUTF8_on(sv);
3288          if (sv_utf8_downgrade(sv, TRUE)) {
3289               char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3290               STRLEN newlen = SvCUR(sv);
3291          
3292               if (!SIZE_ONLY) {
3293                    DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3294                                          (int)oldlen, STRING(ret),
3295                                          (int)newlen, s));
3296                    Copy(s, STRING(ret), newlen, char);
3297                    STR_LEN(ret) += newlen - oldlen;
3298                    RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3299               } else
3300                    RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3301          }
3302     }
3303
3304     return(ret);
3305 }
3306
3307 STATIC char *
3308 S_regwhite(pTHX_ char *p, char *e)
3309 {
3310     while (p < e) {
3311         if (isSPACE(*p))
3312             ++p;
3313         else if (*p == '#') {
3314             do {
3315                 p++;
3316             } while (p < e && *p != '\n');
3317         }
3318         else
3319             break;
3320     }
3321     return p;
3322 }
3323
3324 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3325    Character classes ([:foo:]) can also be negated ([:^foo:]).
3326    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3327    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3328    but trigger failures because they are currently unimplemented. */
3329
3330 #define POSIXCC_DONE(c)   ((c) == ':')
3331 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3332 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3333
3334 STATIC I32
3335 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3336 {
3337     char *posixcc = 0;
3338     I32 namedclass = OOB_NAMEDCLASS;
3339
3340     if (value == '[' && RExC_parse + 1 < RExC_end &&
3341         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3342         POSIXCC(UCHARAT(RExC_parse))) {
3343         char  c = UCHARAT(RExC_parse);
3344         char* s = RExC_parse++;
3345         
3346         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3347             RExC_parse++;
3348         if (RExC_parse == RExC_end)
3349             /* Grandfather lone [:, [=, [. */
3350             RExC_parse = s;
3351         else {
3352             char* t = RExC_parse++; /* skip over the c */
3353
3354             if (UCHARAT(RExC_parse) == ']') {
3355                 RExC_parse++; /* skip over the ending ] */
3356                 posixcc = s + 1;
3357                 if (*s == ':') {
3358                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3359                     I32 skip = 5; /* the most common skip */
3360
3361                     switch (*posixcc) {
3362                     case 'a':
3363                         if (strnEQ(posixcc, "alnum", 5))
3364                             namedclass =
3365                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3366                         else if (strnEQ(posixcc, "alpha", 5))
3367                             namedclass =
3368                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3369                         else if (strnEQ(posixcc, "ascii", 5))
3370                             namedclass =
3371                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3372                         break;
3373                     case 'b':
3374                         if (strnEQ(posixcc, "blank", 5))
3375                             namedclass =
3376                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3377                         break;
3378                     case 'c':
3379                         if (strnEQ(posixcc, "cntrl", 5))
3380                             namedclass =
3381                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3382                         break;
3383                     case 'd':
3384                         if (strnEQ(posixcc, "digit", 5))
3385                             namedclass =
3386                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3387                         break;
3388                     case 'g':
3389                         if (strnEQ(posixcc, "graph", 5))
3390                             namedclass =
3391                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3392                         break;
3393                     case 'l':
3394                         if (strnEQ(posixcc, "lower", 5))
3395                             namedclass =
3396                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3397                         break;
3398                     case 'p':
3399                         if (strnEQ(posixcc, "print", 5))
3400                             namedclass =
3401                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3402                         else if (strnEQ(posixcc, "punct", 5))
3403                             namedclass =
3404                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3405                         break;
3406                     case 's':
3407                         if (strnEQ(posixcc, "space", 5))
3408                             namedclass =
3409                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3410                         break;
3411                     case 'u':
3412                         if (strnEQ(posixcc, "upper", 5))
3413                             namedclass =
3414                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3415                         break;
3416                     case 'w': /* this is not POSIX, this is the Perl \w */
3417                         if (strnEQ(posixcc, "word", 4)) {
3418                             namedclass =
3419                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3420                             skip = 4;
3421                         }
3422                         break;
3423                     case 'x':
3424                         if (strnEQ(posixcc, "xdigit", 6)) {
3425                             namedclass =
3426                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3427                             skip = 6;
3428                         }
3429                         break;
3430                     }
3431                     if (namedclass == OOB_NAMEDCLASS ||
3432                         posixcc[skip] != ':' ||
3433                         posixcc[skip+1] != ']')
3434                     {
3435                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3436                                       t - s - 1, s + 1);
3437                     }
3438                 } else if (!SIZE_ONLY) {
3439                     /* [[=foo=]] and [[.foo.]] are still future. */
3440
3441                     /* adjust RExC_parse so the warning shows after
3442                        the class closes */
3443                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3444                         RExC_parse++;
3445                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3446                 }
3447             } else {
3448                 /* Maternal grandfather:
3449                  * "[:" ending in ":" but not in ":]" */
3450                 RExC_parse = s;
3451             }
3452         }
3453     }
3454
3455     return namedclass;
3456 }
3457
3458 STATIC void
3459 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3460 {
3461     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3462         char *s = RExC_parse;
3463         char  c = *s++;
3464
3465         while(*s && isALNUM(*s))
3466             s++;
3467         if (*s && c == *s && s[1] == ']') {
3468             vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3469
3470             /* [[=foo=]] and [[.foo.]] are still future. */
3471             if (POSIXCC_NOTYET(c)) {
3472                 /* adjust RExC_parse so the error shows after
3473                    the class closes */
3474                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3475                     ;
3476                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3477             }
3478         }
3479     }
3480 }
3481
3482 STATIC regnode *
3483 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3484 {
3485     register UV value;
3486     register UV nextvalue;
3487     register IV prevvalue = OOB_UNICODE;
3488     register IV range = 0;
3489     register regnode *ret;
3490     STRLEN numlen;
3491     IV namedclass;
3492     char *rangebegin = 0;
3493     bool need_class = 0;
3494     SV *listsv = Nullsv;
3495     register char *e;
3496     UV n;
3497     bool optimize_invert   = TRUE;
3498     AV* unicode_alternate  = 0;
3499
3500     ret = reganode(pRExC_state, ANYOF, 0);
3501
3502     if (!SIZE_ONLY)
3503         ANYOF_FLAGS(ret) = 0;
3504
3505     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3506         RExC_naughty++;
3507         RExC_parse++;
3508         if (!SIZE_ONLY)
3509             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3510     }
3511
3512     if (SIZE_ONLY)
3513         RExC_size += ANYOF_SKIP;
3514     else {
3515         RExC_emit += ANYOF_SKIP;
3516         if (FOLD)
3517             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3518         if (LOC)
3519             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3520         ANYOF_BITMAP_ZERO(ret);
3521         listsv = newSVpvn("# comment\n", 10);
3522     }
3523
3524     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3525
3526     if (!SIZE_ONLY && POSIXCC(nextvalue))
3527         checkposixcc(pRExC_state);
3528
3529     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3530     if (UCHARAT(RExC_parse) == ']')
3531         goto charclassloop;
3532
3533     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3534
3535     charclassloop:
3536
3537         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3538
3539         if (!range)
3540             rangebegin = RExC_parse;
3541         if (UTF) {
3542             value = utf8n_to_uvchr((U8*)RExC_parse,
3543                                    RExC_end - RExC_parse,
3544                                    &numlen, 0);
3545             RExC_parse += numlen;
3546         }
3547         else
3548             value = UCHARAT(RExC_parse++);
3549         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3550         if (value == '[' && POSIXCC(nextvalue))
3551             namedclass = regpposixcc(pRExC_state, value);
3552         else if (value == '\\') {
3553             if (UTF) {
3554                 value = utf8n_to_uvchr((U8*)RExC_parse,
3555                                    RExC_end - RExC_parse,
3556                                    &numlen, 0);
3557                 RExC_parse += numlen;
3558             }
3559             else
3560                 value = UCHARAT(RExC_parse++);
3561             /* Some compilers cannot handle switching on 64-bit integer
3562              * values, therefore value cannot be an UV.  Yes, this will
3563              * be a problem later if we want switch on Unicode.
3564              * A similar issue a little bit later when switching on
3565              * namedclass. --jhi */
3566             switch ((I32)value) {
3567             case 'w':   namedclass = ANYOF_ALNUM;       break;
3568             case 'W':   namedclass = ANYOF_NALNUM;      break;
3569             case 's':   namedclass = ANYOF_SPACE;       break;
3570             case 'S':   namedclass = ANYOF_NSPACE;      break;
3571             case 'd':   namedclass = ANYOF_DIGIT;       break;
3572             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3573             case 'p':
3574             case 'P':
3575                 if (*RExC_parse == '{') {
3576                     U8 c = (U8)value;
3577                     e = strchr(RExC_parse++, '}');
3578                     if (!e)
3579                         vFAIL2("Missing right brace on \\%c{}", c);
3580                     while (isSPACE(UCHARAT(RExC_parse)))
3581                         RExC_parse++;
3582                     if (e == RExC_parse)
3583                         vFAIL2("Empty \\%c{}", c);
3584                     n = e - RExC_parse;
3585                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3586                         n--;
3587                 }
3588                 else {
3589                     e = RExC_parse;
3590                     n = 1;
3591                 }
3592                 if (!SIZE_ONLY) {
3593                     if (UCHARAT(RExC_parse) == '^') {
3594                          RExC_parse++;
3595                          n--;
3596                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3597                          while (isSPACE(UCHARAT(RExC_parse))) {
3598                               RExC_parse++;
3599                               n--;
3600                          }
3601                     }
3602                     if (value == 'p')
3603                          Perl_sv_catpvf(aTHX_ listsv,
3604                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3605                     else
3606                          Perl_sv_catpvf(aTHX_ listsv,
3607                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3608                 }
3609                 RExC_parse = e + 1;
3610                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3611                 continue;
3612             case 'n':   value = '\n';                   break;
3613             case 'r':   value = '\r';                   break;
3614             case 't':   value = '\t';                   break;
3615             case 'f':   value = '\f';                   break;
3616             case 'b':   value = '\b';                   break;
3617             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3618             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3619             case 'x':
3620                 if (*RExC_parse == '{') {
3621                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3622                         | PERL_SCAN_DISALLOW_PREFIX;
3623                     e = strchr(RExC_parse++, '}');
3624                     if (!e)
3625                         vFAIL("Missing right brace on \\x{}");
3626
3627                     numlen = e - RExC_parse;
3628                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3629                     RExC_parse = e + 1;
3630                 }
3631                 else {
3632                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3633                     numlen = 2;
3634                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3635                     RExC_parse += numlen;
3636                 }
3637                 break;
3638             case 'c':
3639                 value = UCHARAT(RExC_parse++);
3640                 value = toCTRL(value);
3641                 break;
3642             case '0': case '1': case '2': case '3': case '4':
3643             case '5': case '6': case '7': case '8': case '9':
3644             {
3645                 I32 flags = 0;
3646                 numlen = 3;
3647                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3648                 RExC_parse += numlen;
3649                 break;
3650             }
3651             default:
3652                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3653                     vWARN2(RExC_parse,
3654                            "Unrecognized escape \\%c in character class passed through",
3655                            (int)value);
3656                 break;
3657             }
3658         } /* end of \blah */
3659
3660         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3661
3662             if (!SIZE_ONLY && !need_class)
3663                 ANYOF_CLASS_ZERO(ret);
3664
3665             need_class = 1;
3666
3667             /* a bad range like a-\d, a-[:digit:] ? */
3668             if (range) {
3669                 if (!SIZE_ONLY) {
3670                     if (ckWARN(WARN_REGEXP))
3671                         vWARN4(RExC_parse,
3672                                "False [] range \"%*.*s\"",
3673                                RExC_parse - rangebegin,
3674                                RExC_parse - rangebegin,
3675                                rangebegin);
3676                     if (prevvalue < 256) {
3677                         ANYOF_BITMAP_SET(ret, prevvalue);
3678                         ANYOF_BITMAP_SET(ret, '-');
3679                     }
3680                     else {
3681                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3682                         Perl_sv_catpvf(aTHX_ listsv,
3683                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3684                     }
3685                 }
3686
3687                 range = 0; /* this was not a true range */
3688             }
3689
3690             if (!SIZE_ONLY) {
3691                 if (namedclass > OOB_NAMEDCLASS)
3692                     optimize_invert = FALSE;
3693                 /* Possible truncation here but in some 64-bit environments
3694                  * the compiler gets heartburn about switch on 64-bit values.
3695                  * A similar issue a little earlier when switching on value.
3696                  * --jhi */
3697                 switch ((I32)namedclass) {
3698                 case ANYOF_ALNUM:
3699                     if (LOC)
3700                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3701                     else {
3702                         for (value = 0; value < 256; value++)
3703                             if (isALNUM(value))
3704                                 ANYOF_BITMAP_SET(ret, value);
3705                     }
3706                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3707                     break;
3708                 case ANYOF_NALNUM:
3709                     if (LOC)
3710                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3711                     else {
3712                         for (value = 0; value < 256; value++)
3713                             if (!isALNUM(value))
3714                                 ANYOF_BITMAP_SET(ret, value);
3715                     }
3716                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3717                     break;
3718                 case ANYOF_ALNUMC:
3719                     if (LOC)
3720                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3721                     else {
3722                         for (value = 0; value < 256; value++)
3723                             if (isALNUMC(value))
3724                                 ANYOF_BITMAP_SET(ret, value);
3725                     }
3726                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3727                     break;
3728                 case ANYOF_NALNUMC:
3729                     if (LOC)
3730                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3731                     else {
3732                         for (value = 0; value < 256; value++)
3733                             if (!isALNUMC(value))
3734                                 ANYOF_BITMAP_SET(ret, value);
3735                     }
3736                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3737                     break;
3738                 case ANYOF_ALPHA:
3739                     if (LOC)
3740                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3741                     else {
3742                         for (value = 0; value < 256; value++)
3743                             if (isALPHA(value))
3744                                 ANYOF_BITMAP_SET(ret, value);
3745                     }
3746                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3747                     break;
3748                 case ANYOF_NALPHA:
3749                     if (LOC)
3750                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3751                     else {
3752                         for (value = 0; value < 256; value++)
3753                             if (!isALPHA(value))
3754                                 ANYOF_BITMAP_SET(ret, value);
3755                     }
3756                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3757                     break;
3758                 case ANYOF_ASCII:
3759                     if (LOC)
3760                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3761                     else {
3762 #ifndef EBCDIC
3763                         for (value = 0; value < 128; value++)
3764                             ANYOF_BITMAP_SET(ret, value);
3765 #else  /* EBCDIC */
3766                         for (value = 0; value < 256; value++) {
3767                             if (isASCII(value))
3768                                 ANYOF_BITMAP_SET(ret, value);
3769                         }
3770 #endif /* EBCDIC */
3771                     }
3772                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3773                     break;
3774                 case ANYOF_NASCII:
3775                     if (LOC)
3776                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3777                     else {
3778 #ifndef EBCDIC
3779                         for (value = 128; value < 256; value++)
3780                             ANYOF_BITMAP_SET(ret, value);
3781 #else  /* EBCDIC */
3782                         for (value = 0; value < 256; value++) {
3783                             if (!isASCII(value))
3784                                 ANYOF_BITMAP_SET(ret, value);
3785                         }
3786 #endif /* EBCDIC */
3787                     }
3788                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3789                     break;
3790                 case ANYOF_BLANK:
3791                     if (LOC)
3792                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3793                     else {
3794                         for (value = 0; value < 256; value++)
3795                             if (isBLANK(value))
3796                                 ANYOF_BITMAP_SET(ret, value);
3797                     }
3798                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3799                     break;
3800                 case ANYOF_NBLANK:
3801                     if (LOC)
3802                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3803                     else {
3804                         for (value = 0; value < 256; value++)
3805                             if (!isBLANK(value))
3806                                 ANYOF_BITMAP_SET(ret, value);
3807                     }
3808                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3809                     break;
3810                 case ANYOF_CNTRL:
3811                     if (LOC)
3812                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3813                     else {
3814                         for (value = 0; value < 256; value++)
3815                             if (isCNTRL(value))
3816                                 ANYOF_BITMAP_SET(ret, value);
3817                     }
3818                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3819                     break;
3820                 case ANYOF_NCNTRL:
3821                     if (LOC)
3822                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3823                     else {
3824                         for (value = 0; value < 256; value++)
3825                             if (!isCNTRL(value))
3826                                 ANYOF_BITMAP_SET(ret, value);
3827                     }
3828                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3829                     break;
3830                 case ANYOF_DIGIT:
3831                     if (LOC)
3832                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3833                     else {
3834                         /* consecutive digits assumed */
3835                         for (value = '0'; value <= '9'; value++)
3836                             ANYOF_BITMAP_SET(ret, value);
3837                     }
3838                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3839                     break;
3840                 case ANYOF_NDIGIT:
3841                     if (LOC)
3842                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3843                     else {
3844                         /* consecutive digits assumed */
3845                         for (value = 0; value < '0'; value++)
3846                             ANYOF_BITMAP_SET(ret, value);
3847                         for (value = '9' + 1; value < 256; value++)
3848                             ANYOF_BITMAP_SET(ret, value);
3849                     }
3850                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3851                     break;
3852                 case ANYOF_GRAPH:
3853                     if (LOC)
3854                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3855                     else {
3856                         for (value = 0; value < 256; value++)
3857                             if (isGRAPH(value))
3858                                 ANYOF_BITMAP_SET(ret, value);
3859                     }
3860                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3861                     break;
3862                 case ANYOF_NGRAPH:
3863                     if (LOC)
3864                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3865                     else {
3866                         for (value = 0; value < 256; value++)
3867                             if (!isGRAPH(value))
3868                                 ANYOF_BITMAP_SET(ret, value);
3869                     }
3870                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3871                     break;
3872                 case ANYOF_LOWER:
3873                     if (LOC)
3874                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3875                     else {
3876                         for (value = 0; value < 256; value++)
3877                             if (isLOWER(value))
3878                                 ANYOF_BITMAP_SET(ret, value);
3879                     }
3880                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3881                     break;
3882                 case ANYOF_NLOWER:
3883                     if (LOC)
3884                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3885                     else {
3886                         for (value = 0; value < 256; value++)
3887                             if (!isLOWER(value))
3888                                 ANYOF_BITMAP_SET(ret, value);
3889                     }
3890                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3891                     break;
3892                 case ANYOF_PRINT:
3893                     if (LOC)
3894                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3895                     else {
3896                         for (value = 0; value < 256; value++)
3897                             if (isPRINT(value))
3898                                 ANYOF_BITMAP_SET(ret, value);
3899                     }
3900                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3901                     break;
3902                 case ANYOF_NPRINT:
3903                     if (LOC)
3904                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3905                     else {
3906                         for (value = 0; value < 256; value++)
3907                             if (!isPRINT(value))
3908                                 ANYOF_BITMAP_SET(ret, value);
3909                     }
3910                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3911                     break;
3912                 case ANYOF_PSXSPC:
3913                     if (LOC)
3914                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3915                     else {
3916                         for (value = 0; value < 256; value++)
3917                             if (isPSXSPC(value))
3918                                 ANYOF_BITMAP_SET(ret, value);
3919                     }
3920                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3921                     break;
3922                 case ANYOF_NPSXSPC:
3923                     if (LOC)
3924                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3925                     else {
3926                         for (value = 0; value < 256; value++)
3927                             if (!isPSXSPC(value))
3928                                 ANYOF_BITMAP_SET(ret, value);
3929                     }
3930                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3931                     break;
3932                 case ANYOF_PUNCT:
3933                     if (LOC)
3934                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3935                     else {
3936                         for (value = 0; value < 256; value++)
3937                             if (isPUNCT(value))
3938                                 ANYOF_BITMAP_SET(ret, value);
3939                     }
3940                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3941                     break;
3942                 case ANYOF_NPUNCT:
3943                     if (LOC)
3944                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3945                     else {
3946                         for (value = 0; value < 256; value++)
3947                             if (!isPUNCT(value))
3948                                 ANYOF_BITMAP_SET(ret, value);
3949                     }
3950                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3951                     break;
3952                 case ANYOF_SPACE:
3953                     if (LOC)
3954                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3955                     else {
3956                         for (value = 0; value < 256; value++)
3957                             if (isSPACE(value))
3958                                 ANYOF_BITMAP_SET(ret, value);
3959                     }
3960                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3961                     break;
3962                 case ANYOF_NSPACE:
3963                     if (LOC)
3964                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3965                     else {
3966                         for (value = 0; value < 256; value++)
3967                             if (!isSPACE(value))
3968                                 ANYOF_BITMAP_SET(ret, value);
3969                     }
3970                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3971                     break;
3972                 case ANYOF_UPPER:
3973                     if (LOC)
3974                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3975                     else {
3976                         for (value = 0; value < 256; value++)
3977                             if (isUPPER(value))
3978                                 ANYOF_BITMAP_SET(ret, value);
3979                     }
3980                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
3981                     break;
3982                 case ANYOF_NUPPER:
3983                     if (LOC)
3984                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
3985                     else {
3986                         for (value = 0; value < 256; value++)
3987                             if (!isUPPER(value))
3988                                 ANYOF_BITMAP_SET(ret, value);
3989                     }
3990                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
3991                     break;
3992                 case ANYOF_XDIGIT:
3993                     if (LOC)
3994                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
3995                     else {
3996                         for (value = 0; value < 256; value++)
3997                             if (isXDIGIT(value))
3998                                 ANYOF_BITMAP_SET(ret, value);
3999                     }
4000                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4001                     break;
4002                 case ANYOF_NXDIGIT:
4003                     if (LOC)
4004                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4005                     else {
4006                         for (value = 0; value < 256; value++)
4007                             if (!isXDIGIT(value))
4008                                 ANYOF_BITMAP_SET(ret, value);
4009                     }
4010                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4011                     break;
4012                 default:
4013                     vFAIL("Invalid [::] class");
4014                     break;
4015                 }
4016                 if (LOC)
4017                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4018                 continue;
4019             }
4020         } /* end of namedclass \blah */
4021
4022         if (range) {
4023             if (prevvalue > value) /* b-a */ {
4024                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4025                               RExC_parse - rangebegin,
4026                               RExC_parse - rangebegin,
4027                               rangebegin);
4028                 range = 0; /* not a valid range */
4029             }
4030         }
4031         else {
4032             prevvalue = value; /* save the beginning of the range */
4033             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4034                 RExC_parse[1] != ']') {
4035                 RExC_parse++;
4036
4037                 /* a bad range like \w-, [:word:]- ? */
4038                 if (namedclass > OOB_NAMEDCLASS) {
4039                     if (ckWARN(WARN_REGEXP))
4040                         vWARN4(RExC_parse,
4041                                "False [] range \"%*.*s\"",
4042                                RExC_parse - rangebegin,
4043                                RExC_parse - rangebegin,
4044                                rangebegin);
4045                     if (!SIZE_ONLY)
4046                         ANYOF_BITMAP_SET(ret, '-');
4047                 } else
4048                     range = 1;  /* yeah, it's a range! */
4049                 continue;       /* but do it the next time */
4050             }
4051         }
4052
4053         /* now is the next time */
4054         if (!SIZE_ONLY) {
4055             IV i;
4056
4057             if (prevvalue < 256) {
4058                 IV ceilvalue = value < 256 ? value : 255;
4059
4060 #ifdef EBCDIC
4061                 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4062                     (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4063                 {
4064                     if (isLOWER(prevvalue)) {
4065                         for (i = prevvalue; i <= ceilvalue; i++)
4066                             if (isLOWER(i))
4067                                 ANYOF_BITMAP_SET(ret, i);
4068                     } else {
4069                         for (i = prevvalue; i <= ceilvalue; i++)
4070                             if (isUPPER(i))
4071                                 ANYOF_BITMAP_SET(ret, i);
4072                     }
4073                 }
4074                 else
4075 #endif
4076                       for (i = prevvalue; i <= ceilvalue; i++)
4077                           ANYOF_BITMAP_SET(ret, i);
4078           }
4079           if (value > 255 || UTF) {
4080                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4081                 UV natvalue      = NATIVE_TO_UNI(value);
4082
4083                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4084                 if (prevnatvalue < natvalue) { /* what about > ? */
4085                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4086                                    prevnatvalue, natvalue);
4087                 }
4088                 else if (prevnatvalue == natvalue) {
4089                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4090                     if (FOLD) {
4091                          U8 tmpbuf [UTF8_MAXLEN+1];
4092                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4093                          STRLEN foldlen;
4094                          UV f;
4095
4096                          uvchr_to_utf8(tmpbuf, natvalue);
4097                          to_utf8_fold(tmpbuf, foldbuf, &foldlen);
4098                          f = UNI_TO_NATIVE(utf8_to_uvchr(foldbuf, 0));
4099
4100                          /* If folding and foldable and a single
4101                           * character, insert also the folded version
4102                           * to the charclass. */
4103                          if (f != value) {
4104                               if (foldlen == UNISKIP(f))
4105                                   Perl_sv_catpvf(aTHX_ listsv,
4106                                                  "%04"UVxf"\n", f);
4107                               else {
4108                                   /* Any multicharacter foldings
4109                                    * require the following transform:
4110                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4111                                    * where E folds into "pq" and F folds
4112                                    * into "rst", all other characters
4113                                    * fold to single characters.  We save
4114                                    * away these multicharacter foldings,
4115                                    * to be later saved as part of the
4116                                    * additional "s" data. */
4117                                   SV *sv;
4118
4119                                   if (!unicode_alternate)
4120                                       unicode_alternate = newAV();
4121                                   sv = newSVpvn((char*)foldbuf, foldlen);
4122                                   SvUTF8_on(sv);
4123                                   av_push(unicode_alternate, sv);
4124                               }
4125                          }
4126
4127                          /* If folding and the value is one of the Greek
4128                           * sigmas insert a few more sigmas to make the
4129                           * folding rules of the sigmas to work right.
4130                           * Note that not all the possible combinations
4131                           * are handled here: some of them are handled
4132                           * by the standard folding rules, and some of
4133                           * them (literal or EXACTF cases) are handled
4134                           * during runtime in regexec.c:S_find_byclass(). */
4135                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4136                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4137                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4138                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4139                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4140                          }
4141                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4142                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4143                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4144                     }
4145                 }
4146             }
4147         }
4148
4149         range = 0; /* this range (if it was one) is done now */
4150     }
4151
4152     if (need_class) {
4153         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4154         if (SIZE_ONLY)
4155             RExC_size += ANYOF_CLASS_ADD_SKIP;
4156         else
4157             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4158     }
4159
4160     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4161     if (!SIZE_ONLY &&
4162          /* If the only flag is folding (plus possibly inversion). */
4163         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4164        ) {
4165         for (value = 0; value < 256; ++value) {
4166             if (ANYOF_BITMAP_TEST(ret, value)) {
4167                 IV fold = PL_fold[value];
4168
4169                 if (fold != value)
4170                     ANYOF_BITMAP_SET(ret, fold);
4171             }
4172         }
4173         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4174     }
4175
4176     /* optimize inverted simple patterns (e.g. [^a-z]) */
4177     if (!SIZE_ONLY && optimize_invert &&
4178         /* If the only flag is inversion. */
4179         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4180         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4181             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4182         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4183     }
4184
4185     if (!SIZE_ONLY) {
4186         AV *av = newAV();
4187         SV *rv;
4188
4189         /* The 0th element stores the character class description
4190          * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4191          * to initialize the appropriate swash (which gets stored in
4192          * the 1st element), and also useful for dumping the regnode.
4193          * The 2nd element stores the multicharacter foldings,
4194          * used later (regexec.c:s_reginclasslen()). */
4195         av_store(av, 0, listsv);
4196         av_store(av, 1, NULL);
4197         av_store(av, 2, (SV*)unicode_alternate);
4198         rv = newRV_noinc((SV*)av);
4199         n = add_data(pRExC_state, 1, "s");
4200         RExC_rx->data->data[n] = (void*)rv;
4201         ARG_SET(ret, n);
4202     }
4203
4204     return ret;
4205 }
4206
4207 STATIC char*
4208 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4209 {
4210     char* retval = RExC_parse++;
4211
4212     for (;;) {
4213         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4214                 RExC_parse[2] == '#') {
4215             while (*RExC_parse && *RExC_parse != ')')
4216                 RExC_parse++;
4217             RExC_parse++;
4218             continue;
4219         }
4220         if (RExC_flags16 & PMf_EXTENDED) {
4221             if (isSPACE(*RExC_parse)) {
4222                 RExC_parse++;
4223                 continue;
4224             }
4225             else if (*RExC_parse == '#') {
4226                 while (*RExC_parse && *RExC_parse != '\n')
4227                     RExC_parse++;
4228                 RExC_parse++;
4229                 continue;
4230             }
4231         }
4232         return retval;
4233     }
4234 }
4235
4236 /*
4237 - reg_node - emit a node
4238 */
4239 STATIC regnode *                        /* Location. */
4240 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4241 {
4242     register regnode *ret;
4243     register regnode *ptr;
4244
4245     ret = RExC_emit;
4246     if (SIZE_ONLY) {
4247         SIZE_ALIGN(RExC_size);
4248         RExC_size += 1;
4249         return(ret);
4250     }
4251
4252     NODE_ALIGN_FILL(ret);
4253     ptr = ret;
4254     FILL_ADVANCE_NODE(ptr, op);
4255     if (RExC_offsets) {         /* MJD */
4256       MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4257               "reg_node", __LINE__, 
4258               reg_name[op],
4259               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4260               ? "Overwriting end of array!\n" : "OK",
4261               RExC_emit - RExC_emit_start,
4262               RExC_parse - RExC_start,
4263               RExC_offsets[0])); 
4264       Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4265     }
4266             
4267     RExC_emit = ptr;
4268
4269     return(ret);
4270 }
4271
4272 /*
4273 - reganode - emit a node with an argument
4274 */
4275 STATIC regnode *                        /* Location. */
4276 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4277 {
4278     register regnode *ret;
4279     register regnode *ptr;
4280
4281     ret = RExC_emit;
4282     if (SIZE_ONLY) {
4283         SIZE_ALIGN(RExC_size);
4284         RExC_size += 2;
4285         return(ret);
4286     }
4287
4288     NODE_ALIGN_FILL(ret);
4289     ptr = ret;
4290     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4291     if (RExC_offsets) {         /* MJD */
4292       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4293               "reganode",
4294               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4295               "Overwriting end of array!\n" : "OK",
4296               RExC_emit - RExC_emit_start,
4297               RExC_parse - RExC_start,
4298               RExC_offsets[0])); 
4299       Set_Cur_Node_Offset;
4300     }
4301             
4302     RExC_emit = ptr;
4303
4304     return(ret);
4305 }
4306
4307 /*
4308 - reguni - emit (if appropriate) a Unicode character
4309 */
4310 STATIC void
4311 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4312 {
4313     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4314 }
4315
4316 /*
4317 - reginsert - insert an operator in front of already-emitted operand
4318 *
4319 * Means relocating the operand.
4320 */
4321 STATIC void
4322 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4323 {
4324     register regnode *src;
4325     register regnode *dst;
4326     register regnode *place;
4327     register int offset = regarglen[(U8)op];
4328
4329 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4330
4331     if (SIZE_ONLY) {
4332         RExC_size += NODE_STEP_REGNODE + offset;
4333         return;
4334     }
4335
4336     src = RExC_emit;
4337     RExC_emit += NODE_STEP_REGNODE + offset;
4338     dst = RExC_emit;
4339     while (src > opnd) {
4340         StructCopy(--src, --dst, regnode);
4341         if (RExC_offsets) {     /* MJD 20010112 */
4342           MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", 
4343                   "reg_insert",
4344                   dst - RExC_emit_start > RExC_offsets[0] 
4345                   ? "Overwriting end of array!\n" : "OK",
4346                   src - RExC_emit_start,
4347                   dst - RExC_emit_start,
4348                   RExC_offsets[0])); 
4349           Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4350           Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4351         }
4352     }
4353     
4354
4355     place = opnd;               /* Op node, where operand used to be. */
4356     if (RExC_offsets) {         /* MJD */
4357       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4358               "reginsert",
4359               place - RExC_emit_start > RExC_offsets[0] 
4360               ? "Overwriting end of array!\n" : "OK",
4361               place - RExC_emit_start,
4362               RExC_parse - RExC_start,
4363               RExC_offsets[0])); 
4364       Set_Node_Offset(place, RExC_parse);
4365     }
4366     src = NEXTOPER(place);
4367     FILL_ADVANCE_NODE(place, op);
4368     Zero(src, offset, regnode);
4369 }
4370
4371 /*
4372 - regtail - set the next-pointer at the end of a node chain of p to val.
4373 */
4374 STATIC void
4375 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4376 {
4377     register regnode *scan;
4378     register regnode *temp;
4379
4380     if (SIZE_ONLY)
4381         return;
4382
4383     /* Find last node. */
4384     scan = p;
4385     for (;;) {
4386         temp = regnext(scan);
4387         if (temp == NULL)
4388             break;
4389         scan = temp;
4390     }
4391
4392     if (reg_off_by_arg[OP(scan)]) {
4393         ARG_SET(scan, val - scan);
4394     }
4395     else {
4396         NEXT_OFF(scan) = val - scan;
4397     }
4398 }
4399
4400 /*
4401 - regoptail - regtail on operand of first argument; nop if operandless
4402 */
4403 STATIC void
4404 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4405 {
4406     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4407     if (p == NULL || SIZE_ONLY)
4408         return;
4409     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4410         regtail(pRExC_state, NEXTOPER(p), val);
4411     }
4412     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4413         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4414     }
4415     else
4416         return;
4417 }
4418
4419 /*
4420  - regcurly - a little FSA that accepts {\d+,?\d*}
4421  */
4422 STATIC I32
4423 S_regcurly(pTHX_ register char *s)
4424 {
4425     if (*s++ != '{')
4426         return FALSE;
4427     if (!isDIGIT(*s))
4428         return FALSE;
4429     while (isDIGIT(*s))
4430         s++;
4431     if (*s == ',')
4432         s++;
4433     while (isDIGIT(*s))
4434         s++;
4435     if (*s != '}')
4436         return FALSE;
4437     return TRUE;
4438 }
4439
4440
4441 #ifdef DEBUGGING
4442
4443 STATIC regnode *
4444 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4445 {
4446     register U8 op = EXACT;     /* Arbitrary non-END op. */
4447     register regnode *next;
4448
4449     while (op != END && (!last || node < last)) {
4450         /* While that wasn't END last time... */
4451
4452         NODE_ALIGN(node);
4453         op = OP(node);
4454         if (op == CLOSE)
4455             l--;        
4456         next = regnext(node);
4457         /* Where, what. */
4458         if (OP(node) == OPTIMIZED)
4459             goto after_print;
4460         regprop(sv, node);
4461         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4462                       (int)(2*l + 1), "", SvPVX(sv));
4463         if (next == NULL)               /* Next ptr. */
4464             PerlIO_printf(Perl_debug_log, "(0)");
4465         else
4466             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4467         (void)PerlIO_putc(Perl_debug_log, '\n');
4468       after_print:
4469         if (PL_regkind[(U8)op] == BRANCHJ) {
4470             register regnode *nnode = (OP(next) == LONGJMP
4471                                        ? regnext(next)
4472                                        : next);
4473             if (last && nnode > last)
4474                 nnode = last;
4475             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4476         }
4477         else if (PL_regkind[(U8)op] == BRANCH) {
4478             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4479         }
4480         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4481             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4482                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4483         }
4484         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4485             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4486                              next, sv, l + 1);
4487         }
4488         else if ( op == PLUS || op == STAR) {
4489             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4490         }
4491         else if (op == ANYOF) {
4492             /* arglen 1 + class block */
4493             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4494                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4495             node = NEXTOPER(node);
4496         }
4497         else if (PL_regkind[(U8)op] == EXACT) {
4498             /* Literal string, where present. */
4499             node += NODE_SZ_STR(node) - 1;
4500             node = NEXTOPER(node);
4501         }
4502         else {
4503             node = NEXTOPER(node);
4504             node += regarglen[(U8)op];
4505         }
4506         if (op == CURLYX || op == OPEN)
4507             l++;
4508         else if (op == WHILEM)
4509             l--;
4510     }
4511     return node;
4512 }
4513
4514 #endif  /* DEBUGGING */
4515
4516 /*
4517  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4518  */
4519 void
4520 Perl_regdump(pTHX_ regexp *r)
4521 {
4522 #ifdef DEBUGGING
4523     SV *sv = sv_newmortal();
4524
4525     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4526
4527     /* Header fields of interest. */
4528     if (r->anchored_substr)
4529         PerlIO_printf(Perl_debug_log,
4530                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4531                       PL_colors[0],
4532                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4533                       SvPVX(r->anchored_substr),
4534                       PL_colors[1],
4535                       SvTAIL(r->anchored_substr) ? "$" : "",
4536                       (IV)r->anchored_offset);
4537     if (r->float_substr)
4538         PerlIO_printf(Perl_debug_log,
4539                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4540                       PL_colors[0],
4541                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4542                       SvPVX(r->float_substr),
4543                       PL_colors[1],
4544                       SvTAIL(r->float_substr) ? "$" : "",
4545                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4546     if (r->check_substr)
4547         PerlIO_printf(Perl_debug_log,
4548                       r->check_substr == r->float_substr
4549                       ? "(checking floating" : "(checking anchored");
4550     if (r->reganch & ROPT_NOSCAN)
4551         PerlIO_printf(Perl_debug_log, " noscan");
4552     if (r->reganch & ROPT_CHECK_ALL)
4553         PerlIO_printf(Perl_debug_log, " isall");
4554     if (r->check_substr)
4555         PerlIO_printf(Perl_debug_log, ") ");
4556
4557     if (r->regstclass) {
4558         regprop(sv, r->regstclass);
4559         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4560     }
4561     if (r->reganch & ROPT_ANCH) {
4562         PerlIO_printf(Perl_debug_log, "anchored");
4563         if (r->reganch & ROPT_ANCH_BOL)
4564             PerlIO_printf(Perl_debug_log, "(BOL)");
4565         if (r->reganch & ROPT_ANCH_MBOL)
4566             PerlIO_printf(Perl_debug_log, "(MBOL)");
4567         if (r->reganch & ROPT_ANCH_SBOL)
4568             PerlIO_printf(Perl_debug_log, "(SBOL)");
4569         if (r->reganch & ROPT_ANCH_GPOS)
4570             PerlIO_printf(Perl_debug_log, "(GPOS)");
4571         PerlIO_putc(Perl_debug_log, ' ');
4572     }
4573     if (r->reganch & ROPT_GPOS_SEEN)
4574         PerlIO_printf(Perl_debug_log, "GPOS ");
4575     if (r->reganch & ROPT_SKIP)
4576         PerlIO_printf(Perl_debug_log, "plus ");
4577     if (r->reganch & ROPT_IMPLICIT)
4578         PerlIO_printf(Perl_debug_log, "implicit ");
4579     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4580     if (r->reganch & ROPT_EVAL_SEEN)
4581         PerlIO_printf(Perl_debug_log, "with eval ");
4582     PerlIO_printf(Perl_debug_log, "\n");
4583     if (r->offsets) {
4584       U32 i;
4585       U32 len = r->offsets[0];
4586       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4587       for (i = 1; i <= len; i++)
4588         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4589                       (UV)r->offsets[i*2-1], 
4590                       (UV)r->offsets[i*2]);
4591       PerlIO_printf(Perl_debug_log, "\n");
4592     }
4593 #endif  /* DEBUGGING */
4594 }
4595
4596 #ifdef DEBUGGING
4597
4598 STATIC void
4599 S_put_byte(pTHX_ SV *sv, int c)
4600 {
4601     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4602         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4603     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4604         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4605     else
4606         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4607 }
4608
4609 #endif  /* DEBUGGING */
4610
4611 /*
4612 - regprop - printable representation of opcode
4613 */
4614 void
4615 Perl_regprop(pTHX_ SV *sv, regnode *o)
4616 {
4617 #ifdef DEBUGGING
4618     register int k;
4619
4620     sv_setpvn(sv, "", 0);
4621     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4622         /* It would be nice to FAIL() here, but this may be called from
4623            regexec.c, and it would be hard to supply pRExC_state. */
4624         Perl_croak(aTHX_ "Corrupted regexp opcode");
4625     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4626
4627     k = PL_regkind[(U8)OP(o)];
4628
4629     if (k == EXACT) {
4630         SV *dsv = sv_2mortal(newSVpvn("", 0));
4631         /* Using is_utf8_string() is a crude hack but it may
4632          * be the best for now since we have no flag "this EXACTish
4633          * node was UTF-8" --jhi */
4634         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4635         char *s    = do_utf8 ?
4636           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4637                          UNI_DISPLAY_REGEX) :
4638           STRING(o);
4639         int len = do_utf8 ?
4640           strlen(s) :
4641           STR_LEN(o);
4642         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4643                        PL_colors[0],
4644                        len, s,
4645                        PL_colors[1]);
4646     }
4647     else if (k == CURLY) {
4648         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4649             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4650         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4651     }
4652     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4653         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4654     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4655         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4656     else if (k == LOGICAL)
4657         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4658     else if (k == ANYOF) {
4659         int i, rangestart = -1;
4660         U8 flags = ANYOF_FLAGS(o);
4661         const char * const anyofs[] = { /* Should be syncronized with
4662                                          * ANYOF_ #xdefines in regcomp.h */
4663             "\\w",
4664             "\\W",
4665             "\\s",
4666             "\\S",
4667             "\\d",
4668             "\\D",
4669             "[:alnum:]",
4670             "[:^alnum:]",
4671             "[:alpha:]",
4672             "[:^alpha:]",
4673             "[:ascii:]",
4674             "[:^ascii:]",
4675             "[:ctrl:]",
4676             "[:^ctrl:]",
4677             "[:graph:]",
4678             "[:^graph:]",
4679             "[:lower:]",
4680             "[:^lower:]",
4681             "[:print:]",
4682             "[:^print:]",
4683             "[:punct:]",
4684             "[:^punct:]",
4685             "[:upper:]",
4686             "[:^upper:]",
4687             "[:xdigit:]",
4688             "[:^xdigit:]",
4689             "[:space:]",
4690             "[:^space:]",
4691             "[:blank:]",
4692             "[:^blank:]"
4693         };
4694
4695         if (flags & ANYOF_LOCALE)
4696             sv_catpv(sv, "{loc}");
4697         if (flags & ANYOF_FOLD)
4698             sv_catpv(sv, "{i}");
4699         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4700         if (flags & ANYOF_INVERT)
4701             sv_catpv(sv, "^");
4702         for (i = 0; i <= 256; i++) {
4703             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4704                 if (rangestart == -1)
4705                     rangestart = i;
4706             } else if (rangestart != -1) {
4707                 if (i <= rangestart + 3)
4708                     for (; rangestart < i; rangestart++)
4709                         put_byte(sv, rangestart);
4710                 else {
4711                     put_byte(sv, rangestart);
4712                     sv_catpv(sv, "-");
4713                     put_byte(sv, i - 1);
4714                 }
4715                 rangestart = -1;
4716             }
4717         }
4718
4719         if (o->flags & ANYOF_CLASS)
4720             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4721                 if (ANYOF_CLASS_TEST(o,i))
4722                     sv_catpv(sv, anyofs[i]);
4723
4724         if (flags & ANYOF_UNICODE)
4725             sv_catpv(sv, "{unicode}");
4726         else if (flags & ANYOF_UNICODE_ALL)
4727             sv_catpv(sv, "{unicode_all}");
4728
4729         {
4730             SV *lv;
4731             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4732         
4733             if (lv) {
4734                 if (sw) {
4735                     UV i;
4736                     U8 s[UTF8_MAXLEN+1];
4737                 
4738                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4739                         U8 *e = uvchr_to_utf8(s, i);
4740                         
4741                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4742                             if (rangestart == -1)
4743                                 rangestart = i;
4744                         } else if (rangestart != -1) {
4745                             U8 *p;
4746                         
4747                             if (i <= rangestart + 3)
4748                                 for (; rangestart < i; rangestart++) {
4749                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4750                                         put_byte(sv, *p);
4751                                 }
4752                             else {
4753                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4754                                     put_byte(sv, *p);
4755                                 sv_catpv(sv, "-");
4756                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4757                                         put_byte(sv, *p);
4758                                 }
4759                                 rangestart = -1;
4760                             }
4761                         }
4762                         
4763                     sv_catpv(sv, "..."); /* et cetera */
4764                 }
4765
4766                 {
4767                     char *s = savepv(SvPVX(lv));
4768                     char *origs = s;
4769                 
4770                     while(*s && *s != '\n') s++;
4771                 
4772                     if (*s == '\n') {
4773                         char *t = ++s;
4774                         
4775                         while (*s) {
4776                             if (*s == '\n')
4777                                 *s = ' ';
4778                             s++;
4779                         }
4780                         if (s[-1] == ' ')
4781                             s[-1] = 0;
4782                         
4783                         sv_catpv(sv, t);
4784                     }
4785                 
4786                     Safefree(origs);
4787                 }
4788             }
4789         }
4790
4791         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4792     }
4793     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4794         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4795 #endif  /* DEBUGGING */
4796 }
4797
4798 SV *
4799 Perl_re_intuit_string(pTHX_ regexp *prog)
4800 {                               /* Assume that RE_INTUIT is set */
4801     DEBUG_r(
4802         {   STRLEN n_a;
4803             char *s = SvPV(prog->check_substr,n_a);
4804
4805             if (!PL_colorset) reginitcolors();
4806             PerlIO_printf(Perl_debug_log,
4807                       "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4808                       PL_colors[4],PL_colors[5],PL_colors[0],
4809                       s,
4810                       PL_colors[1],
4811                       (strlen(s) > 60 ? "..." : ""));
4812         } );
4813
4814     return prog->check_substr;
4815 }
4816
4817 void
4818 Perl_pregfree(pTHX_ struct regexp *r)
4819 {
4820 #ifdef DEBUGGING
4821     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4822 #endif
4823
4824     if (!r || (--r->refcnt > 0))
4825         return;
4826     DEBUG_r({
4827          char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4828                                   UNI_DISPLAY_REGEX);
4829          int len = SvCUR(dsv);
4830          if (!PL_colorset)
4831               reginitcolors();
4832          PerlIO_printf(Perl_debug_log,
4833                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4834                        PL_colors[4],PL_colors[5],PL_colors[0],
4835                        len, len, s,
4836                        PL_colors[1],
4837                        len > 60 ? "..." : "");
4838     });
4839
4840     if (r->precomp)
4841         Safefree(r->precomp);
4842     if (r->offsets)             /* 20010421 MJD */
4843         Safefree(r->offsets);
4844     if (RX_MATCH_COPIED(r))
4845         Safefree(r->subbeg);
4846     if (r->substrs) {
4847         if (r->anchored_substr)
4848             SvREFCNT_dec(r->anchored_substr);
4849         if (r->float_substr)
4850             SvREFCNT_dec(r->float_substr);
4851         Safefree(r->substrs);
4852     }
4853     if (r->data) {
4854         int n = r->data->count;
4855         AV* new_comppad = NULL;
4856         AV* old_comppad;
4857         SV** old_curpad;
4858
4859         while (--n >= 0) {
4860           /* If you add a ->what type here, update the comment in regcomp.h */
4861             switch (r->data->what[n]) {
4862             case 's':
4863                 SvREFCNT_dec((SV*)r->data->data[n]);
4864                 break;
4865             case 'f':
4866                 Safefree(r->data->data[n]);
4867                 break;
4868             case 'p':
4869                 new_comppad = (AV*)r->data->data[n];
4870                 break;
4871             case 'o':
4872                 if (new_comppad == NULL)
4873                     Perl_croak(aTHX_ "panic: pregfree comppad");
4874                 old_comppad = PL_comppad;
4875                 old_curpad = PL_curpad;
4876                 /* Watch out for global destruction's random ordering. */
4877                 if (SvTYPE(new_comppad) == SVt_PVAV) {
4878                     PL_comppad = new_comppad;
4879                     PL_curpad = AvARRAY(new_comppad);
4880                 }
4881                 else
4882                     PL_curpad = NULL;
4883
4884                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4885                     op_free((OP_4tree*)r->data->data[n]);
4886                 }
4887
4888                 PL_comppad = old_comppad;
4889                 PL_curpad = old_curpad;
4890                 SvREFCNT_dec((SV*)new_comppad);
4891                 new_comppad = NULL;
4892                 break;
4893             case 'n':
4894                 break;
4895             default:
4896                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4897             }
4898         }
4899         Safefree(r->data->what);
4900         Safefree(r->data);
4901     }
4902     Safefree(r->startp);
4903     Safefree(r->endp);
4904     Safefree(r);
4905 }
4906
4907 /*
4908  - regnext - dig the "next" pointer out of a node
4909  *
4910  * [Note, when REGALIGN is defined there are two places in regmatch()
4911  * that bypass this code for speed.]
4912  */
4913 regnode *
4914 Perl_regnext(pTHX_ register regnode *p)
4915 {
4916     register I32 offset;
4917
4918     if (p == &PL_regdummy)
4919         return(NULL);
4920
4921     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4922     if (offset == 0)
4923         return(NULL);
4924
4925     return(p+offset);
4926 }
4927
4928 STATIC void     
4929 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4930 {
4931     va_list args;
4932     STRLEN l1 = strlen(pat1);
4933     STRLEN l2 = strlen(pat2);
4934     char buf[512];
4935     SV *msv;
4936     char *message;
4937
4938     if (l1 > 510)
4939         l1 = 510;
4940     if (l1 + l2 > 510)
4941         l2 = 510 - l1;
4942     Copy(pat1, buf, l1 , char);
4943     Copy(pat2, buf + l1, l2 , char);
4944     buf[l1 + l2] = '\n';
4945     buf[l1 + l2 + 1] = '\0';
4946 #ifdef I_STDARG
4947     /* ANSI variant takes additional second argument */
4948     va_start(args, pat2);
4949 #else
4950     va_start(args);
4951 #endif
4952     msv = vmess(buf, &args);
4953     va_end(args);
4954     message = SvPV(msv,l1);
4955     if (l1 > 512)
4956         l1 = 512;
4957     Copy(message, buf, l1 , char);
4958     buf[l1] = '\0';                     /* Overwrite \n */
4959     Perl_croak(aTHX_ "%s", buf);
4960 }
4961
4962 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
4963
4964 void
4965 Perl_save_re_context(pTHX)
4966 {
4967 #if 0
4968     SAVEPPTR(RExC_precomp);             /* uncompiled string. */
4969     SAVEI32(RExC_npar);         /* () count. */
4970     SAVEI32(RExC_size);         /* Code size. */
4971     SAVEI16(RExC_flags16);              /* are we folding, multilining? */
4972     SAVEVPTR(RExC_rx);          /* from regcomp.c */
4973     SAVEI32(RExC_seen);         /* from regcomp.c */
4974     SAVEI32(RExC_sawback);              /* Did we see \1, ...? */
4975     SAVEI32(RExC_naughty);              /* How bad is this pattern? */
4976     SAVEVPTR(RExC_emit);                /* Code-emit pointer; &regdummy = don't */
4977     SAVEPPTR(RExC_end);         /* End of input for compile */
4978     SAVEPPTR(RExC_parse);               /* Input-scan pointer. */
4979 #endif
4980
4981     SAVEI32(PL_reg_flags);              /* from regexec.c */
4982     SAVEPPTR(PL_bostr);
4983     SAVEPPTR(PL_reginput);              /* String-input pointer. */
4984     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
4985     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
4986     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
4987     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
4988     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
4989     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
4990     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
4991     PL_reg_start_tmp = 0;
4992     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
4993     PL_reg_start_tmpl = 0;
4994     SAVEVPTR(PL_regdata);
4995     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
4996     SAVEI32(PL_regnarrate);             /* from regexec.c */
4997     SAVEVPTR(PL_regprogram);            /* from regexec.c */
4998     SAVEINT(PL_regindent);              /* from regexec.c */
4999     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5000     SAVEVPTR(PL_curcop);
5001     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5002     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5003     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5004     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5005     SAVEI8(PL_reg_match_utf8);          /* from regexec.c */
5006     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5007     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5008     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5009     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5010     SAVEI32(PL_regnpar);                /* () count. */
5011     SAVEI32(PL_regsize);                /* from regexec.c */
5012 #ifdef DEBUGGING
5013     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5014 #endif
5015 }
5016
5017 static void
5018 clear_re(pTHX_ void *r)
5019 {
5020     ReREFCNT_dec((regexp *)r);
5021 }
5022