Signedness nits.
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2003, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U32         flags;                  /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags      (pRExC_state->flags)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg) STMT_START {                                          \
260     char *ellipses = "";                                                \
261     IV len = RExC_end - RExC_precomp;                                   \
262                                                                         \
263     if (!SIZE_ONLY)                                                     \
264         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
265     if (len > RegexLengthToShowInErrorMessages) {                       \
266         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
267         len = RegexLengthToShowInErrorMessages - 10;                    \
268         ellipses = "...";                                               \
269     }                                                                   \
270     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
271             msg, (int)len, RExC_precomp, ellipses);                     \
272 } STMT_END
273
274 /*
275  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276  * args. Show regex, up to a maximum length. If it's too long, chop and add
277  * "...".
278  */
279 #define FAIL2(pat,msg) STMT_START {                                     \
280     char *ellipses = "";                                                \
281     IV len = RExC_end - RExC_precomp;                                   \
282                                                                         \
283     if (!SIZE_ONLY)                                                     \
284         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
285     if (len > RegexLengthToShowInErrorMessages) {                       \
286         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
287         len = RegexLengthToShowInErrorMessages - 10;                    \
288         ellipses = "...";                                               \
289     }                                                                   \
290     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
291             msg, (int)len, RExC_precomp, ellipses);                     \
292 } STMT_END
293
294
295 /*
296  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
297  */
298 #define Simple_vFAIL(m) STMT_START {                                    \
299     IV offset = RExC_parse - RExC_precomp;                              \
300     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
301             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
302 } STMT_END
303
304 /*
305  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
306  */
307 #define vFAIL(m) STMT_START {                           \
308     if (!SIZE_ONLY)                                     \
309         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
310     Simple_vFAIL(m);                                    \
311 } STMT_END
312
313 /*
314  * Like Simple_vFAIL(), but accepts two arguments.
315  */
316 #define Simple_vFAIL2(m,a1) STMT_START {                        \
317     IV offset = RExC_parse - RExC_precomp;                      \
318     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
319             (int)offset, RExC_precomp, RExC_precomp + offset);  \
320 } STMT_END
321
322 /*
323  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
324  */
325 #define vFAIL2(m,a1) STMT_START {                       \
326     if (!SIZE_ONLY)                                     \
327         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
328     Simple_vFAIL2(m, a1);                               \
329 } STMT_END
330
331
332 /*
333  * Like Simple_vFAIL(), but accepts three arguments.
334  */
335 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
336     IV offset = RExC_parse - RExC_precomp;                      \
337     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
338             (int)offset, RExC_precomp, RExC_precomp + offset);  \
339 } STMT_END
340
341 /*
342  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
343  */
344 #define vFAIL3(m,a1,a2) STMT_START {                    \
345     if (!SIZE_ONLY)                                     \
346         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
347     Simple_vFAIL3(m, a1, a2);                           \
348 } STMT_END
349
350 /*
351  * Like Simple_vFAIL(), but accepts four arguments.
352  */
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
354     IV offset = RExC_parse - RExC_precomp;                      \
355     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
356             (int)offset, RExC_precomp, RExC_precomp + offset);  \
357 } STMT_END
358
359 /*
360  * Like Simple_vFAIL(), but accepts five arguments.
361  */
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
363     IV offset = RExC_parse - RExC_precomp;                      \
364     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
365             (int)offset, RExC_precomp, RExC_precomp + offset);  \
366 } STMT_END
367
368
369 #define vWARN(loc,m) STMT_START {                                       \
370     IV offset = loc - RExC_precomp;                                     \
371     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
372             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
373 } STMT_END
374
375 #define vWARNdep(loc,m) STMT_START {                                    \
376     IV offset = loc - RExC_precomp;                                     \
377     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
378             "%s" REPORT_LOCATION,                                       \
379             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
380 } STMT_END
381
382
383 #define vWARN2(loc, m, a1) STMT_START {                                 \
384     IV offset = loc - RExC_precomp;                                     \
385     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
386             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
387 } STMT_END
388
389 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
390     IV offset = loc - RExC_precomp;                                     \
391     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
392             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
393 } STMT_END
394
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
396     IV offset = loc - RExC_precomp;                                     \
397     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
398             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399 } STMT_END
400
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
402     IV offset = loc - RExC_precomp;                                     \
403     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
404             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405 } STMT_END
406
407
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START {                  \
410     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411 } STMT_END
412
413 /* Macros for recording node offsets.   20001227 mjd@plover.com 
414  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
415  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
416  * Element 0 holds the number n.
417  */
418
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
424     if (! SIZE_ONLY) {                                                  \
425         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
426                 __LINE__, (node), (byte)));                             \
427         if((node) < 0) {                                                \
428             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429         } else {                                                        \
430             RExC_offsets[2*(node)-1] = (byte);                          \
431         }                                                               \
432     }                                                                   \
433 } STMT_END
434
435 #define Set_Node_Offset(node,byte) \
436     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
440     if (! SIZE_ONLY) {                                                  \
441         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
442                 __LINE__, (node), (len)));                              \
443         if((node) < 0) {                                                \
444             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445         } else {                                                        \
446             RExC_offsets[2*(node)] = (len);                             \
447         }                                                               \
448     }                                                                   \
449 } STMT_END
450
451 #define Set_Node_Length(node,len) \
452     Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455     Set_Node_Length(node, RExC_parse - parse_start)
456
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
461 static void clear_re(pTHX_ void *r);
462
463 /* Mark that we cannot extend a found fixed substring at this point.
464    Updata the longest found anchored substring and the longest found
465    floating substrings if needed. */
466
467 STATIC void
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
469 {
470     STRLEN l = CHR_SVLEN(data->last_found);
471     STRLEN old_l = CHR_SVLEN(*data->longest);
472
473     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474         sv_setsv(*data->longest, data->last_found);
475         if (*data->longest == data->longest_fixed) {
476             data->offset_fixed = l ? data->last_start_min : data->pos_min;
477             if (data->flags & SF_BEFORE_EOL)
478                 data->flags
479                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480             else
481                 data->flags &= ~SF_FIX_BEFORE_EOL;
482         }
483         else {
484             data->offset_float_min = l ? data->last_start_min : data->pos_min;
485             data->offset_float_max = (l
486                                       ? data->last_start_max
487                                       : data->pos_min + data->pos_delta);
488             if ((U32)data->offset_float_max > (U32)I32_MAX)
489                 data->offset_float_max = I32_MAX;
490             if (data->flags & SF_BEFORE_EOL)
491                 data->flags
492                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493             else
494                 data->flags &= ~SF_FL_BEFORE_EOL;
495         }
496     }
497     SvCUR_set(data->last_found, 0);
498     {
499         SV * sv = data->last_found;
500         MAGIC *mg =
501             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502         if (mg && mg->mg_len > 0)
503             mg->mg_len = 0;
504     }
505     data->last_end = -1;
506     data->flags &= ~SF_BEFORE_EOL;
507 }
508
509 /* Can match anything (initialization) */
510 STATIC void
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
512 {
513     ANYOF_CLASS_ZERO(cl);
514     ANYOF_BITMAP_SETALL(cl);
515     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
516     if (LOC)
517         cl->flags |= ANYOF_LOCALE;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC int
522 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
523 {
524     int value;
525
526     for (value = 0; value <= ANYOF_MAX; value += 2)
527         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528             return 1;
529     if (!(cl->flags & ANYOF_UNICODE_ALL))
530         return 0;
531     if (!ANYOF_BITMAP_TESTALLSET(cl))
532         return 0;
533     return 1;
534 }
535
536 /* Can match anything (initialization) */
537 STATIC void
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
539 {
540     Zero(cl, 1, struct regnode_charclass_class);
541     cl->type = ANYOF;
542     cl_anything(pRExC_state, cl);
543 }
544
545 STATIC void
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
547 {
548     Zero(cl, 1, struct regnode_charclass_class);
549     cl->type = ANYOF;
550     cl_anything(pRExC_state, cl);
551     if (LOC)
552         cl->flags |= ANYOF_LOCALE;
553 }
554
555 /* 'And' a given class with another one.  Can create false positives */
556 /* We assume that cl is not inverted */
557 STATIC void
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559          struct regnode_charclass_class *and_with)
560 {
561     if (!(and_with->flags & ANYOF_CLASS)
562         && !(cl->flags & ANYOF_CLASS)
563         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564         && !(and_with->flags & ANYOF_FOLD)
565         && !(cl->flags & ANYOF_FOLD)) {
566         int i;
567
568         if (and_with->flags & ANYOF_INVERT)
569             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570                 cl->bitmap[i] &= ~and_with->bitmap[i];
571         else
572             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573                 cl->bitmap[i] &= and_with->bitmap[i];
574     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575     if (!(and_with->flags & ANYOF_EOS))
576         cl->flags &= ~ANYOF_EOS;
577
578     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
579         cl->flags &= ~ANYOF_UNICODE_ALL;
580         cl->flags |= ANYOF_UNICODE;
581         ARG_SET(cl, ARG(and_with));
582     }
583     if (!(and_with->flags & ANYOF_UNICODE_ALL))
584         cl->flags &= ~ANYOF_UNICODE_ALL;
585     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
586         cl->flags &= ~ANYOF_UNICODE;
587 }
588
589 /* 'OR' a given class with another one.  Can create false positives */
590 /* We assume that cl is not inverted */
591 STATIC void
592 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
593 {
594     if (or_with->flags & ANYOF_INVERT) {
595         /* We do not use
596          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
597          *   <= (B1 | !B2) | (CL1 | !CL2)
598          * which is wasteful if CL2 is small, but we ignore CL2:
599          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
600          * XXXX Can we handle case-fold?  Unclear:
601          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
602          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
603          */
604         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605              && !(or_with->flags & ANYOF_FOLD)
606              && !(cl->flags & ANYOF_FOLD) ) {
607             int i;
608
609             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
610                 cl->bitmap[i] |= ~or_with->bitmap[i];
611         } /* XXXX: logic is complicated otherwise */
612         else {
613             cl_anything(pRExC_state, cl);
614         }
615     } else {
616         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
617         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618              && (!(or_with->flags & ANYOF_FOLD)
619                  || (cl->flags & ANYOF_FOLD)) ) {
620             int i;
621
622             /* OR char bitmap and class bitmap separately */
623             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624                 cl->bitmap[i] |= or_with->bitmap[i];
625             if (or_with->flags & ANYOF_CLASS) {
626                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
627                     cl->classflags[i] |= or_with->classflags[i];
628                 cl->flags |= ANYOF_CLASS;
629             }
630         }
631         else { /* XXXX: logic is complicated, leave it along for a moment. */
632             cl_anything(pRExC_state, cl);
633         }
634     }
635     if (or_with->flags & ANYOF_EOS)
636         cl->flags |= ANYOF_EOS;
637
638     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
639         ARG(cl) != ARG(or_with)) {
640         cl->flags |= ANYOF_UNICODE_ALL;
641         cl->flags &= ~ANYOF_UNICODE;
642     }
643     if (or_with->flags & ANYOF_UNICODE_ALL) {
644         cl->flags |= ANYOF_UNICODE_ALL;
645         cl->flags &= ~ANYOF_UNICODE;
646     }
647 }
648
649 /*
650  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
651  * These need to be revisited when a newer toolchain becomes available.
652  */
653 #if defined(__sparc64__) && defined(__GNUC__)
654 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
655 #       undef  SPARC64_GCC_WORKAROUND
656 #       define SPARC64_GCC_WORKAROUND 1
657 #   endif
658 #endif
659
660 /* REx optimizer.  Converts nodes into quickier variants "in place".
661    Finds fixed substrings.  */
662
663 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
664    to the position after last scanned or to NULL. */
665
666 STATIC I32
667 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
668                         /* scanp: Start here (read-write). */
669                         /* deltap: Write maxlen-minlen here. */
670                         /* last: Stop before this one. */
671 {
672     I32 min = 0, pars = 0, code;
673     regnode *scan = *scanp, *next;
674     I32 delta = 0;
675     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
676     int is_inf_internal = 0;            /* The studied chunk is infinite */
677     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
678     scan_data_t data_fake;
679     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
680
681     while (scan && OP(scan) != END && scan < last) {
682         /* Peephole optimizer: */
683
684         if (PL_regkind[(U8)OP(scan)] == EXACT) {
685             /* Merge several consecutive EXACTish nodes into one. */
686             regnode *n = regnext(scan);
687             U32 stringok = 1;
688 #ifdef DEBUGGING
689             regnode *stop = scan;
690 #endif
691
692             next = scan + NODE_SZ_STR(scan);
693             /* Skip NOTHING, merge EXACT*. */
694             while (n &&
695                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
696                      (stringok && (OP(n) == OP(scan))))
697                    && NEXT_OFF(n)
698                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
699                 if (OP(n) == TAIL || n > next)
700                     stringok = 0;
701                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
702                     NEXT_OFF(scan) += NEXT_OFF(n);
703                     next = n + NODE_STEP_REGNODE;
704 #ifdef DEBUGGING
705                     if (stringok)
706                         stop = n;
707 #endif
708                     n = regnext(n);
709                 }
710                 else if (stringok) {
711                     int oldl = STR_LEN(scan);
712                     regnode *nnext = regnext(n);
713
714                     if (oldl + STR_LEN(n) > U8_MAX)
715                         break;
716                     NEXT_OFF(scan) += NEXT_OFF(n);
717                     STR_LEN(scan) += STR_LEN(n);
718                     next = n + NODE_SZ_STR(n);
719                     /* Now we can overwrite *n : */
720                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
721 #ifdef DEBUGGING
722                     stop = next - 1;
723 #endif
724                     n = nnext;
725                 }
726             }
727
728             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
729 /*
730   Two problematic code points in Unicode casefolding of EXACT nodes:
731
732    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
733    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
734
735    which casefold to
736
737    Unicode                      UTF-8
738
739    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
740    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
741
742    This means that in case-insensitive matching (or "loose matching",
743    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
744    length of the above casefolded versions) can match a target string
745    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
746    This would rather mess up the minimum length computation.
747
748    What we'll do is to look for the tail four bytes, and then peek
749    at the preceding two bytes to see whether we need to decrease
750    the minimum length by four (six minus two).
751
752    Thanks to the design of UTF-8, there cannot be false matches:
753    A sequence of valid UTF-8 bytes cannot be a subsequence of
754    another valid sequence of UTF-8 bytes.
755
756 */
757                  char *s0 = STRING(scan), *s, *t;
758                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
759                  char *t0 = "\xcc\x88\xcc\x81";
760                  char *t1 = t0 + 3;
761                  
762                  for (s = s0 + 2;
763                       s < s2 && (t = ninstr(s, s1, t0, t1));
764                       s = t + 4) {
765                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
766                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
767                            min -= 4;
768                  }
769             }
770
771 #ifdef DEBUGGING
772             /* Allow dumping */
773             n = scan + NODE_SZ_STR(scan);
774             while (n <= stop) {
775                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
776                     OP(n) = OPTIMIZED;
777                     NEXT_OFF(n) = 0;
778                 }
779                 n++;
780             }
781 #endif
782         }
783         /* Follow the next-chain of the current node and optimize
784            away all the NOTHINGs from it.  */
785         if (OP(scan) != CURLYX) {
786             int max = (reg_off_by_arg[OP(scan)]
787                        ? I32_MAX
788                        /* I32 may be smaller than U16 on CRAYs! */
789                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
790             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
791             int noff;
792             regnode *n = scan;
793         
794             /* Skip NOTHING and LONGJMP. */
795             while ((n = regnext(n))
796                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
797                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
798                    && off + noff < max)
799                 off += noff;
800             if (reg_off_by_arg[OP(scan)])
801                 ARG(scan) = off;
802             else
803                 NEXT_OFF(scan) = off;
804         }
805         /* The principal pseudo-switch.  Cannot be a switch, since we
806            look into several different things.  */
807         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
808                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
809             next = regnext(scan);
810             code = OP(scan);
811         
812             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
813                 I32 max1 = 0, min1 = I32_MAX, num = 0;
814                 struct regnode_charclass_class accum;
815                 
816                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
817                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
818                 if (flags & SCF_DO_STCLASS)
819                     cl_init_zero(pRExC_state, &accum);
820                 while (OP(scan) == code) {
821                     I32 deltanext, minnext, f = 0, fake;
822                     struct regnode_charclass_class this_class;
823
824                     num++;
825                     data_fake.flags = 0;
826                     if (data) {         
827                         data_fake.whilem_c = data->whilem_c;
828                         data_fake.last_closep = data->last_closep;
829                     }
830                     else
831                         data_fake.last_closep = &fake;
832                     next = regnext(scan);
833                     scan = NEXTOPER(scan);
834                     if (code != BRANCH)
835                         scan = NEXTOPER(scan);
836                     if (flags & SCF_DO_STCLASS) {
837                         cl_init(pRExC_state, &this_class);
838                         data_fake.start_class = &this_class;
839                         f = SCF_DO_STCLASS_AND;
840                     }           
841                     if (flags & SCF_WHILEM_VISITED_POS)
842                         f |= SCF_WHILEM_VISITED_POS;
843                     /* we suppose the run is continuous, last=next...*/
844                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
845                                           next, &data_fake, f);
846                     if (min1 > minnext)
847                         min1 = minnext;
848                     if (max1 < minnext + deltanext)
849                         max1 = minnext + deltanext;
850                     if (deltanext == I32_MAX)
851                         is_inf = is_inf_internal = 1;
852                     scan = next;
853                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
854                         pars++;
855                     if (data && (data_fake.flags & SF_HAS_EVAL))
856                         data->flags |= SF_HAS_EVAL;
857                     if (data)
858                         data->whilem_c = data_fake.whilem_c;
859                     if (flags & SCF_DO_STCLASS)
860                         cl_or(pRExC_state, &accum, &this_class);
861                     if (code == SUSPEND)
862                         break;
863                 }
864                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
865                     min1 = 0;
866                 if (flags & SCF_DO_SUBSTR) {
867                     data->pos_min += min1;
868                     data->pos_delta += max1 - min1;
869                     if (max1 != min1 || is_inf)
870                         data->longest = &(data->longest_float);
871                 }
872                 min += min1;
873                 delta += max1 - min1;
874                 if (flags & SCF_DO_STCLASS_OR) {
875                     cl_or(pRExC_state, data->start_class, &accum);
876                     if (min1) {
877                         cl_and(data->start_class, &and_with);
878                         flags &= ~SCF_DO_STCLASS;
879                     }
880                 }
881                 else if (flags & SCF_DO_STCLASS_AND) {
882                     if (min1) {
883                         cl_and(data->start_class, &accum);
884                         flags &= ~SCF_DO_STCLASS;
885                     }
886                     else {
887                         /* Switch to OR mode: cache the old value of
888                          * data->start_class */
889                         StructCopy(data->start_class, &and_with,
890                                    struct regnode_charclass_class);
891                         flags &= ~SCF_DO_STCLASS_AND;
892                         StructCopy(&accum, data->start_class,
893                                    struct regnode_charclass_class);
894                         flags |= SCF_DO_STCLASS_OR;
895                         data->start_class->flags |= ANYOF_EOS;
896                     }
897                 }
898             }
899             else if (code == BRANCHJ)   /* single branch is optimized. */
900                 scan = NEXTOPER(NEXTOPER(scan));
901             else                        /* single branch is optimized. */
902                 scan = NEXTOPER(scan);
903             continue;
904         }
905         else if (OP(scan) == EXACT) {
906             I32 l = STR_LEN(scan);
907             UV uc = *((U8*)STRING(scan));
908             if (UTF) {
909                 U8 *s = (U8*)STRING(scan);
910                 l = utf8_length(s, s + l);
911                 uc = utf8_to_uvchr(s, NULL);
912             }
913             min += l;
914             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
915                 /* The code below prefers earlier match for fixed
916                    offset, later match for variable offset.  */
917                 if (data->last_end == -1) { /* Update the start info. */
918                     data->last_start_min = data->pos_min;
919                     data->last_start_max = is_inf
920                         ? I32_MAX : data->pos_min + data->pos_delta;
921                 }
922                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
923                 {
924                     SV * sv = data->last_found;
925                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
926                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
927                     if (mg && mg->mg_len >= 0)
928                         mg->mg_len += utf8_length((U8*)STRING(scan),
929                                                   (U8*)STRING(scan)+STR_LEN(scan));
930                 }
931                 if (UTF)
932                     SvUTF8_on(data->last_found);
933                 data->last_end = data->pos_min + l;
934                 data->pos_min += l; /* As in the first entry. */
935                 data->flags &= ~SF_BEFORE_EOL;
936             }
937             if (flags & SCF_DO_STCLASS_AND) {
938                 /* Check whether it is compatible with what we know already! */
939                 int compat = 1;
940
941                 if (uc >= 0x100 ||
942                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
943                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
944                     && (!(data->start_class->flags & ANYOF_FOLD)
945                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
946                     )
947                     compat = 0;
948                 ANYOF_CLASS_ZERO(data->start_class);
949                 ANYOF_BITMAP_ZERO(data->start_class);
950                 if (compat)
951                     ANYOF_BITMAP_SET(data->start_class, uc);
952                 data->start_class->flags &= ~ANYOF_EOS;
953                 if (uc < 0x100)
954                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
955             }
956             else if (flags & SCF_DO_STCLASS_OR) {
957                 /* false positive possible if the class is case-folded */
958                 if (uc < 0x100)
959                     ANYOF_BITMAP_SET(data->start_class, uc);
960                 else
961                     data->start_class->flags |= ANYOF_UNICODE_ALL;
962                 data->start_class->flags &= ~ANYOF_EOS;
963                 cl_and(data->start_class, &and_with);
964             }
965             flags &= ~SCF_DO_STCLASS;
966         }
967         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
968             I32 l = STR_LEN(scan);
969             UV uc = *((U8*)STRING(scan));
970
971             /* Search for fixed substrings supports EXACT only. */
972             if (flags & SCF_DO_SUBSTR)
973                 scan_commit(pRExC_state, data);
974             if (UTF) {
975                 U8 *s = (U8 *)STRING(scan);
976                 l = utf8_length(s, s + l);
977                 uc = utf8_to_uvchr(s, NULL);
978             }
979             min += l;
980             if (data && (flags & SCF_DO_SUBSTR))
981                 data->pos_min += l;
982             if (flags & SCF_DO_STCLASS_AND) {
983                 /* Check whether it is compatible with what we know already! */
984                 int compat = 1;
985
986                 if (uc >= 0x100 ||
987                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
988                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
989                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
990                     compat = 0;
991                 ANYOF_CLASS_ZERO(data->start_class);
992                 ANYOF_BITMAP_ZERO(data->start_class);
993                 if (compat) {
994                     ANYOF_BITMAP_SET(data->start_class, uc);
995                     data->start_class->flags &= ~ANYOF_EOS;
996                     data->start_class->flags |= ANYOF_FOLD;
997                     if (OP(scan) == EXACTFL)
998                         data->start_class->flags |= ANYOF_LOCALE;
999                 }
1000             }
1001             else if (flags & SCF_DO_STCLASS_OR) {
1002                 if (data->start_class->flags & ANYOF_FOLD) {
1003                     /* false positive possible if the class is case-folded.
1004                        Assume that the locale settings are the same... */
1005                     if (uc < 0x100)
1006                         ANYOF_BITMAP_SET(data->start_class, uc);
1007                     data->start_class->flags &= ~ANYOF_EOS;
1008                 }
1009                 cl_and(data->start_class, &and_with);
1010             }
1011             flags &= ~SCF_DO_STCLASS;
1012         }
1013         else if (strchr((char*)PL_varies,OP(scan))) {
1014             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1015             I32 f = flags, pos_before = 0;
1016             regnode *oscan = scan;
1017             struct regnode_charclass_class this_class;
1018             struct regnode_charclass_class *oclass = NULL;
1019             I32 next_is_eval = 0;
1020
1021             switch (PL_regkind[(U8)OP(scan)]) {
1022             case WHILEM:                /* End of (?:...)* . */
1023                 scan = NEXTOPER(scan);
1024                 goto finish;
1025             case PLUS:
1026                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1027                     next = NEXTOPER(scan);
1028                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1029                         mincount = 1;
1030                         maxcount = REG_INFTY;
1031                         next = regnext(scan);
1032                         scan = NEXTOPER(scan);
1033                         goto do_curly;
1034                     }
1035                 }
1036                 if (flags & SCF_DO_SUBSTR)
1037                     data->pos_min++;
1038                 min++;
1039                 /* Fall through. */
1040             case STAR:
1041                 if (flags & SCF_DO_STCLASS) {
1042                     mincount = 0;
1043                     maxcount = REG_INFTY;
1044                     next = regnext(scan);
1045                     scan = NEXTOPER(scan);
1046                     goto do_curly;
1047                 }
1048                 is_inf = is_inf_internal = 1;
1049                 scan = regnext(scan);
1050                 if (flags & SCF_DO_SUBSTR) {
1051                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1052                     data->longest = &(data->longest_float);
1053                 }
1054                 goto optimize_curly_tail;
1055             case CURLY:
1056                 mincount = ARG1(scan);
1057                 maxcount = ARG2(scan);
1058                 next = regnext(scan);
1059                 if (OP(scan) == CURLYX) {
1060                     I32 lp = (data ? *(data->last_closep) : 0);
1061
1062                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1063                 }
1064                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1065                 next_is_eval = (OP(scan) == EVAL);
1066               do_curly:
1067                 if (flags & SCF_DO_SUBSTR) {
1068                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1069                     pos_before = data->pos_min;
1070                 }
1071                 if (data) {
1072                     fl = data->flags;
1073                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1074                     if (is_inf)
1075                         data->flags |= SF_IS_INF;
1076                 }
1077                 if (flags & SCF_DO_STCLASS) {
1078                     cl_init(pRExC_state, &this_class);
1079                     oclass = data->start_class;
1080                     data->start_class = &this_class;
1081                     f |= SCF_DO_STCLASS_AND;
1082                     f &= ~SCF_DO_STCLASS_OR;
1083                 }
1084                 /* These are the cases when once a subexpression
1085                    fails at a particular position, it cannot succeed
1086                    even after backtracking at the enclosing scope.
1087                 
1088                    XXXX what if minimal match and we are at the
1089                         initial run of {n,m}? */
1090                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1091                     f &= ~SCF_WHILEM_VISITED_POS;
1092
1093                 /* This will finish on WHILEM, setting scan, or on NULL: */
1094                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1095                                       mincount == 0
1096                                         ? (f & ~SCF_DO_SUBSTR) : f);
1097
1098                 if (flags & SCF_DO_STCLASS)
1099                     data->start_class = oclass;
1100                 if (mincount == 0 || minnext == 0) {
1101                     if (flags & SCF_DO_STCLASS_OR) {
1102                         cl_or(pRExC_state, data->start_class, &this_class);
1103                     }
1104                     else if (flags & SCF_DO_STCLASS_AND) {
1105                         /* Switch to OR mode: cache the old value of
1106                          * data->start_class */
1107                         StructCopy(data->start_class, &and_with,
1108                                    struct regnode_charclass_class);
1109                         flags &= ~SCF_DO_STCLASS_AND;
1110                         StructCopy(&this_class, data->start_class,
1111                                    struct regnode_charclass_class);
1112                         flags |= SCF_DO_STCLASS_OR;
1113                         data->start_class->flags |= ANYOF_EOS;
1114                     }
1115                 } else {                /* Non-zero len */
1116                     if (flags & SCF_DO_STCLASS_OR) {
1117                         cl_or(pRExC_state, data->start_class, &this_class);
1118                         cl_and(data->start_class, &and_with);
1119                     }
1120                     else if (flags & SCF_DO_STCLASS_AND)
1121                         cl_and(data->start_class, &this_class);
1122                     flags &= ~SCF_DO_STCLASS;
1123                 }
1124                 if (!scan)              /* It was not CURLYX, but CURLY. */
1125                     scan = next;
1126                 if (ckWARN(WARN_REGEXP)
1127                        /* ? quantifier ok, except for (?{ ... }) */
1128                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1129                     && (minnext == 0) && (deltanext == 0)
1130                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1131                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1132                 {
1133                     vWARN(RExC_parse,
1134                           "Quantifier unexpected on zero-length expression");
1135                 }
1136
1137                 min += minnext * mincount;
1138                 is_inf_internal |= ((maxcount == REG_INFTY
1139                                      && (minnext + deltanext) > 0)
1140                                     || deltanext == I32_MAX);
1141                 is_inf |= is_inf_internal;
1142                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1143
1144                 /* Try powerful optimization CURLYX => CURLYN. */
1145                 if (  OP(oscan) == CURLYX && data
1146                       && data->flags & SF_IN_PAR
1147                       && !(data->flags & SF_HAS_EVAL)
1148                       && !deltanext && minnext == 1 ) {
1149                     /* Try to optimize to CURLYN.  */
1150                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1151                     regnode *nxt1 = nxt;
1152 #ifdef DEBUGGING
1153                     regnode *nxt2;
1154 #endif
1155
1156                     /* Skip open. */
1157                     nxt = regnext(nxt);
1158                     if (!strchr((char*)PL_simple,OP(nxt))
1159                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1160                              && STR_LEN(nxt) == 1))
1161                         goto nogo;
1162 #ifdef DEBUGGING
1163                     nxt2 = nxt;
1164 #endif
1165                     nxt = regnext(nxt);
1166                     if (OP(nxt) != CLOSE)
1167                         goto nogo;
1168                     /* Now we know that nxt2 is the only contents: */
1169                     oscan->flags = (U8)ARG(nxt);
1170                     OP(oscan) = CURLYN;
1171                     OP(nxt1) = NOTHING; /* was OPEN. */
1172 #ifdef DEBUGGING
1173                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1174                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1175                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1176                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1177                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1178                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1179 #endif
1180                 }
1181               nogo:
1182
1183                 /* Try optimization CURLYX => CURLYM. */
1184                 if (  OP(oscan) == CURLYX && data
1185                       && !(data->flags & SF_HAS_PAR)
1186                       && !(data->flags & SF_HAS_EVAL)
1187                       && !deltanext  ) {
1188                     /* XXXX How to optimize if data == 0? */
1189                     /* Optimize to a simpler form.  */
1190                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1191                     regnode *nxt2;
1192
1193                     OP(oscan) = CURLYM;
1194                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1195                             && (OP(nxt2) != WHILEM))
1196                         nxt = nxt2;
1197                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1198                     /* Need to optimize away parenths. */
1199                     if (data->flags & SF_IN_PAR) {
1200                         /* Set the parenth number.  */
1201                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1202
1203                         if (OP(nxt) != CLOSE)
1204                             FAIL("Panic opt close");
1205                         oscan->flags = (U8)ARG(nxt);
1206                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1207                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1208 #ifdef DEBUGGING
1209                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1210                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1211                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1212                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1213 #endif
1214 #if 0
1215                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1216                             regnode *nnxt = regnext(nxt1);
1217                         
1218                             if (nnxt == nxt) {
1219                                 if (reg_off_by_arg[OP(nxt1)])
1220                                     ARG_SET(nxt1, nxt2 - nxt1);
1221                                 else if (nxt2 - nxt1 < U16_MAX)
1222                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1223                                 else
1224                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1225                             }
1226                             nxt1 = nnxt;
1227                         }
1228 #endif
1229                         /* Optimize again: */
1230                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1231                                     NULL, 0);
1232                     }
1233                     else
1234                         oscan->flags = 0;
1235                 }
1236                 else if ((OP(oscan) == CURLYX)
1237                          && (flags & SCF_WHILEM_VISITED_POS)
1238                          /* See the comment on a similar expression above.
1239                             However, this time it not a subexpression
1240                             we care about, but the expression itself. */
1241                          && (maxcount == REG_INFTY)
1242                          && data && ++data->whilem_c < 16) {
1243                     /* This stays as CURLYX, we can put the count/of pair. */
1244                     /* Find WHILEM (as in regexec.c) */
1245                     regnode *nxt = oscan + NEXT_OFF(oscan);
1246
1247                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1248                         nxt += ARG(nxt);
1249                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1250                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1251                 }
1252                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1253                     pars++;
1254                 if (flags & SCF_DO_SUBSTR) {
1255                     SV *last_str = Nullsv;
1256                     int counted = mincount != 0;
1257
1258                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1259 #if defined(SPARC64_GCC_WORKAROUND)
1260                         I32 b = 0;
1261                         STRLEN l = 0;
1262                         char *s = NULL;
1263                         I32 old = 0;
1264
1265                         if (pos_before >= data->last_start_min)
1266                             b = pos_before;
1267                         else
1268                             b = data->last_start_min;
1269
1270                         l = 0;
1271                         s = SvPV(data->last_found, l);
1272                         old = b - data->last_start_min;
1273
1274 #else
1275                         I32 b = pos_before >= data->last_start_min
1276                             ? pos_before : data->last_start_min;
1277                         STRLEN l;
1278                         char *s = SvPV(data->last_found, l);
1279                         I32 old = b - data->last_start_min;
1280 #endif
1281
1282                         if (UTF)
1283                             old = utf8_hop((U8*)s, old) - (U8*)s;
1284                         
1285                         l -= old;
1286                         /* Get the added string: */
1287                         last_str = newSVpvn(s  + old, l);
1288                         if (UTF)
1289                             SvUTF8_on(last_str);
1290                         if (deltanext == 0 && pos_before == b) {
1291                             /* What was added is a constant string */
1292                             if (mincount > 1) {
1293                                 SvGROW(last_str, (mincount * l) + 1);
1294                                 repeatcpy(SvPVX(last_str) + l,
1295                                           SvPVX(last_str), l, mincount - 1);
1296                                 SvCUR(last_str) *= mincount;
1297                                 /* Add additional parts. */
1298                                 SvCUR_set(data->last_found,
1299                                           SvCUR(data->last_found) - l);
1300                                 sv_catsv(data->last_found, last_str);
1301                                 {
1302                                     SV * sv = data->last_found;
1303                                     MAGIC *mg =
1304                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1305                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1306                                     if (mg && mg->mg_len >= 0)
1307                                         mg->mg_len += CHR_SVLEN(last_str);
1308                                 }
1309                                 data->last_end += l * (mincount - 1);
1310                             }
1311                         } else {
1312                             /* start offset must point into the last copy */
1313                             data->last_start_min += minnext * (mincount - 1);
1314                             data->last_start_max += is_inf ? I32_MAX
1315                                 : (maxcount - 1) * (minnext + data->pos_delta);
1316                         }
1317                     }
1318                     /* It is counted once already... */
1319                     data->pos_min += minnext * (mincount - counted);
1320                     data->pos_delta += - counted * deltanext +
1321                         (minnext + deltanext) * maxcount - minnext * mincount;
1322                     if (mincount != maxcount) {
1323                          /* Cannot extend fixed substrings found inside
1324                             the group.  */
1325                         scan_commit(pRExC_state,data);
1326                         if (mincount && last_str) {
1327                             sv_setsv(data->last_found, last_str);
1328                             data->last_end = data->pos_min;
1329                             data->last_start_min =
1330                                 data->pos_min - CHR_SVLEN(last_str);
1331                             data->last_start_max = is_inf
1332                                 ? I32_MAX
1333                                 : data->pos_min + data->pos_delta
1334                                 - CHR_SVLEN(last_str);
1335                         }
1336                         data->longest = &(data->longest_float);
1337                     }
1338                     SvREFCNT_dec(last_str);
1339                 }
1340                 if (data && (fl & SF_HAS_EVAL))
1341                     data->flags |= SF_HAS_EVAL;
1342               optimize_curly_tail:
1343                 if (OP(oscan) != CURLYX) {
1344                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1345                            && NEXT_OFF(next))
1346                         NEXT_OFF(oscan) += NEXT_OFF(next);
1347                 }
1348                 continue;
1349             default:                    /* REF and CLUMP only? */
1350                 if (flags & SCF_DO_SUBSTR) {
1351                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1352                     data->longest = &(data->longest_float);
1353                 }
1354                 is_inf = is_inf_internal = 1;
1355                 if (flags & SCF_DO_STCLASS_OR)
1356                     cl_anything(pRExC_state, data->start_class);
1357                 flags &= ~SCF_DO_STCLASS;
1358                 break;
1359             }
1360         }
1361         else if (strchr((char*)PL_simple,OP(scan))) {
1362             int value = 0;
1363
1364             if (flags & SCF_DO_SUBSTR) {
1365                 scan_commit(pRExC_state,data);
1366                 data->pos_min++;
1367             }
1368             min++;
1369             if (flags & SCF_DO_STCLASS) {
1370                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1371
1372                 /* Some of the logic below assumes that switching
1373                    locale on will only add false positives. */
1374                 switch (PL_regkind[(U8)OP(scan)]) {
1375                 case SANY:
1376                 default:
1377                   do_default:
1378                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1379                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1380                         cl_anything(pRExC_state, data->start_class);
1381                     break;
1382                 case REG_ANY:
1383                     if (OP(scan) == SANY)
1384                         goto do_default;
1385                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1386                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1387                                  || (data->start_class->flags & ANYOF_CLASS));
1388                         cl_anything(pRExC_state, data->start_class);
1389                     }
1390                     if (flags & SCF_DO_STCLASS_AND || !value)
1391                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1392                     break;
1393                 case ANYOF:
1394                     if (flags & SCF_DO_STCLASS_AND)
1395                         cl_and(data->start_class,
1396                                (struct regnode_charclass_class*)scan);
1397                     else
1398                         cl_or(pRExC_state, data->start_class,
1399                               (struct regnode_charclass_class*)scan);
1400                     break;
1401                 case ALNUM:
1402                     if (flags & SCF_DO_STCLASS_AND) {
1403                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1404                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1405                             for (value = 0; value < 256; value++)
1406                                 if (!isALNUM(value))
1407                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1408                         }
1409                     }
1410                     else {
1411                         if (data->start_class->flags & ANYOF_LOCALE)
1412                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1413                         else {
1414                             for (value = 0; value < 256; value++)
1415                                 if (isALNUM(value))
1416                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1417                         }
1418                     }
1419                     break;
1420                 case ALNUML:
1421                     if (flags & SCF_DO_STCLASS_AND) {
1422                         if (data->start_class->flags & ANYOF_LOCALE)
1423                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1424                     }
1425                     else {
1426                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1427                         data->start_class->flags |= ANYOF_LOCALE;
1428                     }
1429                     break;
1430                 case NALNUM:
1431                     if (flags & SCF_DO_STCLASS_AND) {
1432                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1433                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1434                             for (value = 0; value < 256; value++)
1435                                 if (isALNUM(value))
1436                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1437                         }
1438                     }
1439                     else {
1440                         if (data->start_class->flags & ANYOF_LOCALE)
1441                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1442                         else {
1443                             for (value = 0; value < 256; value++)
1444                                 if (!isALNUM(value))
1445                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1446                         }
1447                     }
1448                     break;
1449                 case NALNUML:
1450                     if (flags & SCF_DO_STCLASS_AND) {
1451                         if (data->start_class->flags & ANYOF_LOCALE)
1452                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1453                     }
1454                     else {
1455                         data->start_class->flags |= ANYOF_LOCALE;
1456                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1457                     }
1458                     break;
1459                 case SPACE:
1460                     if (flags & SCF_DO_STCLASS_AND) {
1461                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1462                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1463                             for (value = 0; value < 256; value++)
1464                                 if (!isSPACE(value))
1465                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1466                         }
1467                     }
1468                     else {
1469                         if (data->start_class->flags & ANYOF_LOCALE)
1470                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1471                         else {
1472                             for (value = 0; value < 256; value++)
1473                                 if (isSPACE(value))
1474                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1475                         }
1476                     }
1477                     break;
1478                 case SPACEL:
1479                     if (flags & SCF_DO_STCLASS_AND) {
1480                         if (data->start_class->flags & ANYOF_LOCALE)
1481                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1482                     }
1483                     else {
1484                         data->start_class->flags |= ANYOF_LOCALE;
1485                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1486                     }
1487                     break;
1488                 case NSPACE:
1489                     if (flags & SCF_DO_STCLASS_AND) {
1490                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1491                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1492                             for (value = 0; value < 256; value++)
1493                                 if (isSPACE(value))
1494                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1495                         }
1496                     }
1497                     else {
1498                         if (data->start_class->flags & ANYOF_LOCALE)
1499                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1500                         else {
1501                             for (value = 0; value < 256; value++)
1502                                 if (!isSPACE(value))
1503                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1504                         }
1505                     }
1506                     break;
1507                 case NSPACEL:
1508                     if (flags & SCF_DO_STCLASS_AND) {
1509                         if (data->start_class->flags & ANYOF_LOCALE) {
1510                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1511                             for (value = 0; value < 256; value++)
1512                                 if (!isSPACE(value))
1513                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1514                         }
1515                     }
1516                     else {
1517                         data->start_class->flags |= ANYOF_LOCALE;
1518                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1519                     }
1520                     break;
1521                 case DIGIT:
1522                     if (flags & SCF_DO_STCLASS_AND) {
1523                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1524                         for (value = 0; value < 256; value++)
1525                             if (!isDIGIT(value))
1526                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1527                     }
1528                     else {
1529                         if (data->start_class->flags & ANYOF_LOCALE)
1530                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1531                         else {
1532                             for (value = 0; value < 256; value++)
1533                                 if (isDIGIT(value))
1534                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1535                         }
1536                     }
1537                     break;
1538                 case NDIGIT:
1539                     if (flags & SCF_DO_STCLASS_AND) {
1540                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1541                         for (value = 0; value < 256; value++)
1542                             if (isDIGIT(value))
1543                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1544                     }
1545                     else {
1546                         if (data->start_class->flags & ANYOF_LOCALE)
1547                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1548                         else {
1549                             for (value = 0; value < 256; value++)
1550                                 if (!isDIGIT(value))
1551                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1552                         }
1553                     }
1554                     break;
1555                 }
1556                 if (flags & SCF_DO_STCLASS_OR)
1557                     cl_and(data->start_class, &and_with);
1558                 flags &= ~SCF_DO_STCLASS;
1559             }
1560         }
1561         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1562             data->flags |= (OP(scan) == MEOL
1563                             ? SF_BEFORE_MEOL
1564                             : SF_BEFORE_SEOL);
1565         }
1566         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1567                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1568                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1569                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1570             /* Lookahead/lookbehind */
1571             I32 deltanext, minnext, fake = 0;
1572             regnode *nscan;
1573             struct regnode_charclass_class intrnl;
1574             int f = 0;
1575
1576             data_fake.flags = 0;
1577             if (data) {         
1578                 data_fake.whilem_c = data->whilem_c;
1579                 data_fake.last_closep = data->last_closep;
1580             }
1581             else
1582                 data_fake.last_closep = &fake;
1583             if ( flags & SCF_DO_STCLASS && !scan->flags
1584                  && OP(scan) == IFMATCH ) { /* Lookahead */
1585                 cl_init(pRExC_state, &intrnl);
1586                 data_fake.start_class = &intrnl;
1587                 f |= SCF_DO_STCLASS_AND;
1588             }
1589             if (flags & SCF_WHILEM_VISITED_POS)
1590                 f |= SCF_WHILEM_VISITED_POS;
1591             next = regnext(scan);
1592             nscan = NEXTOPER(NEXTOPER(scan));
1593             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1594             if (scan->flags) {
1595                 if (deltanext) {
1596                     vFAIL("Variable length lookbehind not implemented");
1597                 }
1598                 else if (minnext > U8_MAX) {
1599                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1600                 }
1601                 scan->flags = (U8)minnext;
1602             }
1603             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1604                 pars++;
1605             if (data && (data_fake.flags & SF_HAS_EVAL))
1606                 data->flags |= SF_HAS_EVAL;
1607             if (data)
1608                 data->whilem_c = data_fake.whilem_c;
1609             if (f & SCF_DO_STCLASS_AND) {
1610                 int was = (data->start_class->flags & ANYOF_EOS);
1611
1612                 cl_and(data->start_class, &intrnl);
1613                 if (was)
1614                     data->start_class->flags |= ANYOF_EOS;
1615             }
1616         }
1617         else if (OP(scan) == OPEN) {
1618             pars++;
1619         }
1620         else if (OP(scan) == CLOSE) {
1621             if ((I32)ARG(scan) == is_par) {
1622                 next = regnext(scan);
1623
1624                 if ( next && (OP(next) != WHILEM) && next < last)
1625                     is_par = 0;         /* Disable optimization */
1626             }
1627             if (data)
1628                 *(data->last_closep) = ARG(scan);
1629         }
1630         else if (OP(scan) == EVAL) {
1631                 if (data)
1632                     data->flags |= SF_HAS_EVAL;
1633         }
1634         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1635                 if (flags & SCF_DO_SUBSTR) {
1636                     scan_commit(pRExC_state,data);
1637                     data->longest = &(data->longest_float);
1638                 }
1639                 is_inf = is_inf_internal = 1;
1640                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1641                     cl_anything(pRExC_state, data->start_class);
1642                 flags &= ~SCF_DO_STCLASS;
1643         }
1644         /* Else: zero-length, ignore. */
1645         scan = regnext(scan);
1646     }
1647
1648   finish:
1649     *scanp = scan;
1650     *deltap = is_inf_internal ? I32_MAX : delta;
1651     if (flags & SCF_DO_SUBSTR && is_inf)
1652         data->pos_delta = I32_MAX - data->pos_min;
1653     if (is_par > U8_MAX)
1654         is_par = 0;
1655     if (is_par && pars==1 && data) {
1656         data->flags |= SF_IN_PAR;
1657         data->flags &= ~SF_HAS_PAR;
1658     }
1659     else if (pars && data) {
1660         data->flags |= SF_HAS_PAR;
1661         data->flags &= ~SF_IN_PAR;
1662     }
1663     if (flags & SCF_DO_STCLASS_OR)
1664         cl_and(data->start_class, &and_with);
1665     return min;
1666 }
1667
1668 STATIC I32
1669 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1670 {
1671     if (RExC_rx->data) {
1672         Renewc(RExC_rx->data,
1673                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1674                char, struct reg_data);
1675         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1676         RExC_rx->data->count += n;
1677     }
1678     else {
1679         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1680              char, struct reg_data);
1681         New(1208, RExC_rx->data->what, n, U8);
1682         RExC_rx->data->count = n;
1683     }
1684     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1685     return RExC_rx->data->count - n;
1686 }
1687
1688 void
1689 Perl_reginitcolors(pTHX)
1690 {
1691     int i = 0;
1692     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1693         
1694     if (s) {
1695         PL_colors[0] = s = savepv(s);
1696         while (++i < 6) {
1697             s = strchr(s, '\t');
1698             if (s) {
1699                 *s = '\0';
1700                 PL_colors[i] = ++s;
1701             }
1702             else
1703                 PL_colors[i] = s = "";
1704         }
1705     } else {
1706         while (i < 6)
1707             PL_colors[i++] = "";
1708     }
1709     PL_colorset = 1;
1710 }
1711
1712
1713 /*
1714  - pregcomp - compile a regular expression into internal code
1715  *
1716  * We can't allocate space until we know how big the compiled form will be,
1717  * but we can't compile it (and thus know how big it is) until we've got a
1718  * place to put the code.  So we cheat:  we compile it twice, once with code
1719  * generation turned off and size counting turned on, and once "for real".
1720  * This also means that we don't allocate space until we are sure that the
1721  * thing really will compile successfully, and we never have to move the
1722  * code and thus invalidate pointers into it.  (Note that it has to be in
1723  * one piece because free() must be able to free it all.) [NB: not true in perl]
1724  *
1725  * Beware that the optimization-preparation code in here knows about some
1726  * of the structure of the compiled regexp.  [I'll say.]
1727  */
1728 regexp *
1729 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1730 {
1731     register regexp *r;
1732     regnode *scan;
1733     regnode *first;
1734     I32 flags;
1735     I32 minlen = 0;
1736     I32 sawplus = 0;
1737     I32 sawopen = 0;
1738     scan_data_t data;
1739     RExC_state_t RExC_state;
1740     RExC_state_t *pRExC_state = &RExC_state;
1741
1742     if (exp == NULL)
1743         FAIL("NULL regexp argument");
1744
1745     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1746
1747     RExC_precomp = exp;
1748     DEBUG_r({
1749          if (!PL_colorset) reginitcolors();
1750          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1751                        PL_colors[4],PL_colors[5],PL_colors[0],
1752                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1753     });
1754     RExC_flags = pm->op_pmflags;
1755     RExC_sawback = 0;
1756
1757     RExC_seen = 0;
1758     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1759     RExC_seen_evals = 0;
1760     RExC_extralen = 0;
1761
1762     /* First pass: determine size, legality. */
1763     RExC_parse = exp;
1764     RExC_start = exp;
1765     RExC_end = xend;
1766     RExC_naughty = 0;
1767     RExC_npar = 1;
1768     RExC_size = 0L;
1769     RExC_emit = &PL_regdummy;
1770     RExC_whilem_seen = 0;
1771 #if 0 /* REGC() is (currently) a NOP at the first pass.
1772        * Clever compilers notice this and complain. --jhi */
1773     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1774 #endif
1775     if (reg(pRExC_state, 0, &flags) == NULL) {
1776         RExC_precomp = Nullch;
1777         return(NULL);
1778     }
1779     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1780
1781     /* Small enough for pointer-storage convention?
1782        If extralen==0, this means that we will not need long jumps. */
1783     if (RExC_size >= 0x10000L && RExC_extralen)
1784         RExC_size += RExC_extralen;
1785     else
1786         RExC_extralen = 0;
1787     if (RExC_whilem_seen > 15)
1788         RExC_whilem_seen = 15;
1789
1790     /* Allocate space and initialize. */
1791     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1792          char, regexp);
1793     if (r == NULL)
1794         FAIL("Regexp out of space");
1795
1796 #ifdef DEBUGGING
1797     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1798     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1799 #endif
1800     r->refcnt = 1;
1801     r->prelen = xend - exp;
1802     r->precomp = savepvn(RExC_precomp, r->prelen);
1803     r->subbeg = NULL;
1804 #ifdef PERL_COPY_ON_WRITE
1805     r->saved_copy = Nullsv;
1806 #endif
1807     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1808     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1809
1810     r->substrs = 0;                     /* Useful during FAIL. */
1811     r->startp = 0;                      /* Useful during FAIL. */
1812     r->endp = 0;                        /* Useful during FAIL. */
1813
1814     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1815     if (r->offsets) {
1816       r->offsets[0] = RExC_size; 
1817     }
1818     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1819                           "%s %"UVuf" bytes for offset annotations.\n", 
1820                           r->offsets ? "Got" : "Couldn't get", 
1821                           (UV)((2*RExC_size+1) * sizeof(U32))));
1822
1823     RExC_rx = r;
1824
1825     /* Second pass: emit code. */
1826     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1827     RExC_parse = exp;
1828     RExC_end = xend;
1829     RExC_naughty = 0;
1830     RExC_npar = 1;
1831     RExC_emit_start = r->program;
1832     RExC_emit = r->program;
1833     /* Store the count of eval-groups for security checks: */
1834     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1835     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1836     r->data = 0;
1837     if (reg(pRExC_state, 0, &flags) == NULL)
1838         return(NULL);
1839
1840     /* Dig out information for optimizations. */
1841     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1842     pm->op_pmflags = RExC_flags;
1843     if (UTF)
1844         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1845     r->regstclass = NULL;
1846     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1847         r->reganch |= ROPT_NAUGHTY;
1848     scan = r->program + 1;              /* First BRANCH. */
1849
1850     /* XXXX To minimize changes to RE engine we always allocate
1851        3-units-long substrs field. */
1852     Newz(1004, r->substrs, 1, struct reg_substr_data);
1853
1854     StructCopy(&zero_scan_data, &data, scan_data_t);
1855     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1856     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1857         I32 fake;
1858         STRLEN longest_float_length, longest_fixed_length;
1859         struct regnode_charclass_class ch_class;
1860         int stclass_flag;
1861         I32 last_close = 0;
1862
1863         first = scan;
1864         /* Skip introductions and multiplicators >= 1. */
1865         while ((OP(first) == OPEN && (sawopen = 1)) ||
1866                /* An OR of *one* alternative - should not happen now. */
1867             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1868             (OP(first) == PLUS) ||
1869             (OP(first) == MINMOD) ||
1870                /* An {n,m} with n>0 */
1871             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1872                 if (OP(first) == PLUS)
1873                     sawplus = 1;
1874                 else
1875                     first += regarglen[(U8)OP(first)];
1876                 first = NEXTOPER(first);
1877         }
1878
1879         /* Starting-point info. */
1880       again:
1881         if (PL_regkind[(U8)OP(first)] == EXACT) {
1882             if (OP(first) == EXACT)
1883                 ;       /* Empty, get anchored substr later. */
1884             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1885                 r->regstclass = first;
1886         }
1887         else if (strchr((char*)PL_simple,OP(first)))
1888             r->regstclass = first;
1889         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1890                  PL_regkind[(U8)OP(first)] == NBOUND)
1891             r->regstclass = first;
1892         else if (PL_regkind[(U8)OP(first)] == BOL) {
1893             r->reganch |= (OP(first) == MBOL
1894                            ? ROPT_ANCH_MBOL
1895                            : (OP(first) == SBOL
1896                               ? ROPT_ANCH_SBOL
1897                               : ROPT_ANCH_BOL));
1898             first = NEXTOPER(first);
1899             goto again;
1900         }
1901         else if (OP(first) == GPOS) {
1902             r->reganch |= ROPT_ANCH_GPOS;
1903             first = NEXTOPER(first);
1904             goto again;
1905         }
1906         else if (!sawopen && (OP(first) == STAR &&
1907             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1908             !(r->reganch & ROPT_ANCH) )
1909         {
1910             /* turn .* into ^.* with an implied $*=1 */
1911             int type = OP(NEXTOPER(first));
1912
1913             if (type == REG_ANY)
1914                 type = ROPT_ANCH_MBOL;
1915             else
1916                 type = ROPT_ANCH_SBOL;
1917
1918             r->reganch |= type | ROPT_IMPLICIT;
1919             first = NEXTOPER(first);
1920             goto again;
1921         }
1922         if (sawplus && (!sawopen || !RExC_sawback)
1923             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1924             /* x+ must match at the 1st pos of run of x's */
1925             r->reganch |= ROPT_SKIP;
1926
1927         /* Scan is after the zeroth branch, first is atomic matcher. */
1928         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1929                               (IV)(first - scan + 1)));
1930         /*
1931         * If there's something expensive in the r.e., find the
1932         * longest literal string that must appear and make it the
1933         * regmust.  Resolve ties in favor of later strings, since
1934         * the regstart check works with the beginning of the r.e.
1935         * and avoiding duplication strengthens checking.  Not a
1936         * strong reason, but sufficient in the absence of others.
1937         * [Now we resolve ties in favor of the earlier string if
1938         * it happens that c_offset_min has been invalidated, since the
1939         * earlier string may buy us something the later one won't.]
1940         */
1941         minlen = 0;
1942
1943         data.longest_fixed = newSVpvn("",0);
1944         data.longest_float = newSVpvn("",0);
1945         data.last_found = newSVpvn("",0);
1946         data.longest = &(data.longest_fixed);
1947         first = scan;
1948         if (!r->regstclass) {
1949             cl_init(pRExC_state, &ch_class);
1950             data.start_class = &ch_class;
1951             stclass_flag = SCF_DO_STCLASS_AND;
1952         } else                          /* XXXX Check for BOUND? */
1953             stclass_flag = 0;
1954         data.last_closep = &last_close;
1955
1956         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1957                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1958         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1959              && data.last_start_min == 0 && data.last_end > 0
1960              && !RExC_seen_zerolen
1961              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1962             r->reganch |= ROPT_CHECK_ALL;
1963         scan_commit(pRExC_state, &data);
1964         SvREFCNT_dec(data.last_found);
1965
1966         longest_float_length = CHR_SVLEN(data.longest_float);
1967         if (longest_float_length
1968             || (data.flags & SF_FL_BEFORE_EOL
1969                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1970                     || (RExC_flags & PMf_MULTILINE)))) {
1971             int t;
1972
1973             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1974                 && data.offset_fixed == data.offset_float_min
1975                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1976                     goto remove_float;          /* As in (a)+. */
1977
1978             if (SvUTF8(data.longest_float)) {
1979                 r->float_utf8 = data.longest_float;
1980                 r->float_substr = Nullsv;
1981             } else {
1982                 r->float_substr = data.longest_float;
1983                 r->float_utf8 = Nullsv;
1984             }
1985             r->float_min_offset = data.offset_float_min;
1986             r->float_max_offset = data.offset_float_max;
1987             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1988                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1989                            || (RExC_flags & PMf_MULTILINE)));
1990             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1991         }
1992         else {
1993           remove_float:
1994             r->float_substr = r->float_utf8 = Nullsv;
1995             SvREFCNT_dec(data.longest_float);
1996             longest_float_length = 0;
1997         }
1998
1999         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2000         if (longest_fixed_length
2001             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2002                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2003                     || (RExC_flags & PMf_MULTILINE)))) {
2004             int t;
2005
2006             if (SvUTF8(data.longest_fixed)) {
2007                 r->anchored_utf8 = data.longest_fixed;
2008                 r->anchored_substr = Nullsv;
2009             } else {
2010                 r->anchored_substr = data.longest_fixed;
2011                 r->anchored_utf8 = Nullsv;
2012             }
2013             r->anchored_offset = data.offset_fixed;
2014             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2015                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2016                      || (RExC_flags & PMf_MULTILINE)));
2017             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2018         }
2019         else {
2020             r->anchored_substr = r->anchored_utf8 = Nullsv;
2021             SvREFCNT_dec(data.longest_fixed);
2022             longest_fixed_length = 0;
2023         }
2024         if (r->regstclass
2025             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2026             r->regstclass = NULL;
2027         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2028             && stclass_flag
2029             && !(data.start_class->flags & ANYOF_EOS)
2030             && !cl_is_anything(data.start_class))
2031         {
2032             I32 n = add_data(pRExC_state, 1, "f");
2033
2034             New(1006, RExC_rx->data->data[n], 1,
2035                 struct regnode_charclass_class);
2036             StructCopy(data.start_class,
2037                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2038                        struct regnode_charclass_class);
2039             r->regstclass = (regnode*)RExC_rx->data->data[n];
2040             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2041             PL_regdata = r->data; /* for regprop() */
2042             DEBUG_r({ SV *sv = sv_newmortal();
2043                       regprop(sv, (regnode*)data.start_class);
2044                       PerlIO_printf(Perl_debug_log,
2045                                     "synthetic stclass `%s'.\n",
2046                                     SvPVX(sv));});
2047         }
2048
2049         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2050         if (longest_fixed_length > longest_float_length) {
2051             r->check_substr = r->anchored_substr;
2052             r->check_utf8 = r->anchored_utf8;
2053             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2054             if (r->reganch & ROPT_ANCH_SINGLE)
2055                 r->reganch |= ROPT_NOSCAN;
2056         }
2057         else {
2058             r->check_substr = r->float_substr;
2059             r->check_utf8 = r->float_utf8;
2060             r->check_offset_min = data.offset_float_min;
2061             r->check_offset_max = data.offset_float_max;
2062         }
2063         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2064            This should be changed ASAP!  */
2065         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2066             r->reganch |= RE_USE_INTUIT;
2067             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2068                 r->reganch |= RE_INTUIT_TAIL;
2069         }
2070     }
2071     else {
2072         /* Several toplevels. Best we can is to set minlen. */
2073         I32 fake;
2074         struct regnode_charclass_class ch_class;
2075         I32 last_close = 0;
2076         
2077         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2078         scan = r->program + 1;
2079         cl_init(pRExC_state, &ch_class);
2080         data.start_class = &ch_class;
2081         data.last_closep = &last_close;
2082         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2083         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2084                 = r->float_substr = r->float_utf8 = Nullsv;
2085         if (!(data.start_class->flags & ANYOF_EOS)
2086             && !cl_is_anything(data.start_class))
2087         {
2088             I32 n = add_data(pRExC_state, 1, "f");
2089
2090             New(1006, RExC_rx->data->data[n], 1,
2091                 struct regnode_charclass_class);
2092             StructCopy(data.start_class,
2093                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2094                        struct regnode_charclass_class);
2095             r->regstclass = (regnode*)RExC_rx->data->data[n];
2096             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2097             DEBUG_r({ SV* sv = sv_newmortal();
2098                       regprop(sv, (regnode*)data.start_class);
2099                       PerlIO_printf(Perl_debug_log,
2100                                     "synthetic stclass `%s'.\n",
2101                                     SvPVX(sv));});
2102         }
2103     }
2104
2105     r->minlen = minlen;
2106     if (RExC_seen & REG_SEEN_GPOS)
2107         r->reganch |= ROPT_GPOS_SEEN;
2108     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2109         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2110     if (RExC_seen & REG_SEEN_EVAL)
2111         r->reganch |= ROPT_EVAL_SEEN;
2112     if (RExC_seen & REG_SEEN_CANY)
2113         r->reganch |= ROPT_CANY_SEEN;
2114     Newz(1002, r->startp, RExC_npar, I32);
2115     Newz(1002, r->endp, RExC_npar, I32);
2116     PL_regdata = r->data; /* for regprop() */
2117     DEBUG_r(regdump(r));
2118     return(r);
2119 }
2120
2121 /*
2122  - reg - regular expression, i.e. main body or parenthesized thing
2123  *
2124  * Caller must absorb opening parenthesis.
2125  *
2126  * Combining parenthesis handling with the base level of regular expression
2127  * is a trifle forced, but the need to tie the tails of the branches to what
2128  * follows makes it hard to avoid.
2129  */
2130 STATIC regnode *
2131 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2132     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2133 {
2134     register regnode *ret;              /* Will be the head of the group. */
2135     register regnode *br;
2136     register regnode *lastbr;
2137     register regnode *ender = 0;
2138     register I32 parno = 0;
2139     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2140
2141     /* for (?g), (?gc), and (?o) warnings; warning
2142        about (?c) will warn about (?g) -- japhy    */
2143
2144     I32 wastedflags = 0x00,
2145         wasted_o    = 0x01,
2146         wasted_g    = 0x02,
2147         wasted_gc   = 0x02 | 0x04,
2148         wasted_c    = 0x04;
2149
2150     char * parse_start = RExC_parse; /* MJD */
2151     char *oregcomp_parse = RExC_parse;
2152     char c;
2153
2154     *flagp = 0;                         /* Tentatively. */
2155
2156
2157     /* Make an OPEN node, if parenthesized. */
2158     if (paren) {
2159         if (*RExC_parse == '?') { /* (?...) */
2160             U32 posflags = 0, negflags = 0;
2161             U32 *flagsp = &posflags;
2162             int logical = 0;
2163             char *seqstart = RExC_parse;
2164
2165             RExC_parse++;
2166             paren = *RExC_parse++;
2167             ret = NULL;                 /* For look-ahead/behind. */
2168             switch (paren) {
2169             case '<':           /* (?<...) */
2170                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2171                 if (*RExC_parse == '!')
2172                     paren = ',';
2173                 if (*RExC_parse != '=' && *RExC_parse != '!')
2174                     goto unknown;
2175                 RExC_parse++;
2176             case '=':           /* (?=...) */
2177             case '!':           /* (?!...) */
2178                 RExC_seen_zerolen++;
2179             case ':':           /* (?:...) */
2180             case '>':           /* (?>...) */
2181                 break;
2182             case '$':           /* (?$...) */
2183             case '@':           /* (?@...) */
2184                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2185                 break;
2186             case '#':           /* (?#...) */
2187                 while (*RExC_parse && *RExC_parse != ')')
2188                     RExC_parse++;
2189                 if (*RExC_parse != ')')
2190                     FAIL("Sequence (?#... not terminated");
2191                 nextchar(pRExC_state);
2192                 *flagp = TRYAGAIN;
2193                 return NULL;
2194             case 'p':           /* (?p...) */
2195                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2196                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2197                 /* FALL THROUGH*/
2198             case '?':           /* (??...) */
2199                 logical = 1;
2200                 if (*RExC_parse != '{')
2201                     goto unknown;
2202                 paren = *RExC_parse++;
2203                 /* FALL THROUGH */
2204             case '{':           /* (?{...}) */
2205             {
2206                 I32 count = 1, n = 0;
2207                 char c;
2208                 char *s = RExC_parse;
2209                 SV *sv;
2210                 OP_4tree *sop, *rop;
2211
2212                 RExC_seen_zerolen++;
2213                 RExC_seen |= REG_SEEN_EVAL;
2214                 while (count && (c = *RExC_parse)) {
2215                     if (c == '\\' && RExC_parse[1])
2216                         RExC_parse++;
2217                     else if (c == '{')
2218                         count++;
2219                     else if (c == '}')
2220                         count--;
2221                     RExC_parse++;
2222                 }
2223                 if (*RExC_parse != ')')
2224                 {
2225                     RExC_parse = s;             
2226                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2227                 }
2228                 if (!SIZE_ONLY) {
2229                     PAD *pad;
2230                 
2231                     if (RExC_parse - 1 - s)
2232                         sv = newSVpvn(s, RExC_parse - 1 - s);
2233                     else
2234                         sv = newSVpvn("", 0);
2235
2236                     ENTER;
2237                     Perl_save_re_context(aTHX);
2238                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2239                     sop->op_private |= OPpREFCOUNTED;
2240                     /* re_dup will OpREFCNT_inc */
2241                     OpREFCNT_set(sop, 1);
2242                     LEAVE;
2243
2244                     n = add_data(pRExC_state, 3, "nop");
2245                     RExC_rx->data->data[n] = (void*)rop;
2246                     RExC_rx->data->data[n+1] = (void*)sop;
2247                     RExC_rx->data->data[n+2] = (void*)pad;
2248                     SvREFCNT_dec(sv);
2249                 }
2250                 else {                                          /* First pass */
2251                     if (PL_reginterp_cnt < ++RExC_seen_evals
2252                         && PL_curcop != &PL_compiling)
2253                         /* No compiled RE interpolated, has runtime
2254                            components ===> unsafe.  */
2255                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2256                     if (PL_tainting && PL_tainted)
2257                         FAIL("Eval-group in insecure regular expression");
2258                 }
2259                 
2260                 nextchar(pRExC_state);
2261                 if (logical) {
2262                     ret = reg_node(pRExC_state, LOGICAL);
2263                     if (!SIZE_ONLY)
2264                         ret->flags = 2;
2265                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2266                     /* deal with the length of this later - MJD */
2267                     return ret;
2268                 }
2269                 ret = reganode(pRExC_state, EVAL, n);
2270                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2271                 Set_Node_Offset(ret, parse_start);
2272                 return ret;
2273             }
2274             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2275             {
2276                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2277                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2278                         || RExC_parse[1] == '<'
2279                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2280                         I32 flag;
2281                         
2282                         ret = reg_node(pRExC_state, LOGICAL);
2283                         if (!SIZE_ONLY)
2284                             ret->flags = 1;
2285                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2286                         goto insert_if;
2287                     }
2288                 }
2289                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2290                     /* (?(1)...) */
2291                     parno = atoi(RExC_parse++);
2292
2293                     while (isDIGIT(*RExC_parse))
2294                         RExC_parse++;
2295                     ret = reganode(pRExC_state, GROUPP, parno);
2296                     
2297                     if ((c = *nextchar(pRExC_state)) != ')')
2298                         vFAIL("Switch condition not recognized");
2299                   insert_if:
2300                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2301                     br = regbranch(pRExC_state, &flags, 1);
2302                     if (br == NULL)
2303                         br = reganode(pRExC_state, LONGJMP, 0);
2304                     else
2305                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2306                     c = *nextchar(pRExC_state);
2307                     if (flags&HASWIDTH)
2308                         *flagp |= HASWIDTH;
2309                     if (c == '|') {
2310                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2311                         regbranch(pRExC_state, &flags, 1);
2312                         regtail(pRExC_state, ret, lastbr);
2313                         if (flags&HASWIDTH)
2314                             *flagp |= HASWIDTH;
2315                         c = *nextchar(pRExC_state);
2316                     }
2317                     else
2318                         lastbr = NULL;
2319                     if (c != ')')
2320                         vFAIL("Switch (?(condition)... contains too many branches");
2321                     ender = reg_node(pRExC_state, TAIL);
2322                     regtail(pRExC_state, br, ender);
2323                     if (lastbr) {
2324                         regtail(pRExC_state, lastbr, ender);
2325                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2326                     }
2327                     else
2328                         regtail(pRExC_state, ret, ender);
2329                     return ret;
2330                 }
2331                 else {
2332                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2333                 }
2334             }
2335             case 0:
2336                 RExC_parse--; /* for vFAIL to print correctly */
2337                 vFAIL("Sequence (? incomplete");
2338                 break;
2339             default:
2340                 --RExC_parse;
2341               parse_flags:      /* (?i) */
2342                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2343                     /* (?g), (?gc) and (?o) are useless here
2344                        and must be globally applied -- japhy */
2345
2346                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2347                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2348                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2349                             if (! (wastedflags & wflagbit) ) {
2350                                 wastedflags |= wflagbit;
2351                                 vWARN5(
2352                                     RExC_parse + 1,
2353                                     "Useless (%s%c) - %suse /%c modifier",
2354                                     flagsp == &negflags ? "?-" : "?",
2355                                     *RExC_parse,
2356                                     flagsp == &negflags ? "don't " : "",
2357                                     *RExC_parse
2358                                 );
2359                             }
2360                         }
2361                     }
2362                     else if (*RExC_parse == 'c') {
2363                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2364                             if (! (wastedflags & wasted_c) ) {
2365                                 wastedflags |= wasted_gc;
2366                                 vWARN3(
2367                                     RExC_parse + 1,
2368                                     "Useless (%sc) - %suse /gc modifier",
2369                                     flagsp == &negflags ? "?-" : "?",
2370                                     flagsp == &negflags ? "don't " : ""
2371                                 );
2372                             }
2373                         }
2374                     }
2375                     else { pmflag(flagsp, *RExC_parse); }
2376
2377                     ++RExC_parse;
2378                 }
2379                 if (*RExC_parse == '-') {
2380                     flagsp = &negflags;
2381                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2382                     ++RExC_parse;
2383                     goto parse_flags;
2384                 }
2385                 RExC_flags |= posflags;
2386                 RExC_flags &= ~negflags;
2387                 if (*RExC_parse == ':') {
2388                     RExC_parse++;
2389                     paren = ':';
2390                     break;
2391                 }               
2392               unknown:
2393                 if (*RExC_parse != ')') {
2394                     RExC_parse++;
2395                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2396                 }
2397                 nextchar(pRExC_state);
2398                 *flagp = TRYAGAIN;
2399                 return NULL;
2400             }
2401         }
2402         else {                  /* (...) */
2403             parno = RExC_npar;
2404             RExC_npar++;
2405             ret = reganode(pRExC_state, OPEN, parno);
2406             Set_Node_Length(ret, 1); /* MJD */
2407             Set_Node_Offset(ret, RExC_parse); /* MJD */
2408             open = 1;
2409         }
2410     }
2411     else                        /* ! paren */
2412         ret = NULL;
2413
2414     /* Pick up the branches, linking them together. */
2415     parse_start = RExC_parse;   /* MJD */
2416     br = regbranch(pRExC_state, &flags, 1);
2417     /*     branch_len = (paren != 0); */
2418     
2419     if (br == NULL)
2420         return(NULL);
2421     if (*RExC_parse == '|') {
2422         if (!SIZE_ONLY && RExC_extralen) {
2423             reginsert(pRExC_state, BRANCHJ, br);
2424         }
2425         else {                  /* MJD */
2426             reginsert(pRExC_state, BRANCH, br);
2427             Set_Node_Length(br, paren != 0);
2428             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2429         }
2430         have_branch = 1;
2431         if (SIZE_ONLY)
2432             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2433     }
2434     else if (paren == ':') {
2435         *flagp |= flags&SIMPLE;
2436     }
2437     if (open) {                         /* Starts with OPEN. */
2438         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2439     }
2440     else if (paren != '?')              /* Not Conditional */
2441         ret = br;
2442     *flagp |= flags & (SPSTART | HASWIDTH);
2443     lastbr = br;
2444     while (*RExC_parse == '|') {
2445         if (!SIZE_ONLY && RExC_extralen) {
2446             ender = reganode(pRExC_state, LONGJMP,0);
2447             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2448         }
2449         if (SIZE_ONLY)
2450             RExC_extralen += 2;         /* Account for LONGJMP. */
2451         nextchar(pRExC_state);
2452         br = regbranch(pRExC_state, &flags, 0);
2453         
2454         if (br == NULL)
2455             return(NULL);
2456         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2457         lastbr = br;
2458         if (flags&HASWIDTH)
2459             *flagp |= HASWIDTH;
2460         *flagp |= flags&SPSTART;
2461     }
2462
2463     if (have_branch || paren != ':') {
2464         /* Make a closing node, and hook it on the end. */
2465         switch (paren) {
2466         case ':':
2467             ender = reg_node(pRExC_state, TAIL);
2468             break;
2469         case 1:
2470             ender = reganode(pRExC_state, CLOSE, parno);
2471             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2472             Set_Node_Length(ender,1); /* MJD */
2473             break;
2474         case '<':
2475         case ',':
2476         case '=':
2477         case '!':
2478             *flagp &= ~HASWIDTH;
2479             /* FALL THROUGH */
2480         case '>':
2481             ender = reg_node(pRExC_state, SUCCEED);
2482             break;
2483         case 0:
2484             ender = reg_node(pRExC_state, END);
2485             break;
2486         }
2487         regtail(pRExC_state, lastbr, ender);
2488
2489         if (have_branch) {
2490             /* Hook the tails of the branches to the closing node. */
2491             for (br = ret; br != NULL; br = regnext(br)) {
2492                 regoptail(pRExC_state, br, ender);
2493             }
2494         }
2495     }
2496
2497     {
2498         char *p;
2499         static char parens[] = "=!<,>";
2500
2501         if (paren && (p = strchr(parens, paren))) {
2502             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2503             int flag = (p - parens) > 1;
2504
2505             if (paren == '>')
2506                 node = SUSPEND, flag = 0;
2507             reginsert(pRExC_state, node,ret);
2508             Set_Node_Offset(ret, oregcomp_parse);
2509             Set_Node_Length(ret,  RExC_parse - oregcomp_parse + 2);
2510             ret->flags = flag;
2511             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2512         }
2513     }
2514
2515     /* Check for proper termination. */
2516     if (paren) {
2517         RExC_flags = oregflags;
2518         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2519             RExC_parse = oregcomp_parse;
2520             vFAIL("Unmatched (");
2521         }
2522     }
2523     else if (!paren && RExC_parse < RExC_end) {
2524         if (*RExC_parse == ')') {
2525             RExC_parse++;
2526             vFAIL("Unmatched )");
2527         }
2528         else
2529             FAIL("Junk on end of regexp");      /* "Can't happen". */
2530         /* NOTREACHED */
2531     }
2532
2533     return(ret);
2534 }
2535
2536 /*
2537  - regbranch - one alternative of an | operator
2538  *
2539  * Implements the concatenation operator.
2540  */
2541 STATIC regnode *
2542 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2543 {
2544     register regnode *ret;
2545     register regnode *chain = NULL;
2546     register regnode *latest;
2547     I32 flags = 0, c = 0;
2548
2549     if (first)
2550         ret = NULL;
2551     else {
2552         if (!SIZE_ONLY && RExC_extralen)
2553             ret = reganode(pRExC_state, BRANCHJ,0);
2554         else {
2555             ret = reg_node(pRExC_state, BRANCH);
2556             Set_Node_Length(ret, 1);
2557         }
2558     }
2559         
2560     if (!first && SIZE_ONLY)
2561         RExC_extralen += 1;                     /* BRANCHJ */
2562
2563     *flagp = WORST;                     /* Tentatively. */
2564
2565     RExC_parse--;
2566     nextchar(pRExC_state);
2567     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2568         flags &= ~TRYAGAIN;
2569         latest = regpiece(pRExC_state, &flags);
2570         if (latest == NULL) {
2571             if (flags & TRYAGAIN)
2572                 continue;
2573             return(NULL);
2574         }
2575         else if (ret == NULL)
2576             ret = latest;
2577         *flagp |= flags&HASWIDTH;
2578         if (chain == NULL)      /* First piece. */
2579             *flagp |= flags&SPSTART;
2580         else {
2581             RExC_naughty++;
2582             regtail(pRExC_state, chain, latest);
2583         }
2584         chain = latest;
2585         c++;
2586     }
2587     if (chain == NULL) {        /* Loop ran zero times. */
2588         chain = reg_node(pRExC_state, NOTHING);
2589         if (ret == NULL)
2590             ret = chain;
2591     }
2592     if (c == 1) {
2593         *flagp |= flags&SIMPLE;
2594     }
2595
2596     return(ret);
2597 }
2598
2599 /*
2600  - regpiece - something followed by possible [*+?]
2601  *
2602  * Note that the branching code sequences used for ? and the general cases
2603  * of * and + are somewhat optimized:  they use the same NOTHING node as
2604  * both the endmarker for their branch list and the body of the last branch.
2605  * It might seem that this node could be dispensed with entirely, but the
2606  * endmarker role is not redundant.
2607  */
2608 STATIC regnode *
2609 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2610 {
2611     register regnode *ret;
2612     register char op;
2613     register char *next;
2614     I32 flags;
2615     char *origparse = RExC_parse;
2616     char *maxpos;
2617     I32 min;
2618     I32 max = REG_INFTY;
2619     char *parse_start;
2620
2621     ret = regatom(pRExC_state, &flags);
2622     if (ret == NULL) {
2623         if (flags & TRYAGAIN)
2624             *flagp |= TRYAGAIN;
2625         return(NULL);
2626     }
2627
2628     op = *RExC_parse;
2629
2630     if (op == '{' && regcurly(RExC_parse)) {
2631         parse_start = RExC_parse; /* MJD */
2632         next = RExC_parse + 1;
2633         maxpos = Nullch;
2634         while (isDIGIT(*next) || *next == ',') {
2635             if (*next == ',') {
2636                 if (maxpos)
2637                     break;
2638                 else
2639                     maxpos = next;
2640             }
2641             next++;
2642         }
2643         if (*next == '}') {             /* got one */
2644             if (!maxpos)
2645                 maxpos = next;
2646             RExC_parse++;
2647             min = atoi(RExC_parse);
2648             if (*maxpos == ',')
2649                 maxpos++;
2650             else
2651                 maxpos = RExC_parse;
2652             max = atoi(maxpos);
2653             if (!max && *maxpos != '0')
2654                 max = REG_INFTY;                /* meaning "infinity" */
2655             else if (max >= REG_INFTY)
2656                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2657             RExC_parse = next;
2658             nextchar(pRExC_state);
2659
2660         do_curly:
2661             if ((flags&SIMPLE)) {
2662                 RExC_naughty += 2 + RExC_naughty / 2;
2663                 reginsert(pRExC_state, CURLY, ret);
2664                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2665                 Set_Node_Cur_Length(ret);
2666             }
2667             else {
2668                 regnode *w = reg_node(pRExC_state, WHILEM);
2669
2670                 w->flags = 0;
2671                 regtail(pRExC_state, ret, w);
2672                 if (!SIZE_ONLY && RExC_extralen) {
2673                     reginsert(pRExC_state, LONGJMP,ret);
2674                     reginsert(pRExC_state, NOTHING,ret);
2675                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2676                 }
2677                 reginsert(pRExC_state, CURLYX,ret);
2678                                 /* MJD hk */
2679                 Set_Node_Offset(ret, parse_start+1);
2680                 Set_Node_Length(ret, 
2681                                 op == '{' ? (RExC_parse - parse_start) : 1);
2682                 
2683                 if (!SIZE_ONLY && RExC_extralen)
2684                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2685                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2686                 if (SIZE_ONLY)
2687                     RExC_whilem_seen++, RExC_extralen += 3;
2688                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2689             }
2690             ret->flags = 0;
2691
2692             if (min > 0)
2693                 *flagp = WORST;
2694             if (max > 0)
2695                 *flagp |= HASWIDTH;
2696             if (max && max < min)
2697                 vFAIL("Can't do {n,m} with n > m");
2698             if (!SIZE_ONLY) {
2699                 ARG1_SET(ret, (U16)min);
2700                 ARG2_SET(ret, (U16)max);
2701             }
2702
2703             goto nest_check;
2704         }
2705     }
2706
2707     if (!ISMULT1(op)) {
2708         *flagp = flags;
2709         return(ret);
2710     }
2711
2712 #if 0                           /* Now runtime fix should be reliable. */
2713
2714     /* if this is reinstated, don't forget to put this back into perldiag:
2715
2716             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2717
2718            (F) The part of the regexp subject to either the * or + quantifier
2719            could match an empty string. The {#} shows in the regular
2720            expression about where the problem was discovered.
2721
2722     */
2723
2724     if (!(flags&HASWIDTH) && op != '?')
2725       vFAIL("Regexp *+ operand could be empty");
2726 #endif
2727
2728     parse_start = RExC_parse;
2729     nextchar(pRExC_state);
2730
2731     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2732
2733     if (op == '*' && (flags&SIMPLE)) {
2734         reginsert(pRExC_state, STAR, ret);
2735         ret->flags = 0;
2736         RExC_naughty += 4;
2737     }
2738     else if (op == '*') {
2739         min = 0;
2740         goto do_curly;
2741     }
2742     else if (op == '+' && (flags&SIMPLE)) {
2743         reginsert(pRExC_state, PLUS, ret);
2744         ret->flags = 0;
2745         RExC_naughty += 3;
2746     }
2747     else if (op == '+') {
2748         min = 1;
2749         goto do_curly;
2750     }
2751     else if (op == '?') {
2752         min = 0; max = 1;
2753         goto do_curly;
2754     }
2755   nest_check:
2756     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2757         vWARN3(RExC_parse,
2758                "%.*s matches null string many times",
2759                RExC_parse - origparse,
2760                origparse);
2761     }
2762
2763     if (*RExC_parse == '?') {
2764         nextchar(pRExC_state);
2765         reginsert(pRExC_state, MINMOD, ret);
2766         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2767     }
2768     if (ISMULT2(RExC_parse)) {
2769         RExC_parse++;
2770         vFAIL("Nested quantifiers");
2771     }
2772
2773     return(ret);
2774 }
2775
2776 /*
2777  - regatom - the lowest level
2778  *
2779  * Optimization:  gobbles an entire sequence of ordinary characters so that
2780  * it can turn them into a single node, which is smaller to store and
2781  * faster to run.  Backslashed characters are exceptions, each becoming a
2782  * separate node; the code is simpler that way and it's not worth fixing.
2783  *
2784  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2785 STATIC regnode *
2786 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2787 {
2788     register regnode *ret = 0;
2789     I32 flags;
2790     char *parse_start = 0;
2791
2792     *flagp = WORST;             /* Tentatively. */
2793
2794 tryagain:
2795     switch (*RExC_parse) {
2796     case '^':
2797         RExC_seen_zerolen++;
2798         nextchar(pRExC_state);
2799         if (RExC_flags & PMf_MULTILINE)
2800             ret = reg_node(pRExC_state, MBOL);
2801         else if (RExC_flags & PMf_SINGLELINE)
2802             ret = reg_node(pRExC_state, SBOL);
2803         else
2804             ret = reg_node(pRExC_state, BOL);
2805         Set_Node_Length(ret, 1); /* MJD */
2806         break;
2807     case '$':
2808         nextchar(pRExC_state);
2809         if (*RExC_parse)
2810             RExC_seen_zerolen++;
2811         if (RExC_flags & PMf_MULTILINE)
2812             ret = reg_node(pRExC_state, MEOL);
2813         else if (RExC_flags & PMf_SINGLELINE)
2814             ret = reg_node(pRExC_state, SEOL);
2815         else
2816             ret = reg_node(pRExC_state, EOL);
2817         Set_Node_Length(ret, 1); /* MJD */
2818         break;
2819     case '.':
2820         nextchar(pRExC_state);
2821         if (RExC_flags & PMf_SINGLELINE)
2822             ret = reg_node(pRExC_state, SANY);
2823         else
2824             ret = reg_node(pRExC_state, REG_ANY);
2825         *flagp |= HASWIDTH|SIMPLE;
2826         RExC_naughty++;
2827         Set_Node_Length(ret, 1); /* MJD */
2828         break;
2829     case '[':
2830     {
2831         char *oregcomp_parse = ++RExC_parse;
2832         ret = regclass(pRExC_state);
2833         if (*RExC_parse != ']') {
2834             RExC_parse = oregcomp_parse;
2835             vFAIL("Unmatched [");
2836         }
2837         nextchar(pRExC_state);
2838         *flagp |= HASWIDTH|SIMPLE;
2839         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2840         break;
2841     }
2842     case '(':
2843         nextchar(pRExC_state);
2844         ret = reg(pRExC_state, 1, &flags);
2845         if (ret == NULL) {
2846                 if (flags & TRYAGAIN) {
2847                     if (RExC_parse == RExC_end) {
2848                          /* Make parent create an empty node if needed. */
2849                         *flagp |= TRYAGAIN;
2850                         return(NULL);
2851                     }
2852                     goto tryagain;
2853                 }
2854                 return(NULL);
2855         }
2856         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2857         break;
2858     case '|':
2859     case ')':
2860         if (flags & TRYAGAIN) {
2861             *flagp |= TRYAGAIN;
2862             return NULL;
2863         }
2864         vFAIL("Internal urp");
2865                                 /* Supposed to be caught earlier. */
2866         break;
2867     case '{':
2868         if (!regcurly(RExC_parse)) {
2869             RExC_parse++;
2870             goto defchar;
2871         }
2872         /* FALL THROUGH */
2873     case '?':
2874     case '+':
2875     case '*':
2876         RExC_parse++;
2877         vFAIL("Quantifier follows nothing");
2878         break;
2879     case '\\':
2880         switch (*++RExC_parse) {
2881         case 'A':
2882             RExC_seen_zerolen++;
2883             ret = reg_node(pRExC_state, SBOL);
2884             *flagp |= SIMPLE;
2885             nextchar(pRExC_state);
2886             Set_Node_Length(ret, 2); /* MJD */
2887             break;
2888         case 'G':
2889             ret = reg_node(pRExC_state, GPOS);
2890             RExC_seen |= REG_SEEN_GPOS;
2891             *flagp |= SIMPLE;
2892             nextchar(pRExC_state);
2893             Set_Node_Length(ret, 2); /* MJD */
2894             break;
2895         case 'Z':
2896             ret = reg_node(pRExC_state, SEOL);
2897             *flagp |= SIMPLE;
2898             RExC_seen_zerolen++;                /* Do not optimize RE away */
2899             nextchar(pRExC_state);
2900             break;
2901         case 'z':
2902             ret = reg_node(pRExC_state, EOS);
2903             *flagp |= SIMPLE;
2904             RExC_seen_zerolen++;                /* Do not optimize RE away */
2905             nextchar(pRExC_state);
2906             Set_Node_Length(ret, 2); /* MJD */
2907             break;
2908         case 'C':
2909             ret = reg_node(pRExC_state, CANY);
2910             RExC_seen |= REG_SEEN_CANY;
2911             *flagp |= HASWIDTH|SIMPLE;
2912             nextchar(pRExC_state);
2913             Set_Node_Length(ret, 2); /* MJD */
2914             break;
2915         case 'X':
2916             ret = reg_node(pRExC_state, CLUMP);
2917             *flagp |= HASWIDTH;
2918             nextchar(pRExC_state);
2919             Set_Node_Length(ret, 2); /* MJD */
2920             break;
2921         case 'w':
2922             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2923             *flagp |= HASWIDTH|SIMPLE;
2924             nextchar(pRExC_state);
2925             Set_Node_Length(ret, 2); /* MJD */
2926             break;
2927         case 'W':
2928             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2929             *flagp |= HASWIDTH|SIMPLE;
2930             nextchar(pRExC_state);
2931             Set_Node_Length(ret, 2); /* MJD */
2932             break;
2933         case 'b':
2934             RExC_seen_zerolen++;
2935             RExC_seen |= REG_SEEN_LOOKBEHIND;
2936             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2937             *flagp |= SIMPLE;
2938             nextchar(pRExC_state);
2939             Set_Node_Length(ret, 2); /* MJD */
2940             break;
2941         case 'B':
2942             RExC_seen_zerolen++;
2943             RExC_seen |= REG_SEEN_LOOKBEHIND;
2944             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2945             *flagp |= SIMPLE;
2946             nextchar(pRExC_state);
2947             Set_Node_Length(ret, 2); /* MJD */
2948             break;
2949         case 's':
2950             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2951             *flagp |= HASWIDTH|SIMPLE;
2952             nextchar(pRExC_state);
2953             Set_Node_Length(ret, 2); /* MJD */
2954             break;
2955         case 'S':
2956             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2957             *flagp |= HASWIDTH|SIMPLE;
2958             nextchar(pRExC_state);
2959             Set_Node_Length(ret, 2); /* MJD */
2960             break;
2961         case 'd':
2962             ret = reg_node(pRExC_state, DIGIT);
2963             *flagp |= HASWIDTH|SIMPLE;
2964             nextchar(pRExC_state);
2965             Set_Node_Length(ret, 2); /* MJD */
2966             break;
2967         case 'D':
2968             ret = reg_node(pRExC_state, NDIGIT);
2969             *flagp |= HASWIDTH|SIMPLE;
2970             nextchar(pRExC_state);
2971             Set_Node_Length(ret, 2); /* MJD */
2972             break;
2973         case 'p':
2974         case 'P':
2975             {   
2976                 char* oldregxend = RExC_end;
2977                 char* parse_start = RExC_parse - 2;
2978
2979                 if (RExC_parse[1] == '{') {
2980                   /* a lovely hack--pretend we saw [\pX] instead */
2981                     RExC_end = strchr(RExC_parse, '}');
2982                     if (!RExC_end) {
2983                         U8 c = (U8)*RExC_parse;
2984                         RExC_parse += 2;
2985                         RExC_end = oldregxend;
2986                         vFAIL2("Missing right brace on \\%c{}", c);
2987                     }
2988                     RExC_end++;
2989                 }
2990                 else {
2991                     RExC_end = RExC_parse + 2;
2992                     if (RExC_end > oldregxend)
2993                         RExC_end = oldregxend;
2994                 }
2995                 RExC_parse--;
2996
2997                 ret = regclass(pRExC_state);
2998
2999                 RExC_end = oldregxend;
3000                 RExC_parse--;
3001
3002                 Set_Node_Offset(ret, parse_start + 2);
3003                 Set_Node_Cur_Length(ret);
3004                 nextchar(pRExC_state);
3005                 *flagp |= HASWIDTH|SIMPLE;
3006             }
3007             break;
3008         case 'n':
3009         case 'r':
3010         case 't':
3011         case 'f':
3012         case 'e':
3013         case 'a':
3014         case 'x':
3015         case 'c':
3016         case '0':
3017             goto defchar;
3018         case '1': case '2': case '3': case '4':
3019         case '5': case '6': case '7': case '8': case '9':
3020             {
3021                 I32 num = atoi(RExC_parse);
3022
3023                 if (num > 9 && num >= RExC_npar)
3024                     goto defchar;
3025                 else {
3026                     char * parse_start = RExC_parse - 1; /* MJD */
3027                     while (isDIGIT(*RExC_parse))
3028                         RExC_parse++;
3029
3030                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3031                         vFAIL("Reference to nonexistent group");
3032                     RExC_sawback = 1;
3033                     ret = reganode(pRExC_state,
3034                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3035                                    num);
3036                     *flagp |= HASWIDTH;
3037                     
3038                     /* override incorrect value set in reganode MJD */
3039                     Set_Node_Offset(ret, parse_start+1); 
3040                     Set_Node_Cur_Length(ret); /* MJD */
3041                     RExC_parse--;
3042                     nextchar(pRExC_state);
3043                 }
3044             }
3045             break;
3046         case '\0':
3047             if (RExC_parse >= RExC_end)
3048                 FAIL("Trailing \\");
3049             /* FALL THROUGH */
3050         default:
3051             /* Do not generate `unrecognized' warnings here, we fall
3052                back into the quick-grab loop below */
3053             goto defchar;
3054         }
3055         break;
3056
3057     case '#':
3058         if (RExC_flags & PMf_EXTENDED) {
3059             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3060             if (RExC_parse < RExC_end)
3061                 goto tryagain;
3062         }
3063         /* FALL THROUGH */
3064
3065     default: {
3066             register STRLEN len;
3067             register UV ender;
3068             register char *p;
3069             char *oldp, *s;
3070             STRLEN numlen;
3071             STRLEN foldlen;
3072             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3073
3074             parse_start = RExC_parse - 1;
3075
3076             RExC_parse++;
3077
3078         defchar:
3079             ender = 0;
3080             ret = reg_node(pRExC_state,
3081                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3082             s = STRING(ret);
3083             for (len = 0, p = RExC_parse - 1;
3084               len < 127 && p < RExC_end;
3085               len++)
3086             {
3087                 oldp = p;
3088
3089                 if (RExC_flags & PMf_EXTENDED)
3090                     p = regwhite(p, RExC_end);
3091                 switch (*p) {
3092                 case '^':
3093                 case '$':
3094                 case '.':
3095                 case '[':
3096                 case '(':
3097                 case ')':
3098                 case '|':
3099                     goto loopdone;
3100                 case '\\':
3101                     switch (*++p) {
3102                     case 'A':
3103                     case 'C':
3104                     case 'X':
3105                     case 'G':
3106                     case 'Z':
3107                     case 'z':
3108                     case 'w':
3109                     case 'W':
3110                     case 'b':
3111                     case 'B':
3112                     case 's':
3113                     case 'S':
3114                     case 'd':
3115                     case 'D':
3116                     case 'p':
3117                     case 'P':
3118                         --p;
3119                         goto loopdone;
3120                     case 'n':
3121                         ender = '\n';
3122                         p++;
3123                         break;
3124                     case 'r':
3125                         ender = '\r';
3126                         p++;
3127                         break;
3128                     case 't':
3129                         ender = '\t';
3130                         p++;
3131                         break;
3132                     case 'f':
3133                         ender = '\f';
3134                         p++;
3135                         break;
3136                     case 'e':
3137                           ender = ASCII_TO_NATIVE('\033');
3138                         p++;
3139                         break;
3140                     case 'a':
3141                           ender = ASCII_TO_NATIVE('\007');
3142                         p++;
3143                         break;
3144                     case 'x':
3145                         if (*++p == '{') {
3146                             char* e = strchr(p, '}');
3147         
3148                             if (!e) {
3149                                 RExC_parse = p + 1;
3150                                 vFAIL("Missing right brace on \\x{}");
3151                             }
3152                             else {
3153                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3154                                     | PERL_SCAN_DISALLOW_PREFIX;
3155                                 numlen = e - p - 1;
3156                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3157                                 if (ender > 0xff)
3158                                     RExC_utf8 = 1;
3159                                 /* numlen is generous */
3160                                 if (numlen + len >= 127) {
3161                                     p--;
3162                                     goto loopdone;
3163                                 }
3164                                 p = e + 1;
3165                             }
3166                         }
3167                         else {
3168                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3169                             numlen = 2;
3170                             ender = grok_hex(p, &numlen, &flags, NULL);
3171                             p += numlen;
3172                         }
3173                         break;
3174                     case 'c':
3175                         p++;
3176                         ender = UCHARAT(p++);
3177                         ender = toCTRL(ender);
3178                         break;
3179                     case '0': case '1': case '2': case '3':case '4':
3180                     case '5': case '6': case '7': case '8':case '9':
3181                         if (*p == '0' ||
3182                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3183                             I32 flags = 0;
3184                             numlen = 3;
3185                             ender = grok_oct(p, &numlen, &flags, NULL);
3186                             p += numlen;
3187                         }
3188                         else {
3189                             --p;
3190                             goto loopdone;
3191                         }
3192                         break;
3193                     case '\0':
3194                         if (p >= RExC_end)
3195                             FAIL("Trailing \\");
3196                         /* FALL THROUGH */
3197                     default:
3198                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3199                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3200                         goto normal_default;
3201                     }
3202                     break;
3203                 default:
3204                   normal_default:
3205                     if (UTF8_IS_START(*p) && UTF) {
3206                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3207                                                &numlen, 0);
3208                         p += numlen;
3209                     }
3210                     else
3211                         ender = *p++;
3212                     break;
3213                 }
3214                 if (RExC_flags & PMf_EXTENDED)
3215                     p = regwhite(p, RExC_end);
3216                 if (UTF && FOLD) {
3217                     /* Prime the casefolded buffer. */
3218                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3219                 }
3220                 if (ISMULT2(p)) { /* Back off on ?+*. */
3221                     if (len)
3222                         p = oldp;
3223                     else if (UTF) {
3224                          STRLEN unilen;
3225
3226                          if (FOLD) {
3227                               /* Emit all the Unicode characters. */
3228                               for (foldbuf = tmpbuf;
3229                                    foldlen;
3230                                    foldlen -= numlen) {
3231                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3232                                    if (numlen > 0) {
3233                                         reguni(pRExC_state, ender, s, &unilen);
3234                                         s       += unilen;
3235                                         len     += unilen;
3236                                         /* In EBCDIC the numlen
3237                                          * and unilen can differ. */
3238                                         foldbuf += numlen;
3239                                         if (numlen >= foldlen)
3240                                              break;
3241                                    }
3242                                    else
3243                                         break; /* "Can't happen." */
3244                               }
3245                          }
3246                          else {
3247                               reguni(pRExC_state, ender, s, &unilen);
3248                               if (unilen > 0) {
3249                                    s   += unilen;
3250                                    len += unilen;
3251                               }
3252                          }
3253                     }
3254                     else {
3255                         len++;
3256                         REGC((char)ender, s++);
3257                     }
3258                     break;
3259                 }
3260                 if (UTF) {
3261                      STRLEN unilen;
3262
3263                      if (FOLD) {
3264                           /* Emit all the Unicode characters. */
3265                           for (foldbuf = tmpbuf;
3266                                foldlen;
3267                                foldlen -= numlen) {
3268                                ender = utf8_to_uvchr(foldbuf, &numlen);
3269                                if (numlen > 0) {
3270                                     reguni(pRExC_state, ender, s, &unilen);
3271                                     len     += unilen;
3272                                     s       += unilen;
3273                                     /* In EBCDIC the numlen
3274                                      * and unilen can differ. */
3275                                     foldbuf += numlen;
3276                                     if (numlen >= foldlen)
3277                                          break;
3278                                }
3279                                else
3280                                     break;
3281                           }
3282                      }
3283                      else {
3284                           reguni(pRExC_state, ender, s, &unilen);
3285                           if (unilen > 0) {
3286                                s   += unilen;
3287                                len += unilen;
3288                           }
3289                      }
3290                      len--;
3291                 }
3292                 else
3293                     REGC((char)ender, s++);
3294             }
3295         loopdone:
3296             RExC_parse = p - 1;
3297             Set_Node_Cur_Length(ret); /* MJD */
3298             nextchar(pRExC_state);
3299             {
3300                 /* len is STRLEN which is unsigned, need to copy to signed */
3301                 IV iv = len;
3302                 if (iv < 0)
3303                     vFAIL("Internal disaster");
3304             }
3305             if (len > 0)
3306                 *flagp |= HASWIDTH;
3307             if (len == 1)
3308                 *flagp |= SIMPLE;
3309             if (!SIZE_ONLY)
3310                 STR_LEN(ret) = len;
3311             if (SIZE_ONLY)
3312                 RExC_size += STR_SZ(len);
3313             else
3314                 RExC_emit += STR_SZ(len);
3315         }
3316         break;
3317     }
3318
3319     /* If the encoding pragma is in effect recode the text of
3320      * any EXACT-kind nodes. */
3321     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3322         STRLEN oldlen = STR_LEN(ret);
3323         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3324
3325         if (RExC_utf8)
3326             SvUTF8_on(sv);
3327         if (sv_utf8_downgrade(sv, TRUE)) {
3328             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3329             STRLEN newlen = SvCUR(sv);
3330
3331             if (SvUTF8(sv))
3332                 RExC_utf8 = 1;
3333             if (!SIZE_ONLY) {
3334                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3335                                       (int)oldlen, STRING(ret),
3336                                       (int)newlen, s));
3337                 Copy(s, STRING(ret), newlen, char);
3338                 STR_LEN(ret) += newlen - oldlen;
3339                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3340             } else
3341                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3342         }
3343     }
3344
3345     return(ret);
3346 }
3347
3348 STATIC char *
3349 S_regwhite(pTHX_ char *p, char *e)
3350 {
3351     while (p < e) {
3352         if (isSPACE(*p))
3353             ++p;
3354         else if (*p == '#') {
3355             do {
3356                 p++;
3357             } while (p < e && *p != '\n');
3358         }
3359         else
3360             break;
3361     }
3362     return p;
3363 }
3364
3365 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3366    Character classes ([:foo:]) can also be negated ([:^foo:]).
3367    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3368    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3369    but trigger failures because they are currently unimplemented. */
3370
3371 #define POSIXCC_DONE(c)   ((c) == ':')
3372 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3373 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3374
3375 STATIC I32
3376 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3377 {
3378     char *posixcc = 0;
3379     I32 namedclass = OOB_NAMEDCLASS;
3380
3381     if (value == '[' && RExC_parse + 1 < RExC_end &&
3382         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3383         POSIXCC(UCHARAT(RExC_parse))) {
3384         char  c = UCHARAT(RExC_parse);
3385         char* s = RExC_parse++;
3386         
3387         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3388             RExC_parse++;
3389         if (RExC_parse == RExC_end)
3390             /* Grandfather lone [:, [=, [. */
3391             RExC_parse = s;
3392         else {
3393             char* t = RExC_parse++; /* skip over the c */
3394
3395             if (UCHARAT(RExC_parse) == ']') {
3396                 RExC_parse++; /* skip over the ending ] */
3397                 posixcc = s + 1;
3398                 if (*s == ':') {
3399                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3400                     I32 skip = 5; /* the most common skip */
3401
3402                     switch (*posixcc) {
3403                     case 'a':
3404                         if (strnEQ(posixcc, "alnum", 5))
3405                             namedclass =
3406                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3407                         else if (strnEQ(posixcc, "alpha", 5))
3408                             namedclass =
3409                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3410                         else if (strnEQ(posixcc, "ascii", 5))
3411                             namedclass =
3412                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3413                         break;
3414                     case 'b':
3415                         if (strnEQ(posixcc, "blank", 5))
3416                             namedclass =
3417                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3418                         break;
3419                     case 'c':
3420                         if (strnEQ(posixcc, "cntrl", 5))
3421                             namedclass =
3422                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3423                         break;
3424                     case 'd':
3425                         if (strnEQ(posixcc, "digit", 5))
3426                             namedclass =
3427                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3428                         break;
3429                     case 'g':
3430                         if (strnEQ(posixcc, "graph", 5))
3431                             namedclass =
3432                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3433                         break;
3434                     case 'l':
3435                         if (strnEQ(posixcc, "lower", 5))
3436                             namedclass =
3437                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3438                         break;
3439                     case 'p':
3440                         if (strnEQ(posixcc, "print", 5))
3441                             namedclass =
3442                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3443                         else if (strnEQ(posixcc, "punct", 5))
3444                             namedclass =
3445                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3446                         break;
3447                     case 's':
3448                         if (strnEQ(posixcc, "space", 5))
3449                             namedclass =
3450                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3451                         break;
3452                     case 'u':
3453                         if (strnEQ(posixcc, "upper", 5))
3454                             namedclass =
3455                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3456                         break;
3457                     case 'w': /* this is not POSIX, this is the Perl \w */
3458                         if (strnEQ(posixcc, "word", 4)) {
3459                             namedclass =
3460                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3461                             skip = 4;
3462                         }
3463                         break;
3464                     case 'x':
3465                         if (strnEQ(posixcc, "xdigit", 6)) {
3466                             namedclass =
3467                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3468                             skip = 6;
3469                         }
3470                         break;
3471                     }
3472                     if (namedclass == OOB_NAMEDCLASS ||
3473                         posixcc[skip] != ':' ||
3474                         posixcc[skip+1] != ']')
3475                     {
3476                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3477                                       t - s - 1, s + 1);
3478                     }
3479                 } else if (!SIZE_ONLY) {
3480                     /* [[=foo=]] and [[.foo.]] are still future. */
3481
3482                     /* adjust RExC_parse so the warning shows after
3483                        the class closes */
3484                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3485                         RExC_parse++;
3486                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3487                 }
3488             } else {
3489                 /* Maternal grandfather:
3490                  * "[:" ending in ":" but not in ":]" */
3491                 RExC_parse = s;
3492             }
3493         }
3494     }
3495
3496     return namedclass;
3497 }
3498
3499 STATIC void
3500 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3501 {
3502     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3503         char *s = RExC_parse;
3504         char  c = *s++;
3505
3506         while(*s && isALNUM(*s))
3507             s++;
3508         if (*s && c == *s && s[1] == ']') {
3509             if (ckWARN(WARN_REGEXP))
3510                 vWARN3(s+2,
3511                         "POSIX syntax [%c %c] belongs inside character classes",
3512                         c, c);
3513
3514             /* [[=foo=]] and [[.foo.]] are still future. */
3515             if (POSIXCC_NOTYET(c)) {
3516                 /* adjust RExC_parse so the error shows after
3517                    the class closes */
3518                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3519                     ;
3520                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3521             }
3522         }
3523     }
3524 }
3525
3526 STATIC regnode *
3527 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3528 {
3529     register UV value;
3530     register UV nextvalue;
3531     register IV prevvalue = OOB_UNICODE;
3532     register IV range = 0;
3533     register regnode *ret;
3534     STRLEN numlen;
3535     IV namedclass;
3536     char *rangebegin = 0;
3537     bool need_class = 0;
3538     SV *listsv = Nullsv;
3539     register char *e;
3540     UV n;
3541     bool optimize_invert   = TRUE;
3542     AV* unicode_alternate  = 0;
3543 #ifdef EBCDIC
3544     UV literal_endpoint = 0;
3545 #endif
3546
3547     ret = reganode(pRExC_state, ANYOF, 0);
3548
3549     if (!SIZE_ONLY)
3550         ANYOF_FLAGS(ret) = 0;
3551
3552     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3553         RExC_naughty++;
3554         RExC_parse++;
3555         if (!SIZE_ONLY)
3556             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3557     }
3558
3559     if (SIZE_ONLY)
3560         RExC_size += ANYOF_SKIP;
3561     else {
3562         RExC_emit += ANYOF_SKIP;
3563         if (FOLD)
3564             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3565         if (LOC)
3566             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3567         ANYOF_BITMAP_ZERO(ret);
3568         listsv = newSVpvn("# comment\n", 10);
3569     }
3570
3571     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3572
3573     if (!SIZE_ONLY && POSIXCC(nextvalue))
3574         checkposixcc(pRExC_state);
3575
3576     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3577     if (UCHARAT(RExC_parse) == ']')
3578         goto charclassloop;
3579
3580     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3581
3582     charclassloop:
3583
3584         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3585
3586         if (!range)
3587             rangebegin = RExC_parse;
3588         if (UTF) {
3589             value = utf8n_to_uvchr((U8*)RExC_parse,
3590                                    RExC_end - RExC_parse,
3591                                    &numlen, 0);
3592             RExC_parse += numlen;
3593         }
3594         else
3595             value = UCHARAT(RExC_parse++);
3596         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3597         if (value == '[' && POSIXCC(nextvalue))
3598             namedclass = regpposixcc(pRExC_state, value);
3599         else if (value == '\\') {
3600             if (UTF) {
3601                 value = utf8n_to_uvchr((U8*)RExC_parse,
3602                                    RExC_end - RExC_parse,
3603                                    &numlen, 0);
3604                 RExC_parse += numlen;
3605             }
3606             else
3607                 value = UCHARAT(RExC_parse++);
3608             /* Some compilers cannot handle switching on 64-bit integer
3609              * values, therefore value cannot be an UV.  Yes, this will
3610              * be a problem later if we want switch on Unicode.
3611              * A similar issue a little bit later when switching on
3612              * namedclass. --jhi */
3613             switch ((I32)value) {
3614             case 'w':   namedclass = ANYOF_ALNUM;       break;
3615             case 'W':   namedclass = ANYOF_NALNUM;      break;
3616             case 's':   namedclass = ANYOF_SPACE;       break;
3617             case 'S':   namedclass = ANYOF_NSPACE;      break;
3618             case 'd':   namedclass = ANYOF_DIGIT;       break;
3619             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3620             case 'p':
3621             case 'P':
3622                 if (RExC_parse >= RExC_end)
3623                     vFAIL2("Empty \\%c{}", (U8)value);
3624                 if (*RExC_parse == '{') {
3625                     U8 c = (U8)value;
3626                     e = strchr(RExC_parse++, '}');
3627                     if (!e)
3628                         vFAIL2("Missing right brace on \\%c{}", c);
3629                     while (isSPACE(UCHARAT(RExC_parse)))
3630                         RExC_parse++;
3631                     if (e == RExC_parse)
3632                         vFAIL2("Empty \\%c{}", c);
3633                     n = e - RExC_parse;
3634                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3635                         n--;
3636                 }
3637                 else {
3638                     e = RExC_parse;
3639                     n = 1;
3640                 }
3641                 if (!SIZE_ONLY) {
3642                     if (UCHARAT(RExC_parse) == '^') {
3643                          RExC_parse++;
3644                          n--;
3645                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3646                          while (isSPACE(UCHARAT(RExC_parse))) {
3647                               RExC_parse++;
3648                               n--;
3649                          }
3650                     }
3651                     if (value == 'p')
3652                          Perl_sv_catpvf(aTHX_ listsv,
3653                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3654                     else
3655                          Perl_sv_catpvf(aTHX_ listsv,
3656                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3657                 }
3658                 RExC_parse = e + 1;
3659                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3660                 continue;
3661             case 'n':   value = '\n';                   break;
3662             case 'r':   value = '\r';                   break;
3663             case 't':   value = '\t';                   break;
3664             case 'f':   value = '\f';                   break;
3665             case 'b':   value = '\b';                   break;
3666             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3667             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3668             case 'x':
3669                 if (*RExC_parse == '{') {
3670                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3671                         | PERL_SCAN_DISALLOW_PREFIX;
3672                     e = strchr(RExC_parse++, '}');
3673                     if (!e)
3674                         vFAIL("Missing right brace on \\x{}");
3675
3676                     numlen = e - RExC_parse;
3677                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3678                     RExC_parse = e + 1;
3679                 }
3680                 else {
3681                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3682                     numlen = 2;
3683                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3684                     RExC_parse += numlen;
3685                 }
3686                 break;
3687             case 'c':
3688                 value = UCHARAT(RExC_parse++);
3689                 value = toCTRL(value);
3690                 break;
3691             case '0': case '1': case '2': case '3': case '4':
3692             case '5': case '6': case '7': case '8': case '9':
3693             {
3694                 I32 flags = 0;
3695                 numlen = 3;
3696                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3697                 RExC_parse += numlen;
3698                 break;
3699             }
3700             default:
3701                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3702                     vWARN2(RExC_parse,
3703                            "Unrecognized escape \\%c in character class passed through",
3704                            (int)value);
3705                 break;
3706             }
3707         } /* end of \blah */
3708 #ifdef EBCDIC
3709         else
3710             literal_endpoint++;
3711 #endif
3712
3713         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3714
3715             if (!SIZE_ONLY && !need_class)
3716                 ANYOF_CLASS_ZERO(ret);
3717
3718             need_class = 1;
3719
3720             /* a bad range like a-\d, a-[:digit:] ? */
3721             if (range) {
3722                 if (!SIZE_ONLY) {
3723                     if (ckWARN(WARN_REGEXP))
3724                         vWARN4(RExC_parse,
3725                                "False [] range \"%*.*s\"",
3726                                RExC_parse - rangebegin,
3727                                RExC_parse - rangebegin,
3728                                rangebegin);
3729                     if (prevvalue < 256) {
3730                         ANYOF_BITMAP_SET(ret, prevvalue);
3731                         ANYOF_BITMAP_SET(ret, '-');
3732                     }
3733                     else {
3734                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3735                         Perl_sv_catpvf(aTHX_ listsv,
3736                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3737                     }
3738                 }
3739
3740                 range = 0; /* this was not a true range */
3741             }
3742
3743             if (!SIZE_ONLY) {
3744                 if (namedclass > OOB_NAMEDCLASS)
3745                     optimize_invert = FALSE;
3746                 /* Possible truncation here but in some 64-bit environments
3747                  * the compiler gets heartburn about switch on 64-bit values.
3748                  * A similar issue a little earlier when switching on value.
3749                  * --jhi */
3750                 switch ((I32)namedclass) {
3751                 case ANYOF_ALNUM:
3752                     if (LOC)
3753                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3754                     else {
3755                         for (value = 0; value < 256; value++)
3756                             if (isALNUM(value))
3757                                 ANYOF_BITMAP_SET(ret, value);
3758                     }
3759                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3760                     break;
3761                 case ANYOF_NALNUM:
3762                     if (LOC)
3763                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3764                     else {
3765                         for (value = 0; value < 256; value++)
3766                             if (!isALNUM(value))
3767                                 ANYOF_BITMAP_SET(ret, value);
3768                     }
3769                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3770                     break;
3771                 case ANYOF_ALNUMC:
3772                     if (LOC)
3773                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3774                     else {
3775                         for (value = 0; value < 256; value++)
3776                             if (isALNUMC(value))
3777                                 ANYOF_BITMAP_SET(ret, value);
3778                     }
3779                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3780                     break;
3781                 case ANYOF_NALNUMC:
3782                     if (LOC)
3783                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3784                     else {
3785                         for (value = 0; value < 256; value++)
3786                             if (!isALNUMC(value))
3787                                 ANYOF_BITMAP_SET(ret, value);
3788                     }
3789                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3790                     break;
3791                 case ANYOF_ALPHA:
3792                     if (LOC)
3793                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3794                     else {
3795                         for (value = 0; value < 256; value++)
3796                             if (isALPHA(value))
3797                                 ANYOF_BITMAP_SET(ret, value);
3798                     }
3799                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3800                     break;
3801                 case ANYOF_NALPHA:
3802                     if (LOC)
3803                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3804                     else {
3805                         for (value = 0; value < 256; value++)
3806                             if (!isALPHA(value))
3807                                 ANYOF_BITMAP_SET(ret, value);
3808                     }
3809                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3810                     break;
3811                 case ANYOF_ASCII:
3812                     if (LOC)
3813                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3814                     else {
3815 #ifndef EBCDIC
3816                         for (value = 0; value < 128; value++)
3817                             ANYOF_BITMAP_SET(ret, value);
3818 #else  /* EBCDIC */
3819                         for (value = 0; value < 256; value++) {
3820                             if (isASCII(value))
3821                                 ANYOF_BITMAP_SET(ret, value);
3822                         }
3823 #endif /* EBCDIC */
3824                     }
3825                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3826                     break;
3827                 case ANYOF_NASCII:
3828                     if (LOC)
3829                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3830                     else {
3831 #ifndef EBCDIC
3832                         for (value = 128; value < 256; value++)
3833                             ANYOF_BITMAP_SET(ret, value);
3834 #else  /* EBCDIC */
3835                         for (value = 0; value < 256; value++) {
3836                             if (!isASCII(value))
3837                                 ANYOF_BITMAP_SET(ret, value);
3838                         }
3839 #endif /* EBCDIC */
3840                     }
3841                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3842                     break;
3843                 case ANYOF_BLANK:
3844                     if (LOC)
3845                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3846                     else {
3847                         for (value = 0; value < 256; value++)
3848                             if (isBLANK(value))
3849                                 ANYOF_BITMAP_SET(ret, value);
3850                     }
3851                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3852                     break;
3853                 case ANYOF_NBLANK:
3854                     if (LOC)
3855                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3856                     else {
3857                         for (value = 0; value < 256; value++)
3858                             if (!isBLANK(value))
3859                                 ANYOF_BITMAP_SET(ret, value);
3860                     }
3861                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3862                     break;
3863                 case ANYOF_CNTRL:
3864                     if (LOC)
3865                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3866                     else {
3867                         for (value = 0; value < 256; value++)
3868                             if (isCNTRL(value))
3869                                 ANYOF_BITMAP_SET(ret, value);
3870                     }
3871                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3872                     break;
3873                 case ANYOF_NCNTRL:
3874                     if (LOC)
3875                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3876                     else {
3877                         for (value = 0; value < 256; value++)
3878                             if (!isCNTRL(value))
3879                                 ANYOF_BITMAP_SET(ret, value);
3880                     }
3881                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3882                     break;
3883                 case ANYOF_DIGIT:
3884                     if (LOC)
3885                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3886                     else {
3887                         /* consecutive digits assumed */
3888                         for (value = '0'; value <= '9'; value++)
3889                             ANYOF_BITMAP_SET(ret, value);
3890                     }
3891                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3892                     break;
3893                 case ANYOF_NDIGIT:
3894                     if (LOC)
3895                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3896                     else {
3897                         /* consecutive digits assumed */
3898                         for (value = 0; value < '0'; value++)
3899                             ANYOF_BITMAP_SET(ret, value);
3900                         for (value = '9' + 1; value < 256; value++)
3901                             ANYOF_BITMAP_SET(ret, value);
3902                     }
3903                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3904                     break;
3905                 case ANYOF_GRAPH:
3906                     if (LOC)
3907                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3908                     else {
3909                         for (value = 0; value < 256; value++)
3910                             if (isGRAPH(value))
3911                                 ANYOF_BITMAP_SET(ret, value);
3912                     }
3913                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3914                     break;
3915                 case ANYOF_NGRAPH:
3916                     if (LOC)
3917                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3918                     else {
3919                         for (value = 0; value < 256; value++)
3920                             if (!isGRAPH(value))
3921                                 ANYOF_BITMAP_SET(ret, value);
3922                     }
3923                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3924                     break;
3925                 case ANYOF_LOWER:
3926                     if (LOC)
3927                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3928                     else {
3929                         for (value = 0; value < 256; value++)
3930                             if (isLOWER(value))
3931                                 ANYOF_BITMAP_SET(ret, value);
3932                     }
3933                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3934                     break;
3935                 case ANYOF_NLOWER:
3936                     if (LOC)
3937                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3938                     else {
3939                         for (value = 0; value < 256; value++)
3940                             if (!isLOWER(value))
3941                                 ANYOF_BITMAP_SET(ret, value);
3942                     }
3943                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3944                     break;
3945                 case ANYOF_PRINT:
3946                     if (LOC)
3947                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3948                     else {
3949                         for (value = 0; value < 256; value++)
3950                             if (isPRINT(value))
3951                                 ANYOF_BITMAP_SET(ret, value);
3952                     }
3953                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3954                     break;
3955                 case ANYOF_NPRINT:
3956                     if (LOC)
3957                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3958                     else {
3959                         for (value = 0; value < 256; value++)
3960                             if (!isPRINT(value))
3961                                 ANYOF_BITMAP_SET(ret, value);
3962                     }
3963                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3964                     break;
3965                 case ANYOF_PSXSPC:
3966                     if (LOC)
3967                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3968                     else {
3969                         for (value = 0; value < 256; value++)
3970                             if (isPSXSPC(value))
3971                                 ANYOF_BITMAP_SET(ret, value);
3972                     }
3973                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3974                     break;
3975                 case ANYOF_NPSXSPC:
3976                     if (LOC)
3977                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3978                     else {
3979                         for (value = 0; value < 256; value++)
3980                             if (!isPSXSPC(value))
3981                                 ANYOF_BITMAP_SET(ret, value);
3982                     }
3983                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3984                     break;
3985                 case ANYOF_PUNCT:
3986                     if (LOC)
3987                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3988                     else {
3989                         for (value = 0; value < 256; value++)
3990                             if (isPUNCT(value))
3991                                 ANYOF_BITMAP_SET(ret, value);
3992                     }
3993                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3994                     break;
3995                 case ANYOF_NPUNCT:
3996                     if (LOC)
3997                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3998                     else {
3999                         for (value = 0; value < 256; value++)
4000                             if (!isPUNCT(value))
4001                                 ANYOF_BITMAP_SET(ret, value);
4002                     }
4003                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4004                     break;
4005                 case ANYOF_SPACE:
4006                     if (LOC)
4007                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4008                     else {
4009                         for (value = 0; value < 256; value++)
4010                             if (isSPACE(value))
4011                                 ANYOF_BITMAP_SET(ret, value);
4012                     }
4013                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4014                     break;
4015                 case ANYOF_NSPACE:
4016                     if (LOC)
4017                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4018                     else {
4019                         for (value = 0; value < 256; value++)
4020                             if (!isSPACE(value))
4021                                 ANYOF_BITMAP_SET(ret, value);
4022                     }
4023                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4024                     break;
4025                 case ANYOF_UPPER:
4026                     if (LOC)
4027                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4028                     else {
4029                         for (value = 0; value < 256; value++)
4030                             if (isUPPER(value))
4031                                 ANYOF_BITMAP_SET(ret, value);
4032                     }
4033                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4034                     break;
4035                 case ANYOF_NUPPER:
4036                     if (LOC)
4037                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4038                     else {
4039                         for (value = 0; value < 256; value++)
4040                             if (!isUPPER(value))
4041                                 ANYOF_BITMAP_SET(ret, value);
4042                     }
4043                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4044                     break;
4045                 case ANYOF_XDIGIT:
4046                     if (LOC)
4047                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4048                     else {
4049                         for (value = 0; value < 256; value++)
4050                             if (isXDIGIT(value))
4051                                 ANYOF_BITMAP_SET(ret, value);
4052                     }
4053                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4054                     break;
4055                 case ANYOF_NXDIGIT:
4056                     if (LOC)
4057                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4058                     else {
4059                         for (value = 0; value < 256; value++)
4060                             if (!isXDIGIT(value))
4061                                 ANYOF_BITMAP_SET(ret, value);
4062                     }
4063                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4064                     break;
4065                 default:
4066                     vFAIL("Invalid [::] class");
4067                     break;
4068                 }
4069                 if (LOC)
4070                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4071                 continue;
4072             }
4073         } /* end of namedclass \blah */
4074
4075         if (range) {
4076             if (prevvalue > (IV)value) /* b-a */ {
4077                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4078                               RExC_parse - rangebegin,
4079                               RExC_parse - rangebegin,
4080                               rangebegin);
4081                 range = 0; /* not a valid range */
4082             }
4083         }
4084         else {
4085             prevvalue = value; /* save the beginning of the range */
4086             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4087                 RExC_parse[1] != ']') {
4088                 RExC_parse++;
4089
4090                 /* a bad range like \w-, [:word:]- ? */
4091                 if (namedclass > OOB_NAMEDCLASS) {
4092                     if (ckWARN(WARN_REGEXP))
4093                         vWARN4(RExC_parse,
4094                                "False [] range \"%*.*s\"",
4095                                RExC_parse - rangebegin,
4096                                RExC_parse - rangebegin,
4097                                rangebegin);
4098                     if (!SIZE_ONLY)
4099                         ANYOF_BITMAP_SET(ret, '-');
4100                 } else
4101                     range = 1;  /* yeah, it's a range! */
4102                 continue;       /* but do it the next time */
4103             }
4104         }
4105
4106         /* now is the next time */
4107         if (!SIZE_ONLY) {
4108             IV i;
4109
4110             if (prevvalue < 256) {
4111                 IV ceilvalue = value < 256 ? value : 255;
4112
4113 #ifdef EBCDIC
4114                 /* In EBCDIC [\x89-\x91] should include
4115                  * the \x8e but [i-j] should not. */
4116                 if (literal_endpoint == 2 &&
4117                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4118                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4119                 {
4120                     if (isLOWER(prevvalue)) {
4121                         for (i = prevvalue; i <= ceilvalue; i++)
4122                             if (isLOWER(i))
4123                                 ANYOF_BITMAP_SET(ret, i);
4124                     } else {
4125                         for (i = prevvalue; i <= ceilvalue; i++)
4126                             if (isUPPER(i))
4127                                 ANYOF_BITMAP_SET(ret, i);
4128                     }
4129                 }
4130                 else
4131 #endif
4132                       for (i = prevvalue; i <= ceilvalue; i++)
4133                           ANYOF_BITMAP_SET(ret, i);
4134           }
4135           if (value > 255 || UTF) {
4136                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4137                 UV natvalue      = NATIVE_TO_UNI(value);
4138
4139                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4140                 if (prevnatvalue < natvalue) { /* what about > ? */
4141                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4142                                    prevnatvalue, natvalue);
4143                 }
4144                 else if (prevnatvalue == natvalue) {
4145                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4146                     if (FOLD) {
4147                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4148                          STRLEN foldlen;
4149                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4150
4151                          /* If folding and foldable and a single
4152                           * character, insert also the folded version
4153                           * to the charclass. */
4154                          if (f != value) {
4155                               if (foldlen == (STRLEN)UNISKIP(f))
4156                                   Perl_sv_catpvf(aTHX_ listsv,
4157                                                  "%04"UVxf"\n", f);
4158                               else {
4159                                   /* Any multicharacter foldings
4160                                    * require the following transform:
4161                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4162                                    * where E folds into "pq" and F folds
4163                                    * into "rst", all other characters
4164                                    * fold to single characters.  We save
4165                                    * away these multicharacter foldings,
4166                                    * to be later saved as part of the
4167                                    * additional "s" data. */
4168                                   SV *sv;
4169
4170                                   if (!unicode_alternate)
4171                                       unicode_alternate = newAV();
4172                                   sv = newSVpvn((char*)foldbuf, foldlen);
4173                                   SvUTF8_on(sv);
4174                                   av_push(unicode_alternate, sv);
4175                               }
4176                          }
4177
4178                          /* If folding and the value is one of the Greek
4179                           * sigmas insert a few more sigmas to make the
4180                           * folding rules of the sigmas to work right.
4181                           * Note that not all the possible combinations
4182                           * are handled here: some of them are handled
4183                           * by the standard folding rules, and some of
4184                           * them (literal or EXACTF cases) are handled
4185                           * during runtime in regexec.c:S_find_byclass(). */
4186                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4187                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4188                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4189                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4190                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4191                          }
4192                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4193                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4194                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4195                     }
4196                 }
4197             }
4198 #ifdef EBCDIC
4199             literal_endpoint = 0;
4200 #endif
4201         }
4202
4203         range = 0; /* this range (if it was one) is done now */
4204     }
4205
4206     if (need_class) {
4207         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4208         if (SIZE_ONLY)
4209             RExC_size += ANYOF_CLASS_ADD_SKIP;
4210         else
4211             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4212     }
4213
4214     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4215     if (!SIZE_ONLY &&
4216          /* If the only flag is folding (plus possibly inversion). */
4217         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4218        ) {
4219         for (value = 0; value < 256; ++value) {
4220             if (ANYOF_BITMAP_TEST(ret, value)) {
4221                 UV fold = PL_fold[value];
4222
4223                 if (fold != value)
4224                     ANYOF_BITMAP_SET(ret, fold);
4225             }
4226         }
4227         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4228     }
4229
4230     /* optimize inverted simple patterns (e.g. [^a-z]) */
4231     if (!SIZE_ONLY && optimize_invert &&
4232         /* If the only flag is inversion. */
4233         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4234         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4235             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4236         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4237     }
4238
4239     if (!SIZE_ONLY) {
4240         AV *av = newAV();
4241         SV *rv;
4242
4243         /* The 0th element stores the character class description
4244          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4245          * to initialize the appropriate swash (which gets stored in
4246          * the 1st element), and also useful for dumping the regnode.
4247          * The 2nd element stores the multicharacter foldings,
4248          * used later (regexec.c:S_reginclass()). */
4249         av_store(av, 0, listsv);
4250         av_store(av, 1, NULL);
4251         av_store(av, 2, (SV*)unicode_alternate);
4252         rv = newRV_noinc((SV*)av);
4253         n = add_data(pRExC_state, 1, "s");
4254         RExC_rx->data->data[n] = (void*)rv;
4255         ARG_SET(ret, n);
4256     }
4257
4258     return ret;
4259 }
4260
4261 STATIC char*
4262 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4263 {
4264     char* retval = RExC_parse++;
4265
4266     for (;;) {
4267         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4268                 RExC_parse[2] == '#') {
4269             while (*RExC_parse && *RExC_parse != ')')
4270                 RExC_parse++;
4271             RExC_parse++;
4272             continue;
4273         }
4274         if (RExC_flags & PMf_EXTENDED) {
4275             if (isSPACE(*RExC_parse)) {
4276                 RExC_parse++;
4277                 continue;
4278             }
4279             else if (*RExC_parse == '#') {
4280                 while (*RExC_parse && *RExC_parse != '\n')
4281                     RExC_parse++;
4282                 RExC_parse++;
4283                 continue;
4284             }
4285         }
4286         return retval;
4287     }
4288 }
4289
4290 /*
4291 - reg_node - emit a node
4292 */
4293 STATIC regnode *                        /* Location. */
4294 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4295 {
4296     register regnode *ret;
4297     register regnode *ptr;
4298
4299     ret = RExC_emit;
4300     if (SIZE_ONLY) {
4301         SIZE_ALIGN(RExC_size);
4302         RExC_size += 1;
4303         return(ret);
4304     }
4305
4306     NODE_ALIGN_FILL(ret);
4307     ptr = ret;
4308     FILL_ADVANCE_NODE(ptr, op);
4309     if (RExC_offsets) {         /* MJD */
4310         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4311               "reg_node", __LINE__, 
4312               reg_name[op],
4313               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4314               ? "Overwriting end of array!\n" : "OK",
4315               RExC_emit - RExC_emit_start,
4316               RExC_parse - RExC_start,
4317               RExC_offsets[0])); 
4318         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4319     }
4320             
4321     RExC_emit = ptr;
4322
4323     return(ret);
4324 }
4325
4326 /*
4327 - reganode - emit a node with an argument
4328 */
4329 STATIC regnode *                        /* Location. */
4330 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4331 {
4332     register regnode *ret;
4333     register regnode *ptr;
4334
4335     ret = RExC_emit;
4336     if (SIZE_ONLY) {
4337         SIZE_ALIGN(RExC_size);
4338         RExC_size += 2;
4339         return(ret);
4340     }
4341
4342     NODE_ALIGN_FILL(ret);
4343     ptr = ret;
4344     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4345     if (RExC_offsets) {         /* MJD */
4346         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4347               "reganode",
4348               __LINE__,
4349               reg_name[op],
4350               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4351               "Overwriting end of array!\n" : "OK",
4352               RExC_emit - RExC_emit_start,
4353               RExC_parse - RExC_start,
4354               RExC_offsets[0])); 
4355         Set_Cur_Node_Offset;
4356     }
4357             
4358     RExC_emit = ptr;
4359
4360     return(ret);
4361 }
4362
4363 /*
4364 - reguni - emit (if appropriate) a Unicode character
4365 */
4366 STATIC void
4367 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4368 {
4369     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4370 }
4371
4372 /*
4373 - reginsert - insert an operator in front of already-emitted operand
4374 *
4375 * Means relocating the operand.
4376 */
4377 STATIC void
4378 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4379 {
4380     register regnode *src;
4381     register regnode *dst;
4382     register regnode *place;
4383     register int offset = regarglen[(U8)op];
4384
4385 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4386
4387     if (SIZE_ONLY) {
4388         RExC_size += NODE_STEP_REGNODE + offset;
4389         return;
4390     }
4391
4392     src = RExC_emit;
4393     RExC_emit += NODE_STEP_REGNODE + offset;
4394     dst = RExC_emit;
4395     while (src > opnd) {
4396         StructCopy(--src, --dst, regnode);
4397         if (RExC_offsets) {     /* MJD 20010112 */
4398             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4399                   "reg_insert",
4400                   __LINE__,
4401                   reg_name[op],
4402                   dst - RExC_emit_start > RExC_offsets[0] 
4403                   ? "Overwriting end of array!\n" : "OK",
4404                   src - RExC_emit_start,
4405                   dst - RExC_emit_start,
4406                   RExC_offsets[0])); 
4407             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4408             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4409         }
4410     }
4411     
4412
4413     place = opnd;               /* Op node, where operand used to be. */
4414     if (RExC_offsets) {         /* MJD */
4415         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4416               "reginsert",
4417               __LINE__,
4418               reg_name[op],
4419               place - RExC_emit_start > RExC_offsets[0] 
4420               ? "Overwriting end of array!\n" : "OK",
4421               place - RExC_emit_start,
4422               RExC_parse - RExC_start,
4423               RExC_offsets[0])); 
4424         Set_Node_Offset(place, RExC_parse);
4425     }
4426     src = NEXTOPER(place);
4427     FILL_ADVANCE_NODE(place, op);
4428     Zero(src, offset, regnode);
4429 }
4430
4431 /*
4432 - regtail - set the next-pointer at the end of a node chain of p to val.
4433 */
4434 STATIC void
4435 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4436 {
4437     register regnode *scan;
4438     register regnode *temp;
4439
4440     if (SIZE_ONLY)
4441         return;
4442
4443     /* Find last node. */
4444     scan = p;
4445     for (;;) {
4446         temp = regnext(scan);
4447         if (temp == NULL)
4448             break;
4449         scan = temp;
4450     }
4451
4452     if (reg_off_by_arg[OP(scan)]) {
4453         ARG_SET(scan, val - scan);
4454     }
4455     else {
4456         NEXT_OFF(scan) = val - scan;
4457     }
4458 }
4459
4460 /*
4461 - regoptail - regtail on operand of first argument; nop if operandless
4462 */
4463 STATIC void
4464 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4465 {
4466     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4467     if (p == NULL || SIZE_ONLY)
4468         return;
4469     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4470         regtail(pRExC_state, NEXTOPER(p), val);
4471     }
4472     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4473         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4474     }
4475     else
4476         return;
4477 }
4478
4479 /*
4480  - regcurly - a little FSA that accepts {\d+,?\d*}
4481  */
4482 STATIC I32
4483 S_regcurly(pTHX_ register char *s)
4484 {
4485     if (*s++ != '{')
4486         return FALSE;
4487     if (!isDIGIT(*s))
4488         return FALSE;
4489     while (isDIGIT(*s))
4490         s++;
4491     if (*s == ',')
4492         s++;
4493     while (isDIGIT(*s))
4494         s++;
4495     if (*s != '}')
4496         return FALSE;
4497     return TRUE;
4498 }
4499
4500
4501 #ifdef DEBUGGING
4502
4503 STATIC regnode *
4504 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4505 {
4506     register U8 op = EXACT;     /* Arbitrary non-END op. */
4507     register regnode *next;
4508
4509     while (op != END && (!last || node < last)) {
4510         /* While that wasn't END last time... */
4511
4512         NODE_ALIGN(node);
4513         op = OP(node);
4514         if (op == CLOSE)
4515             l--;        
4516         next = regnext(node);
4517         /* Where, what. */
4518         if (OP(node) == OPTIMIZED)
4519             goto after_print;
4520         regprop(sv, node);
4521         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4522                       (int)(2*l + 1), "", SvPVX(sv));
4523         if (next == NULL)               /* Next ptr. */
4524             PerlIO_printf(Perl_debug_log, "(0)");
4525         else
4526             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4527         (void)PerlIO_putc(Perl_debug_log, '\n');
4528       after_print:
4529         if (PL_regkind[(U8)op] == BRANCHJ) {
4530             register regnode *nnode = (OP(next) == LONGJMP
4531                                        ? regnext(next)
4532                                        : next);
4533             if (last && nnode > last)
4534                 nnode = last;
4535             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4536         }
4537         else if (PL_regkind[(U8)op] == BRANCH) {
4538             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4539         }
4540         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4541             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4542                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4543         }
4544         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4545             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4546                              next, sv, l + 1);
4547         }
4548         else if ( op == PLUS || op == STAR) {
4549             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4550         }
4551         else if (op == ANYOF) {
4552             /* arglen 1 + class block */
4553             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4554                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4555             node = NEXTOPER(node);
4556         }
4557         else if (PL_regkind[(U8)op] == EXACT) {
4558             /* Literal string, where present. */
4559             node += NODE_SZ_STR(node) - 1;
4560             node = NEXTOPER(node);
4561         }
4562         else {
4563             node = NEXTOPER(node);
4564             node += regarglen[(U8)op];
4565         }
4566         if (op == CURLYX || op == OPEN)
4567             l++;
4568         else if (op == WHILEM)
4569             l--;
4570     }
4571     return node;
4572 }
4573
4574 #endif  /* DEBUGGING */
4575
4576 /*
4577  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4578  */
4579 void
4580 Perl_regdump(pTHX_ regexp *r)
4581 {
4582 #ifdef DEBUGGING
4583     SV *sv = sv_newmortal();
4584
4585     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4586
4587     /* Header fields of interest. */
4588     if (r->anchored_substr)
4589         PerlIO_printf(Perl_debug_log,
4590                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4591                       PL_colors[0],
4592                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4593                       SvPVX(r->anchored_substr),
4594                       PL_colors[1],
4595                       SvTAIL(r->anchored_substr) ? "$" : "",
4596                       (IV)r->anchored_offset);
4597     else if (r->anchored_utf8)
4598         PerlIO_printf(Perl_debug_log,
4599                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4600                       PL_colors[0],
4601                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4602                       SvPVX(r->anchored_utf8),
4603                       PL_colors[1],
4604                       SvTAIL(r->anchored_utf8) ? "$" : "",
4605                       (IV)r->anchored_offset);
4606     if (r->float_substr)
4607         PerlIO_printf(Perl_debug_log,
4608                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4609                       PL_colors[0],
4610                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4611                       SvPVX(r->float_substr),
4612                       PL_colors[1],
4613                       SvTAIL(r->float_substr) ? "$" : "",
4614                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4615     else if (r->float_utf8)
4616         PerlIO_printf(Perl_debug_log,
4617                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4618                       PL_colors[0],
4619                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4620                       SvPVX(r->float_utf8),
4621                       PL_colors[1],
4622                       SvTAIL(r->float_utf8) ? "$" : "",
4623                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4624     if (r->check_substr || r->check_utf8)
4625         PerlIO_printf(Perl_debug_log,
4626                       r->check_substr == r->float_substr
4627                       && r->check_utf8 == r->float_utf8
4628                       ? "(checking floating" : "(checking anchored");
4629     if (r->reganch & ROPT_NOSCAN)
4630         PerlIO_printf(Perl_debug_log, " noscan");
4631     if (r->reganch & ROPT_CHECK_ALL)
4632         PerlIO_printf(Perl_debug_log, " isall");
4633     if (r->check_substr || r->check_utf8)
4634         PerlIO_printf(Perl_debug_log, ") ");
4635
4636     if (r->regstclass) {
4637         regprop(sv, r->regstclass);
4638         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4639     }
4640     if (r->reganch & ROPT_ANCH) {
4641         PerlIO_printf(Perl_debug_log, "anchored");
4642         if (r->reganch & ROPT_ANCH_BOL)
4643             PerlIO_printf(Perl_debug_log, "(BOL)");
4644         if (r->reganch & ROPT_ANCH_MBOL)
4645             PerlIO_printf(Perl_debug_log, "(MBOL)");
4646         if (r->reganch & ROPT_ANCH_SBOL)
4647             PerlIO_printf(Perl_debug_log, "(SBOL)");
4648         if (r->reganch & ROPT_ANCH_GPOS)
4649             PerlIO_printf(Perl_debug_log, "(GPOS)");
4650         PerlIO_putc(Perl_debug_log, ' ');
4651     }
4652     if (r->reganch & ROPT_GPOS_SEEN)
4653         PerlIO_printf(Perl_debug_log, "GPOS ");
4654     if (r->reganch & ROPT_SKIP)
4655         PerlIO_printf(Perl_debug_log, "plus ");
4656     if (r->reganch & ROPT_IMPLICIT)
4657         PerlIO_printf(Perl_debug_log, "implicit ");
4658     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4659     if (r->reganch & ROPT_EVAL_SEEN)
4660         PerlIO_printf(Perl_debug_log, "with eval ");
4661     PerlIO_printf(Perl_debug_log, "\n");
4662     if (r->offsets) {
4663       U32 i;
4664       U32 len = r->offsets[0];
4665       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4666       for (i = 1; i <= len; i++)
4667         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4668                       (UV)r->offsets[i*2-1], 
4669                       (UV)r->offsets[i*2]);
4670       PerlIO_printf(Perl_debug_log, "\n");
4671     }
4672 #endif  /* DEBUGGING */
4673 }
4674
4675 #ifdef DEBUGGING
4676
4677 STATIC void
4678 S_put_byte(pTHX_ SV *sv, int c)
4679 {
4680     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4681         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4682     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4683         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4684     else
4685         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4686 }
4687
4688 #endif  /* DEBUGGING */
4689
4690 /*
4691 - regprop - printable representation of opcode
4692 */
4693 void
4694 Perl_regprop(pTHX_ SV *sv, regnode *o)
4695 {
4696 #ifdef DEBUGGING
4697     register int k;
4698
4699     sv_setpvn(sv, "", 0);
4700     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4701         /* It would be nice to FAIL() here, but this may be called from
4702            regexec.c, and it would be hard to supply pRExC_state. */
4703         Perl_croak(aTHX_ "Corrupted regexp opcode");
4704     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4705
4706     k = PL_regkind[(U8)OP(o)];
4707
4708     if (k == EXACT) {
4709         SV *dsv = sv_2mortal(newSVpvn("", 0));
4710         /* Using is_utf8_string() is a crude hack but it may
4711          * be the best for now since we have no flag "this EXACTish
4712          * node was UTF-8" --jhi */
4713         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4714         char *s    = do_utf8 ?
4715           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4716                          UNI_DISPLAY_REGEX) :
4717           STRING(o);
4718         int len = do_utf8 ?
4719           strlen(s) :
4720           STR_LEN(o);
4721         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4722                        PL_colors[0],
4723                        len, s,
4724                        PL_colors[1]);
4725     }
4726     else if (k == CURLY) {
4727         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4728             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4729         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4730     }
4731     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4732         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4733     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4734         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4735     else if (k == LOGICAL)
4736         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4737     else if (k == ANYOF) {
4738         int i, rangestart = -1;
4739         U8 flags = ANYOF_FLAGS(o);
4740         const char * const anyofs[] = { /* Should be synchronized with
4741                                          * ANYOF_ #xdefines in regcomp.h */
4742             "\\w",
4743             "\\W",
4744             "\\s",
4745             "\\S",
4746             "\\d",
4747             "\\D",
4748             "[:alnum:]",
4749             "[:^alnum:]",
4750             "[:alpha:]",
4751             "[:^alpha:]",
4752             "[:ascii:]",
4753             "[:^ascii:]",
4754             "[:ctrl:]",
4755             "[:^ctrl:]",
4756             "[:graph:]",
4757             "[:^graph:]",
4758             "[:lower:]",
4759             "[:^lower:]",
4760             "[:print:]",
4761             "[:^print:]",
4762             "[:punct:]",
4763             "[:^punct:]",
4764             "[:upper:]",
4765             "[:^upper:]",
4766             "[:xdigit:]",
4767             "[:^xdigit:]",
4768             "[:space:]",
4769             "[:^space:]",
4770             "[:blank:]",
4771             "[:^blank:]"
4772         };
4773
4774         if (flags & ANYOF_LOCALE)
4775             sv_catpv(sv, "{loc}");
4776         if (flags & ANYOF_FOLD)
4777             sv_catpv(sv, "{i}");
4778         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4779         if (flags & ANYOF_INVERT)
4780             sv_catpv(sv, "^");
4781         for (i = 0; i <= 256; i++) {
4782             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4783                 if (rangestart == -1)
4784                     rangestart = i;
4785             } else if (rangestart != -1) {
4786                 if (i <= rangestart + 3)
4787                     for (; rangestart < i; rangestart++)
4788                         put_byte(sv, rangestart);
4789                 else {
4790                     put_byte(sv, rangestart);
4791                     sv_catpv(sv, "-");
4792                     put_byte(sv, i - 1);
4793                 }
4794                 rangestart = -1;
4795             }
4796         }
4797
4798         if (o->flags & ANYOF_CLASS)
4799             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4800                 if (ANYOF_CLASS_TEST(o,i))
4801                     sv_catpv(sv, anyofs[i]);
4802
4803         if (flags & ANYOF_UNICODE)
4804             sv_catpv(sv, "{unicode}");
4805         else if (flags & ANYOF_UNICODE_ALL)
4806             sv_catpv(sv, "{unicode_all}");
4807
4808         {
4809             SV *lv;
4810             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4811         
4812             if (lv) {
4813                 if (sw) {
4814                     U8 s[UTF8_MAXLEN+1];
4815                 
4816                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4817                         U8 *e = uvchr_to_utf8(s, i);
4818                         
4819                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4820                             if (rangestart == -1)
4821                                 rangestart = i;
4822                         } else if (rangestart != -1) {
4823                             U8 *p;
4824                         
4825                             if (i <= rangestart + 3)
4826                                 for (; rangestart < i; rangestart++) {
4827                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4828                                         put_byte(sv, *p);
4829                                 }
4830                             else {
4831                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4832                                     put_byte(sv, *p);
4833                                 sv_catpv(sv, "-");
4834                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4835                                         put_byte(sv, *p);
4836                                 }
4837                                 rangestart = -1;
4838                             }
4839                         }
4840                         
4841                     sv_catpv(sv, "..."); /* et cetera */
4842                 }
4843
4844                 {
4845                     char *s = savepv(SvPVX(lv));
4846                     char *origs = s;
4847                 
4848                     while(*s && *s != '\n') s++;
4849                 
4850                     if (*s == '\n') {
4851                         char *t = ++s;
4852                         
4853                         while (*s) {
4854                             if (*s == '\n')
4855                                 *s = ' ';
4856                             s++;
4857                         }
4858                         if (s[-1] == ' ')
4859                             s[-1] = 0;
4860                         
4861                         sv_catpv(sv, t);
4862                     }
4863                 
4864                     Safefree(origs);
4865                 }
4866             }
4867         }
4868
4869         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4870     }
4871     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4872         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4873 #endif  /* DEBUGGING */
4874 }
4875
4876 SV *
4877 Perl_re_intuit_string(pTHX_ regexp *prog)
4878 {                               /* Assume that RE_INTUIT is set */
4879     DEBUG_r(
4880         {   STRLEN n_a;
4881             char *s = SvPV(prog->check_substr
4882                       ? prog->check_substr : prog->check_utf8, n_a);
4883
4884             if (!PL_colorset) reginitcolors();
4885             PerlIO_printf(Perl_debug_log,
4886                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4887                       PL_colors[4],
4888                       prog->check_substr ? "" : "utf8 ",
4889                       PL_colors[5],PL_colors[0],
4890                       s,
4891                       PL_colors[1],
4892                       (strlen(s) > 60 ? "..." : ""));
4893         } );
4894
4895     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4896 }
4897
4898 void
4899 Perl_pregfree(pTHX_ struct regexp *r)
4900 {
4901 #ifdef DEBUGGING
4902     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4903 #endif
4904
4905     if (!r || (--r->refcnt > 0))
4906         return;
4907     DEBUG_r({
4908          int len;
4909          char *s;
4910
4911          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4912                 r->prelen, 60, UNI_DISPLAY_REGEX)
4913             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4914          len = SvCUR(dsv);
4915          if (!PL_colorset)
4916               reginitcolors();
4917          PerlIO_printf(Perl_debug_log,
4918                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4919                        PL_colors[4],PL_colors[5],PL_colors[0],
4920                        len, len, s,
4921                        PL_colors[1],
4922                        len > 60 ? "..." : "");
4923     });
4924
4925     if (r->precomp)
4926         Safefree(r->precomp);
4927     if (r->offsets)             /* 20010421 MJD */
4928         Safefree(r->offsets);
4929     RX_MATCH_COPY_FREE(r);
4930 #ifdef PERL_COPY_ON_WRITE
4931     if (r->saved_copy)
4932         SvREFCNT_dec(r->saved_copy);
4933 #endif
4934     if (r->substrs) {
4935         if (r->anchored_substr)
4936             SvREFCNT_dec(r->anchored_substr);
4937         if (r->anchored_utf8)
4938             SvREFCNT_dec(r->anchored_utf8);
4939         if (r->float_substr)
4940             SvREFCNT_dec(r->float_substr);
4941         if (r->float_utf8)
4942             SvREFCNT_dec(r->float_utf8);
4943         Safefree(r->substrs);
4944     }
4945     if (r->data) {
4946         int n = r->data->count;
4947         PAD* new_comppad = NULL;
4948         PAD* old_comppad;
4949
4950         while (--n >= 0) {
4951           /* If you add a ->what type here, update the comment in regcomp.h */
4952             switch (r->data->what[n]) {
4953             case 's':
4954                 SvREFCNT_dec((SV*)r->data->data[n]);
4955                 break;
4956             case 'f':
4957                 Safefree(r->data->data[n]);
4958                 break;
4959             case 'p':
4960                 new_comppad = (AV*)r->data->data[n];
4961                 break;
4962             case 'o':
4963                 if (new_comppad == NULL)
4964                     Perl_croak(aTHX_ "panic: pregfree comppad");
4965                 PAD_SAVE_LOCAL(old_comppad,
4966                     /* Watch out for global destruction's random ordering. */
4967                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4968                                 new_comppad : Null(PAD *)
4969                 );
4970                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4971                     op_free((OP_4tree*)r->data->data[n]);
4972                 }
4973
4974                 PAD_RESTORE_LOCAL(old_comppad);
4975                 SvREFCNT_dec((SV*)new_comppad);
4976                 new_comppad = NULL;
4977                 break;
4978             case 'n':
4979                 break;
4980             default:
4981                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4982             }
4983         }
4984         Safefree(r->data->what);
4985         Safefree(r->data);
4986     }
4987     Safefree(r->startp);
4988     Safefree(r->endp);
4989     Safefree(r);
4990 }
4991
4992 /*
4993  - regnext - dig the "next" pointer out of a node
4994  *
4995  * [Note, when REGALIGN is defined there are two places in regmatch()
4996  * that bypass this code for speed.]
4997  */
4998 regnode *
4999 Perl_regnext(pTHX_ register regnode *p)
5000 {
5001     register I32 offset;
5002
5003     if (p == &PL_regdummy)
5004         return(NULL);
5005
5006     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5007     if (offset == 0)
5008         return(NULL);
5009
5010     return(p+offset);
5011 }
5012
5013 STATIC void     
5014 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5015 {
5016     va_list args;
5017     STRLEN l1 = strlen(pat1);
5018     STRLEN l2 = strlen(pat2);
5019     char buf[512];
5020     SV *msv;
5021     char *message;
5022
5023     if (l1 > 510)
5024         l1 = 510;
5025     if (l1 + l2 > 510)
5026         l2 = 510 - l1;
5027     Copy(pat1, buf, l1 , char);
5028     Copy(pat2, buf + l1, l2 , char);
5029     buf[l1 + l2] = '\n';
5030     buf[l1 + l2 + 1] = '\0';
5031 #ifdef I_STDARG
5032     /* ANSI variant takes additional second argument */
5033     va_start(args, pat2);
5034 #else
5035     va_start(args);
5036 #endif
5037     msv = vmess(buf, &args);
5038     va_end(args);
5039     message = SvPV(msv,l1);
5040     if (l1 > 512)
5041         l1 = 512;
5042     Copy(message, buf, l1 , char);
5043     buf[l1] = '\0';                     /* Overwrite \n */
5044     Perl_croak(aTHX_ "%s", buf);
5045 }
5046
5047 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5048
5049 void
5050 Perl_save_re_context(pTHX)
5051 {
5052     SAVEI32(PL_reg_flags);              /* from regexec.c */
5053     SAVEPPTR(PL_bostr);
5054     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5055     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5056     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5057     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5058     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5059     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5060     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5061     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5062     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5063     PL_reg_start_tmp = 0;
5064     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5065     PL_reg_start_tmpl = 0;
5066     SAVEVPTR(PL_regdata);
5067     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5068     SAVEI32(PL_regnarrate);             /* from regexec.c */
5069     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5070     SAVEINT(PL_regindent);              /* from regexec.c */
5071     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5072     SAVEVPTR(PL_curcop);
5073     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5074     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5075     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5076     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5077     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5078     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5079     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5080     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5081     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5082     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5083     PL_reg_oldsaved = Nullch;
5084     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5085     PL_reg_oldsavedlen = 0;
5086 #ifdef PERL_COPY_ON_WRITE
5087     SAVESPTR(PL_nrs);
5088     PL_nrs = Nullsv;
5089 #endif
5090     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5091     PL_reg_maxiter = 0;
5092     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5093     PL_reg_leftiter = 0;
5094     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5095     PL_reg_poscache = Nullch;
5096     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5097     PL_reg_poscache_size = 0;
5098     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5099     SAVEI32(PL_regnpar);                /* () count. */
5100     SAVEI32(PL_regsize);                /* from regexec.c */
5101
5102     {
5103         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5104         U32 i;
5105         GV *mgv;
5106         REGEXP *rx;
5107         char digits[16];
5108
5109         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5110             for (i = 1; i <= rx->nparens; i++) {
5111                 sprintf(digits, "%lu", (long)i);
5112                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5113                     save_scalar(mgv);
5114             }
5115         }
5116     }
5117
5118 #ifdef DEBUGGING
5119     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5120 #endif
5121 }
5122
5123 static void
5124 clear_re(pTHX_ void *r)
5125 {
5126     ReREFCNT_dec((regexp *)r);
5127 }
5128