Make also the -CAL conditional on locale.
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U32         flags;                  /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags      (pRExC_state->flags)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg) STMT_START {                                          \
260     char *ellipses = "";                                                \
261     IV len = RExC_end - RExC_precomp;                                   \
262                                                                         \
263     if (!SIZE_ONLY)                                                     \
264         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
265     if (len > RegexLengthToShowInErrorMessages) {                       \
266         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
267         len = RegexLengthToShowInErrorMessages - 10;                    \
268         ellipses = "...";                                               \
269     }                                                                   \
270     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
271             msg, (int)len, RExC_precomp, ellipses);                     \
272 } STMT_END
273
274 /*
275  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276  * args. Show regex, up to a maximum length. If it's too long, chop and add
277  * "...".
278  */
279 #define FAIL2(pat,msg) STMT_START {                                     \
280     char *ellipses = "";                                                \
281     IV len = RExC_end - RExC_precomp;                                   \
282                                                                         \
283     if (!SIZE_ONLY)                                                     \
284         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
285     if (len > RegexLengthToShowInErrorMessages) {                       \
286         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
287         len = RegexLengthToShowInErrorMessages - 10;                    \
288         ellipses = "...";                                               \
289     }                                                                   \
290     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
291             msg, (int)len, RExC_precomp, ellipses);                     \
292 } STMT_END
293
294
295 /*
296  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
297  */
298 #define Simple_vFAIL(m) STMT_START {                                    \
299     IV offset = RExC_parse - RExC_precomp;                              \
300     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
301             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
302 } STMT_END
303
304 /*
305  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
306  */
307 #define vFAIL(m) STMT_START {                           \
308     if (!SIZE_ONLY)                                     \
309         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
310     Simple_vFAIL(m);                                    \
311 } STMT_END
312
313 /*
314  * Like Simple_vFAIL(), but accepts two arguments.
315  */
316 #define Simple_vFAIL2(m,a1) STMT_START {                        \
317     IV offset = RExC_parse - RExC_precomp;                      \
318     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
319             (int)offset, RExC_precomp, RExC_precomp + offset);  \
320 } STMT_END
321
322 /*
323  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
324  */
325 #define vFAIL2(m,a1) STMT_START {                       \
326     if (!SIZE_ONLY)                                     \
327         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
328     Simple_vFAIL2(m, a1);                               \
329 } STMT_END
330
331
332 /*
333  * Like Simple_vFAIL(), but accepts three arguments.
334  */
335 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
336     IV offset = RExC_parse - RExC_precomp;                      \
337     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
338             (int)offset, RExC_precomp, RExC_precomp + offset);  \
339 } STMT_END
340
341 /*
342  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
343  */
344 #define vFAIL3(m,a1,a2) STMT_START {                    \
345     if (!SIZE_ONLY)                                     \
346         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
347     Simple_vFAIL3(m, a1, a2);                           \
348 } STMT_END
349
350 /*
351  * Like Simple_vFAIL(), but accepts four arguments.
352  */
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
354     IV offset = RExC_parse - RExC_precomp;                      \
355     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
356             (int)offset, RExC_precomp, RExC_precomp + offset);  \
357 } STMT_END
358
359 /*
360  * Like Simple_vFAIL(), but accepts five arguments.
361  */
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
363     IV offset = RExC_parse - RExC_precomp;                      \
364     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
365             (int)offset, RExC_precomp, RExC_precomp + offset);  \
366 } STMT_END
367
368
369 #define vWARN(loc,m) STMT_START {                                       \
370     IV offset = loc - RExC_precomp;                                     \
371     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
372             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
373 } STMT_END
374
375 #define vWARNdep(loc,m) STMT_START {                                    \
376     IV offset = loc - RExC_precomp;                                     \
377     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
378             "%s" REPORT_LOCATION,                                       \
379             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
380 } STMT_END
381
382
383 #define vWARN2(loc, m, a1) STMT_START {                                 \
384     IV offset = loc - RExC_precomp;                                     \
385     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
386             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
387 } STMT_END
388
389 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
390     IV offset = loc - RExC_precomp;                                     \
391     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
392             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
393 } STMT_END
394
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
396     IV offset = loc - RExC_precomp;                                     \
397     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
398             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399 } STMT_END
400
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
402     IV offset = loc - RExC_precomp;                                     \
403     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
404             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405 } STMT_END
406
407
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START {                  \
410     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411 } STMT_END
412
413 /* Macros for recording node offsets.   20001227 mjd@plover.com 
414  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
415  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
416  * Element 0 holds the number n.
417  */
418
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
424     if (! SIZE_ONLY) {                                                  \
425         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
426                 __LINE__, (node), (byte)));                             \
427         if((node) < 0) {                                                \
428             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429         } else {                                                        \
430             RExC_offsets[2*(node)-1] = (byte);                          \
431         }                                                               \
432     }                                                                   \
433 } STMT_END
434
435 #define Set_Node_Offset(node,byte) \
436     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
440     if (! SIZE_ONLY) {                                                  \
441         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
442                 __LINE__, (node), (len)));                              \
443         if((node) < 0) {                                                \
444             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445         } else {                                                        \
446             RExC_offsets[2*(node)] = (len);                             \
447         }                                                               \
448     }                                                                   \
449 } STMT_END
450
451 #define Set_Node_Length(node,len) \
452     Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455     Set_Node_Length(node, RExC_parse - parse_start)
456
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
461 static void clear_re(pTHX_ void *r);
462
463 /* Mark that we cannot extend a found fixed substring at this point.
464    Updata the longest found anchored substring and the longest found
465    floating substrings if needed. */
466
467 STATIC void
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
469 {
470     STRLEN l = CHR_SVLEN(data->last_found);
471     STRLEN old_l = CHR_SVLEN(*data->longest);
472
473     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474         sv_setsv(*data->longest, data->last_found);
475         if (*data->longest == data->longest_fixed) {
476             data->offset_fixed = l ? data->last_start_min : data->pos_min;
477             if (data->flags & SF_BEFORE_EOL)
478                 data->flags
479                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480             else
481                 data->flags &= ~SF_FIX_BEFORE_EOL;
482         }
483         else {
484             data->offset_float_min = l ? data->last_start_min : data->pos_min;
485             data->offset_float_max = (l
486                                       ? data->last_start_max
487                                       : data->pos_min + data->pos_delta);
488             if ((U32)data->offset_float_max > (U32)I32_MAX)
489                 data->offset_float_max = I32_MAX;
490             if (data->flags & SF_BEFORE_EOL)
491                 data->flags
492                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493             else
494                 data->flags &= ~SF_FL_BEFORE_EOL;
495         }
496     }
497     SvCUR_set(data->last_found, 0);
498     data->last_end = -1;
499     data->flags &= ~SF_BEFORE_EOL;
500 }
501
502 /* Can match anything (initialization) */
503 STATIC void
504 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
505 {
506     ANYOF_CLASS_ZERO(cl);
507     ANYOF_BITMAP_SETALL(cl);
508     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
509     if (LOC)
510         cl->flags |= ANYOF_LOCALE;
511 }
512
513 /* Can match anything (initialization) */
514 STATIC int
515 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
516 {
517     int value;
518
519     for (value = 0; value <= ANYOF_MAX; value += 2)
520         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
521             return 1;
522     if (!(cl->flags & ANYOF_UNICODE_ALL))
523         return 0;
524     if (!ANYOF_BITMAP_TESTALLSET(cl))
525         return 0;
526     return 1;
527 }
528
529 /* Can match anything (initialization) */
530 STATIC void
531 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 {
533     Zero(cl, 1, struct regnode_charclass_class);
534     cl->type = ANYOF;
535     cl_anything(pRExC_state, cl);
536 }
537
538 STATIC void
539 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 {
541     Zero(cl, 1, struct regnode_charclass_class);
542     cl->type = ANYOF;
543     cl_anything(pRExC_state, cl);
544     if (LOC)
545         cl->flags |= ANYOF_LOCALE;
546 }
547
548 /* 'And' a given class with another one.  Can create false positives */
549 /* We assume that cl is not inverted */
550 STATIC void
551 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
552          struct regnode_charclass_class *and_with)
553 {
554     if (!(and_with->flags & ANYOF_CLASS)
555         && !(cl->flags & ANYOF_CLASS)
556         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
557         && !(and_with->flags & ANYOF_FOLD)
558         && !(cl->flags & ANYOF_FOLD)) {
559         int i;
560
561         if (and_with->flags & ANYOF_INVERT)
562             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
563                 cl->bitmap[i] &= ~and_with->bitmap[i];
564         else
565             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
566                 cl->bitmap[i] &= and_with->bitmap[i];
567     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
568     if (!(and_with->flags & ANYOF_EOS))
569         cl->flags &= ~ANYOF_EOS;
570
571     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
572         cl->flags &= ~ANYOF_UNICODE_ALL;
573         cl->flags |= ANYOF_UNICODE;
574         ARG_SET(cl, ARG(and_with));
575     }
576     if (!(and_with->flags & ANYOF_UNICODE_ALL))
577         cl->flags &= ~ANYOF_UNICODE_ALL;
578     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
579         cl->flags &= ~ANYOF_UNICODE;
580 }
581
582 /* 'OR' a given class with another one.  Can create false positives */
583 /* We assume that cl is not inverted */
584 STATIC void
585 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
586 {
587     if (or_with->flags & ANYOF_INVERT) {
588         /* We do not use
589          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
590          *   <= (B1 | !B2) | (CL1 | !CL2)
591          * which is wasteful if CL2 is small, but we ignore CL2:
592          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
593          * XXXX Can we handle case-fold?  Unclear:
594          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
595          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
596          */
597         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
598              && !(or_with->flags & ANYOF_FOLD)
599              && !(cl->flags & ANYOF_FOLD) ) {
600             int i;
601
602             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
603                 cl->bitmap[i] |= ~or_with->bitmap[i];
604         } /* XXXX: logic is complicated otherwise */
605         else {
606             cl_anything(pRExC_state, cl);
607         }
608     } else {
609         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
610         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
611              && (!(or_with->flags & ANYOF_FOLD)
612                  || (cl->flags & ANYOF_FOLD)) ) {
613             int i;
614
615             /* OR char bitmap and class bitmap separately */
616             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
617                 cl->bitmap[i] |= or_with->bitmap[i];
618             if (or_with->flags & ANYOF_CLASS) {
619                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
620                     cl->classflags[i] |= or_with->classflags[i];
621                 cl->flags |= ANYOF_CLASS;
622             }
623         }
624         else { /* XXXX: logic is complicated, leave it along for a moment. */
625             cl_anything(pRExC_state, cl);
626         }
627     }
628     if (or_with->flags & ANYOF_EOS)
629         cl->flags |= ANYOF_EOS;
630
631     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
632         ARG(cl) != ARG(or_with)) {
633         cl->flags |= ANYOF_UNICODE_ALL;
634         cl->flags &= ~ANYOF_UNICODE;
635     }
636     if (or_with->flags & ANYOF_UNICODE_ALL) {
637         cl->flags |= ANYOF_UNICODE_ALL;
638         cl->flags &= ~ANYOF_UNICODE;
639     }
640 }
641
642 /*
643  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
644  * These need to be revisited when a newer toolchain becomes available.
645  */
646 #if defined(__sparc64__) && defined(__GNUC__)
647 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
648 #       undef  SPARC64_GCC_WORKAROUND
649 #       define SPARC64_GCC_WORKAROUND 1
650 #   endif
651 #endif
652
653 /* REx optimizer.  Converts nodes into quickier variants "in place".
654    Finds fixed substrings.  */
655
656 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
657    to the position after last scanned or to NULL. */
658
659 STATIC I32
660 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
661                         /* scanp: Start here (read-write). */
662                         /* deltap: Write maxlen-minlen here. */
663                         /* last: Stop before this one. */
664 {
665     I32 min = 0, pars = 0, code;
666     regnode *scan = *scanp, *next;
667     I32 delta = 0;
668     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
669     int is_inf_internal = 0;            /* The studied chunk is infinite */
670     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
671     scan_data_t data_fake;
672     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
673
674     while (scan && OP(scan) != END && scan < last) {
675         /* Peephole optimizer: */
676
677         if (PL_regkind[(U8)OP(scan)] == EXACT) {
678             /* Merge several consecutive EXACTish nodes into one. */
679             regnode *n = regnext(scan);
680             U32 stringok = 1;
681 #ifdef DEBUGGING
682             regnode *stop = scan;
683 #endif
684
685             next = scan + NODE_SZ_STR(scan);
686             /* Skip NOTHING, merge EXACT*. */
687             while (n &&
688                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
689                      (stringok && (OP(n) == OP(scan))))
690                    && NEXT_OFF(n)
691                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
692                 if (OP(n) == TAIL || n > next)
693                     stringok = 0;
694                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
695                     NEXT_OFF(scan) += NEXT_OFF(n);
696                     next = n + NODE_STEP_REGNODE;
697 #ifdef DEBUGGING
698                     if (stringok)
699                         stop = n;
700 #endif
701                     n = regnext(n);
702                 }
703                 else if (stringok) {
704                     int oldl = STR_LEN(scan);
705                     regnode *nnext = regnext(n);
706
707                     if (oldl + STR_LEN(n) > U8_MAX)
708                         break;
709                     NEXT_OFF(scan) += NEXT_OFF(n);
710                     STR_LEN(scan) += STR_LEN(n);
711                     next = n + NODE_SZ_STR(n);
712                     /* Now we can overwrite *n : */
713                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
714 #ifdef DEBUGGING
715                     stop = next - 1;
716 #endif
717                     n = nnext;
718                 }
719             }
720
721             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
722 /*
723   Two problematic code points in Unicode casefolding of EXACT nodes:
724
725    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
726    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
727
728    which casefold to
729
730    Unicode                      UTF-8
731
732    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
733    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
734
735    This means that in case-insensitive matching (or "loose matching",
736    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
737    length of the above casefolded versions) can match a target string
738    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
739    This would rather mess up the minimum length computation.
740
741    What we'll do is to look for the tail four bytes, and then peek
742    at the preceding two bytes to see whether we need to decrease
743    the minimum length by four (six minus two).
744
745    Thanks to the design of UTF-8, there cannot be false matches:
746    A sequence of valid UTF-8 bytes cannot be a subsequence of
747    another valid sequence of UTF-8 bytes.
748
749 */
750                  char *s0 = STRING(scan), *s, *t;
751                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
752                  char *t0 = "\xcc\x88\xcc\x81";
753                  char *t1 = t0 + 3;
754                  
755                  for (s = s0 + 2;
756                       s < s2 && (t = ninstr(s, s1, t0, t1));
757                       s = t + 4) {
758                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
759                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
760                            min -= 4;
761                  }
762             }
763
764 #ifdef DEBUGGING
765             /* Allow dumping */
766             n = scan + NODE_SZ_STR(scan);
767             while (n <= stop) {
768                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
769                     OP(n) = OPTIMIZED;
770                     NEXT_OFF(n) = 0;
771                 }
772                 n++;
773             }
774 #endif
775         }
776         /* Follow the next-chain of the current node and optimize
777            away all the NOTHINGs from it.  */
778         if (OP(scan) != CURLYX) {
779             int max = (reg_off_by_arg[OP(scan)]
780                        ? I32_MAX
781                        /* I32 may be smaller than U16 on CRAYs! */
782                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
783             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
784             int noff;
785             regnode *n = scan;
786         
787             /* Skip NOTHING and LONGJMP. */
788             while ((n = regnext(n))
789                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
790                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
791                    && off + noff < max)
792                 off += noff;
793             if (reg_off_by_arg[OP(scan)])
794                 ARG(scan) = off;
795             else
796                 NEXT_OFF(scan) = off;
797         }
798         /* The principal pseudo-switch.  Cannot be a switch, since we
799            look into several different things.  */
800         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
801                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
802             next = regnext(scan);
803             code = OP(scan);
804         
805             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
806                 I32 max1 = 0, min1 = I32_MAX, num = 0;
807                 struct regnode_charclass_class accum;
808                 
809                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
810                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
811                 if (flags & SCF_DO_STCLASS)
812                     cl_init_zero(pRExC_state, &accum);
813                 while (OP(scan) == code) {
814                     I32 deltanext, minnext, f = 0, fake;
815                     struct regnode_charclass_class this_class;
816
817                     num++;
818                     data_fake.flags = 0;
819                     if (data) {         
820                         data_fake.whilem_c = data->whilem_c;
821                         data_fake.last_closep = data->last_closep;
822                     }
823                     else
824                         data_fake.last_closep = &fake;
825                     next = regnext(scan);
826                     scan = NEXTOPER(scan);
827                     if (code != BRANCH)
828                         scan = NEXTOPER(scan);
829                     if (flags & SCF_DO_STCLASS) {
830                         cl_init(pRExC_state, &this_class);
831                         data_fake.start_class = &this_class;
832                         f = SCF_DO_STCLASS_AND;
833                     }           
834                     if (flags & SCF_WHILEM_VISITED_POS)
835                         f |= SCF_WHILEM_VISITED_POS;
836                     /* we suppose the run is continuous, last=next...*/
837                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
838                                           next, &data_fake, f);
839                     if (min1 > minnext)
840                         min1 = minnext;
841                     if (max1 < minnext + deltanext)
842                         max1 = minnext + deltanext;
843                     if (deltanext == I32_MAX)
844                         is_inf = is_inf_internal = 1;
845                     scan = next;
846                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
847                         pars++;
848                     if (data && (data_fake.flags & SF_HAS_EVAL))
849                         data->flags |= SF_HAS_EVAL;
850                     if (data)
851                         data->whilem_c = data_fake.whilem_c;
852                     if (flags & SCF_DO_STCLASS)
853                         cl_or(pRExC_state, &accum, &this_class);
854                     if (code == SUSPEND)
855                         break;
856                 }
857                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
858                     min1 = 0;
859                 if (flags & SCF_DO_SUBSTR) {
860                     data->pos_min += min1;
861                     data->pos_delta += max1 - min1;
862                     if (max1 != min1 || is_inf)
863                         data->longest = &(data->longest_float);
864                 }
865                 min += min1;
866                 delta += max1 - min1;
867                 if (flags & SCF_DO_STCLASS_OR) {
868                     cl_or(pRExC_state, data->start_class, &accum);
869                     if (min1) {
870                         cl_and(data->start_class, &and_with);
871                         flags &= ~SCF_DO_STCLASS;
872                     }
873                 }
874                 else if (flags & SCF_DO_STCLASS_AND) {
875                     if (min1) {
876                         cl_and(data->start_class, &accum);
877                         flags &= ~SCF_DO_STCLASS;
878                     }
879                     else {
880                         /* Switch to OR mode: cache the old value of
881                          * data->start_class */
882                         StructCopy(data->start_class, &and_with,
883                                    struct regnode_charclass_class);
884                         flags &= ~SCF_DO_STCLASS_AND;
885                         StructCopy(&accum, data->start_class,
886                                    struct regnode_charclass_class);
887                         flags |= SCF_DO_STCLASS_OR;
888                         data->start_class->flags |= ANYOF_EOS;
889                     }
890                 }
891             }
892             else if (code == BRANCHJ)   /* single branch is optimized. */
893                 scan = NEXTOPER(NEXTOPER(scan));
894             else                        /* single branch is optimized. */
895                 scan = NEXTOPER(scan);
896             continue;
897         }
898         else if (OP(scan) == EXACT) {
899             I32 l = STR_LEN(scan);
900             UV uc = *((U8*)STRING(scan));
901             if (UTF) {
902                 U8 *s = (U8*)STRING(scan);
903                 l = utf8_length(s, s + l);
904                 uc = utf8_to_uvchr(s, NULL);
905             }
906             min += l;
907             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
908                 /* The code below prefers earlier match for fixed
909                    offset, later match for variable offset.  */
910                 if (data->last_end == -1) { /* Update the start info. */
911                     data->last_start_min = data->pos_min;
912                     data->last_start_max = is_inf
913                         ? I32_MAX : data->pos_min + data->pos_delta;
914                 }
915                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
916                 if (UTF)
917                     SvUTF8_on(data->last_found);
918                 data->last_end = data->pos_min + l;
919                 data->pos_min += l; /* As in the first entry. */
920                 data->flags &= ~SF_BEFORE_EOL;
921             }
922             if (flags & SCF_DO_STCLASS_AND) {
923                 /* Check whether it is compatible with what we know already! */
924                 int compat = 1;
925
926                 if (uc >= 0x100 ||
927                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
928                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
929                     && (!(data->start_class->flags & ANYOF_FOLD)
930                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
931                     )
932                     compat = 0;
933                 ANYOF_CLASS_ZERO(data->start_class);
934                 ANYOF_BITMAP_ZERO(data->start_class);
935                 if (compat)
936                     ANYOF_BITMAP_SET(data->start_class, uc);
937                 data->start_class->flags &= ~ANYOF_EOS;
938                 if (uc < 0x100)
939                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
940             }
941             else if (flags & SCF_DO_STCLASS_OR) {
942                 /* false positive possible if the class is case-folded */
943                 if (uc < 0x100)
944                     ANYOF_BITMAP_SET(data->start_class, uc);
945                 else
946                     data->start_class->flags |= ANYOF_UNICODE_ALL;
947                 data->start_class->flags &= ~ANYOF_EOS;
948                 cl_and(data->start_class, &and_with);
949             }
950             flags &= ~SCF_DO_STCLASS;
951         }
952         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
953             I32 l = STR_LEN(scan);
954             UV uc = *((U8*)STRING(scan));
955
956             /* Search for fixed substrings supports EXACT only. */
957             if (flags & SCF_DO_SUBSTR)
958                 scan_commit(pRExC_state, data);
959             if (UTF) {
960                 U8 *s = (U8 *)STRING(scan);
961                 l = utf8_length(s, s + l);
962                 uc = utf8_to_uvchr(s, NULL);
963             }
964             min += l;
965             if (data && (flags & SCF_DO_SUBSTR))
966                 data->pos_min += l;
967             if (flags & SCF_DO_STCLASS_AND) {
968                 /* Check whether it is compatible with what we know already! */
969                 int compat = 1;
970
971                 if (uc >= 0x100 ||
972                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
973                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
974                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
975                     compat = 0;
976                 ANYOF_CLASS_ZERO(data->start_class);
977                 ANYOF_BITMAP_ZERO(data->start_class);
978                 if (compat) {
979                     ANYOF_BITMAP_SET(data->start_class, uc);
980                     data->start_class->flags &= ~ANYOF_EOS;
981                     data->start_class->flags |= ANYOF_FOLD;
982                     if (OP(scan) == EXACTFL)
983                         data->start_class->flags |= ANYOF_LOCALE;
984                 }
985             }
986             else if (flags & SCF_DO_STCLASS_OR) {
987                 if (data->start_class->flags & ANYOF_FOLD) {
988                     /* false positive possible if the class is case-folded.
989                        Assume that the locale settings are the same... */
990                     if (uc < 0x100)
991                         ANYOF_BITMAP_SET(data->start_class, uc);
992                     data->start_class->flags &= ~ANYOF_EOS;
993                 }
994                 cl_and(data->start_class, &and_with);
995             }
996             flags &= ~SCF_DO_STCLASS;
997         }
998         else if (strchr((char*)PL_varies,OP(scan))) {
999             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1000             I32 f = flags, pos_before = 0;
1001             regnode *oscan = scan;
1002             struct regnode_charclass_class this_class;
1003             struct regnode_charclass_class *oclass = NULL;
1004             I32 next_is_eval = 0;
1005
1006             switch (PL_regkind[(U8)OP(scan)]) {
1007             case WHILEM:                /* End of (?:...)* . */
1008                 scan = NEXTOPER(scan);
1009                 goto finish;
1010             case PLUS:
1011                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1012                     next = NEXTOPER(scan);
1013                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1014                         mincount = 1;
1015                         maxcount = REG_INFTY;
1016                         next = regnext(scan);
1017                         scan = NEXTOPER(scan);
1018                         goto do_curly;
1019                     }
1020                 }
1021                 if (flags & SCF_DO_SUBSTR)
1022                     data->pos_min++;
1023                 min++;
1024                 /* Fall through. */
1025             case STAR:
1026                 if (flags & SCF_DO_STCLASS) {
1027                     mincount = 0;
1028                     maxcount = REG_INFTY;
1029                     next = regnext(scan);
1030                     scan = NEXTOPER(scan);
1031                     goto do_curly;
1032                 }
1033                 is_inf = is_inf_internal = 1;
1034                 scan = regnext(scan);
1035                 if (flags & SCF_DO_SUBSTR) {
1036                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1037                     data->longest = &(data->longest_float);
1038                 }
1039                 goto optimize_curly_tail;
1040             case CURLY:
1041                 mincount = ARG1(scan);
1042                 maxcount = ARG2(scan);
1043                 next = regnext(scan);
1044                 if (OP(scan) == CURLYX) {
1045                     I32 lp = (data ? *(data->last_closep) : 0);
1046
1047                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1048                 }
1049                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1050                 next_is_eval = (OP(scan) == EVAL);
1051               do_curly:
1052                 if (flags & SCF_DO_SUBSTR) {
1053                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1054                     pos_before = data->pos_min;
1055                 }
1056                 if (data) {
1057                     fl = data->flags;
1058                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1059                     if (is_inf)
1060                         data->flags |= SF_IS_INF;
1061                 }
1062                 if (flags & SCF_DO_STCLASS) {
1063                     cl_init(pRExC_state, &this_class);
1064                     oclass = data->start_class;
1065                     data->start_class = &this_class;
1066                     f |= SCF_DO_STCLASS_AND;
1067                     f &= ~SCF_DO_STCLASS_OR;
1068                 }
1069                 /* These are the cases when once a subexpression
1070                    fails at a particular position, it cannot succeed
1071                    even after backtracking at the enclosing scope.
1072                 
1073                    XXXX what if minimal match and we are at the
1074                         initial run of {n,m}? */
1075                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1076                     f &= ~SCF_WHILEM_VISITED_POS;
1077
1078                 /* This will finish on WHILEM, setting scan, or on NULL: */
1079                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1080                                       mincount == 0
1081                                         ? (f & ~SCF_DO_SUBSTR) : f);
1082
1083                 if (flags & SCF_DO_STCLASS)
1084                     data->start_class = oclass;
1085                 if (mincount == 0 || minnext == 0) {
1086                     if (flags & SCF_DO_STCLASS_OR) {
1087                         cl_or(pRExC_state, data->start_class, &this_class);
1088                     }
1089                     else if (flags & SCF_DO_STCLASS_AND) {
1090                         /* Switch to OR mode: cache the old value of
1091                          * data->start_class */
1092                         StructCopy(data->start_class, &and_with,
1093                                    struct regnode_charclass_class);
1094                         flags &= ~SCF_DO_STCLASS_AND;
1095                         StructCopy(&this_class, data->start_class,
1096                                    struct regnode_charclass_class);
1097                         flags |= SCF_DO_STCLASS_OR;
1098                         data->start_class->flags |= ANYOF_EOS;
1099                     }
1100                 } else {                /* Non-zero len */
1101                     if (flags & SCF_DO_STCLASS_OR) {
1102                         cl_or(pRExC_state, data->start_class, &this_class);
1103                         cl_and(data->start_class, &and_with);
1104                     }
1105                     else if (flags & SCF_DO_STCLASS_AND)
1106                         cl_and(data->start_class, &this_class);
1107                     flags &= ~SCF_DO_STCLASS;
1108                 }
1109                 if (!scan)              /* It was not CURLYX, but CURLY. */
1110                     scan = next;
1111                 if (ckWARN(WARN_REGEXP)
1112                        /* ? quantifier ok, except for (?{ ... }) */
1113                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1114                     && (minnext == 0) && (deltanext == 0)
1115                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1116                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1117                 {
1118                     vWARN(RExC_parse,
1119                           "Quantifier unexpected on zero-length expression");
1120                 }
1121
1122                 min += minnext * mincount;
1123                 is_inf_internal |= ((maxcount == REG_INFTY
1124                                      && (minnext + deltanext) > 0)
1125                                     || deltanext == I32_MAX);
1126                 is_inf |= is_inf_internal;
1127                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1128
1129                 /* Try powerful optimization CURLYX => CURLYN. */
1130                 if (  OP(oscan) == CURLYX && data
1131                       && data->flags & SF_IN_PAR
1132                       && !(data->flags & SF_HAS_EVAL)
1133                       && !deltanext && minnext == 1 ) {
1134                     /* Try to optimize to CURLYN.  */
1135                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1136                     regnode *nxt1 = nxt;
1137 #ifdef DEBUGGING
1138                     regnode *nxt2;
1139 #endif
1140
1141                     /* Skip open. */
1142                     nxt = regnext(nxt);
1143                     if (!strchr((char*)PL_simple,OP(nxt))
1144                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1145                              && STR_LEN(nxt) == 1))
1146                         goto nogo;
1147 #ifdef DEBUGGING
1148                     nxt2 = nxt;
1149 #endif
1150                     nxt = regnext(nxt);
1151                     if (OP(nxt) != CLOSE)
1152                         goto nogo;
1153                     /* Now we know that nxt2 is the only contents: */
1154                     oscan->flags = (U8)ARG(nxt);
1155                     OP(oscan) = CURLYN;
1156                     OP(nxt1) = NOTHING; /* was OPEN. */
1157 #ifdef DEBUGGING
1158                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1159                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1160                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1161                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1162                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1163                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1164 #endif
1165                 }
1166               nogo:
1167
1168                 /* Try optimization CURLYX => CURLYM. */
1169                 if (  OP(oscan) == CURLYX && data
1170                       && !(data->flags & SF_HAS_PAR)
1171                       && !(data->flags & SF_HAS_EVAL)
1172                       && !deltanext  ) {
1173                     /* XXXX How to optimize if data == 0? */
1174                     /* Optimize to a simpler form.  */
1175                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1176                     regnode *nxt2;
1177
1178                     OP(oscan) = CURLYM;
1179                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1180                             && (OP(nxt2) != WHILEM))
1181                         nxt = nxt2;
1182                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1183                     /* Need to optimize away parenths. */
1184                     if (data->flags & SF_IN_PAR) {
1185                         /* Set the parenth number.  */
1186                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1187
1188                         if (OP(nxt) != CLOSE)
1189                             FAIL("Panic opt close");
1190                         oscan->flags = (U8)ARG(nxt);
1191                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1192                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1193 #ifdef DEBUGGING
1194                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1195                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1196                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1197                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1198 #endif
1199 #if 0
1200                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1201                             regnode *nnxt = regnext(nxt1);
1202                         
1203                             if (nnxt == nxt) {
1204                                 if (reg_off_by_arg[OP(nxt1)])
1205                                     ARG_SET(nxt1, nxt2 - nxt1);
1206                                 else if (nxt2 - nxt1 < U16_MAX)
1207                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1208                                 else
1209                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1210                             }
1211                             nxt1 = nnxt;
1212                         }
1213 #endif
1214                         /* Optimize again: */
1215                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1216                                     NULL, 0);
1217                     }
1218                     else
1219                         oscan->flags = 0;
1220                 }
1221                 else if ((OP(oscan) == CURLYX)
1222                          && (flags & SCF_WHILEM_VISITED_POS)
1223                          /* See the comment on a similar expression above.
1224                             However, this time it not a subexpression
1225                             we care about, but the expression itself. */
1226                          && (maxcount == REG_INFTY)
1227                          && data && ++data->whilem_c < 16) {
1228                     /* This stays as CURLYX, we can put the count/of pair. */
1229                     /* Find WHILEM (as in regexec.c) */
1230                     regnode *nxt = oscan + NEXT_OFF(oscan);
1231
1232                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1233                         nxt += ARG(nxt);
1234                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1235                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1236                 }
1237                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1238                     pars++;
1239                 if (flags & SCF_DO_SUBSTR) {
1240                     SV *last_str = Nullsv;
1241                     int counted = mincount != 0;
1242
1243                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1244 #if defined(SPARC64_GCC_WORKAROUND)
1245                         I32 b = 0;
1246                         STRLEN l = 0;
1247                         char *s = NULL;
1248                         I32 old = 0;
1249
1250                         if (pos_before >= data->last_start_min)
1251                             b = pos_before;
1252                         else
1253                             b = data->last_start_min;
1254
1255                         l = 0;
1256                         s = SvPV(data->last_found, l);
1257                         old = b - data->last_start_min;
1258
1259 #else
1260                         I32 b = pos_before >= data->last_start_min
1261                             ? pos_before : data->last_start_min;
1262                         STRLEN l;
1263                         char *s = SvPV(data->last_found, l);
1264                         I32 old = b - data->last_start_min;
1265 #endif
1266
1267                         if (UTF)
1268                             old = utf8_hop((U8*)s, old) - (U8*)s;
1269                         
1270                         l -= old;
1271                         /* Get the added string: */
1272                         last_str = newSVpvn(s  + old, l);
1273                         if (UTF)
1274                             SvUTF8_on(last_str);
1275                         if (deltanext == 0 && pos_before == b) {
1276                             /* What was added is a constant string */
1277                             if (mincount > 1) {
1278                                 SvGROW(last_str, (mincount * l) + 1);
1279                                 repeatcpy(SvPVX(last_str) + l,
1280                                           SvPVX(last_str), l, mincount - 1);
1281                                 SvCUR(last_str) *= mincount;
1282                                 /* Add additional parts. */
1283                                 SvCUR_set(data->last_found,
1284                                           SvCUR(data->last_found) - l);
1285                                 sv_catsv(data->last_found, last_str);
1286                                 data->last_end += l * (mincount - 1);
1287                             }
1288                         } else {
1289                             /* start offset must point into the last copy */
1290                             data->last_start_min += minnext * (mincount - 1);
1291                             data->last_start_max += is_inf ? I32_MAX
1292                                 : (maxcount - 1) * (minnext + data->pos_delta);
1293                         }
1294                     }
1295                     /* It is counted once already... */
1296                     data->pos_min += minnext * (mincount - counted);
1297                     data->pos_delta += - counted * deltanext +
1298                         (minnext + deltanext) * maxcount - minnext * mincount;
1299                     if (mincount != maxcount) {
1300                          /* Cannot extend fixed substrings found inside
1301                             the group.  */
1302                         scan_commit(pRExC_state,data);
1303                         if (mincount && last_str) {
1304                             sv_setsv(data->last_found, last_str);
1305                             data->last_end = data->pos_min;
1306                             data->last_start_min =
1307                                 data->pos_min - CHR_SVLEN(last_str);
1308                             data->last_start_max = is_inf
1309                                 ? I32_MAX
1310                                 : data->pos_min + data->pos_delta
1311                                 - CHR_SVLEN(last_str);
1312                         }
1313                         data->longest = &(data->longest_float);
1314                     }
1315                     SvREFCNT_dec(last_str);
1316                 }
1317                 if (data && (fl & SF_HAS_EVAL))
1318                     data->flags |= SF_HAS_EVAL;
1319               optimize_curly_tail:
1320                 if (OP(oscan) != CURLYX) {
1321                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1322                            && NEXT_OFF(next))
1323                         NEXT_OFF(oscan) += NEXT_OFF(next);
1324                 }
1325                 continue;
1326             default:                    /* REF and CLUMP only? */
1327                 if (flags & SCF_DO_SUBSTR) {
1328                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1329                     data->longest = &(data->longest_float);
1330                 }
1331                 is_inf = is_inf_internal = 1;
1332                 if (flags & SCF_DO_STCLASS_OR)
1333                     cl_anything(pRExC_state, data->start_class);
1334                 flags &= ~SCF_DO_STCLASS;
1335                 break;
1336             }
1337         }
1338         else if (strchr((char*)PL_simple,OP(scan))) {
1339             int value = 0;
1340
1341             if (flags & SCF_DO_SUBSTR) {
1342                 scan_commit(pRExC_state,data);
1343                 data->pos_min++;
1344             }
1345             min++;
1346             if (flags & SCF_DO_STCLASS) {
1347                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1348
1349                 /* Some of the logic below assumes that switching
1350                    locale on will only add false positives. */
1351                 switch (PL_regkind[(U8)OP(scan)]) {
1352                 case SANY:
1353                 default:
1354                   do_default:
1355                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1356                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1357                         cl_anything(pRExC_state, data->start_class);
1358                     break;
1359                 case REG_ANY:
1360                     if (OP(scan) == SANY)
1361                         goto do_default;
1362                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1363                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1364                                  || (data->start_class->flags & ANYOF_CLASS));
1365                         cl_anything(pRExC_state, data->start_class);
1366                     }
1367                     if (flags & SCF_DO_STCLASS_AND || !value)
1368                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1369                     break;
1370                 case ANYOF:
1371                     if (flags & SCF_DO_STCLASS_AND)
1372                         cl_and(data->start_class,
1373                                (struct regnode_charclass_class*)scan);
1374                     else
1375                         cl_or(pRExC_state, data->start_class,
1376                               (struct regnode_charclass_class*)scan);
1377                     break;
1378                 case ALNUM:
1379                     if (flags & SCF_DO_STCLASS_AND) {
1380                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1382                             for (value = 0; value < 256; value++)
1383                                 if (!isALNUM(value))
1384                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1385                         }
1386                     }
1387                     else {
1388                         if (data->start_class->flags & ANYOF_LOCALE)
1389                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1390                         else {
1391                             for (value = 0; value < 256; value++)
1392                                 if (isALNUM(value))
1393                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1394                         }
1395                     }
1396                     break;
1397                 case ALNUML:
1398                     if (flags & SCF_DO_STCLASS_AND) {
1399                         if (data->start_class->flags & ANYOF_LOCALE)
1400                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1401                     }
1402                     else {
1403                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1404                         data->start_class->flags |= ANYOF_LOCALE;
1405                     }
1406                     break;
1407                 case NALNUM:
1408                     if (flags & SCF_DO_STCLASS_AND) {
1409                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1411                             for (value = 0; value < 256; value++)
1412                                 if (isALNUM(value))
1413                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1414                         }
1415                     }
1416                     else {
1417                         if (data->start_class->flags & ANYOF_LOCALE)
1418                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1419                         else {
1420                             for (value = 0; value < 256; value++)
1421                                 if (!isALNUM(value))
1422                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1423                         }
1424                     }
1425                     break;
1426                 case NALNUML:
1427                     if (flags & SCF_DO_STCLASS_AND) {
1428                         if (data->start_class->flags & ANYOF_LOCALE)
1429                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1430                     }
1431                     else {
1432                         data->start_class->flags |= ANYOF_LOCALE;
1433                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434                     }
1435                     break;
1436                 case SPACE:
1437                     if (flags & SCF_DO_STCLASS_AND) {
1438                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1440                             for (value = 0; value < 256; value++)
1441                                 if (!isSPACE(value))
1442                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1443                         }
1444                     }
1445                     else {
1446                         if (data->start_class->flags & ANYOF_LOCALE)
1447                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1448                         else {
1449                             for (value = 0; value < 256; value++)
1450                                 if (isSPACE(value))
1451                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1452                         }
1453                     }
1454                     break;
1455                 case SPACEL:
1456                     if (flags & SCF_DO_STCLASS_AND) {
1457                         if (data->start_class->flags & ANYOF_LOCALE)
1458                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1459                     }
1460                     else {
1461                         data->start_class->flags |= ANYOF_LOCALE;
1462                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463                     }
1464                     break;
1465                 case NSPACE:
1466                     if (flags & SCF_DO_STCLASS_AND) {
1467                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1469                             for (value = 0; value < 256; value++)
1470                                 if (isSPACE(value))
1471                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1472                         }
1473                     }
1474                     else {
1475                         if (data->start_class->flags & ANYOF_LOCALE)
1476                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1477                         else {
1478                             for (value = 0; value < 256; value++)
1479                                 if (!isSPACE(value))
1480                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1481                         }
1482                     }
1483                     break;
1484                 case NSPACEL:
1485                     if (flags & SCF_DO_STCLASS_AND) {
1486                         if (data->start_class->flags & ANYOF_LOCALE) {
1487                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1488                             for (value = 0; value < 256; value++)
1489                                 if (!isSPACE(value))
1490                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1491                         }
1492                     }
1493                     else {
1494                         data->start_class->flags |= ANYOF_LOCALE;
1495                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1496                     }
1497                     break;
1498                 case DIGIT:
1499                     if (flags & SCF_DO_STCLASS_AND) {
1500                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1501                         for (value = 0; value < 256; value++)
1502                             if (!isDIGIT(value))
1503                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1504                     }
1505                     else {
1506                         if (data->start_class->flags & ANYOF_LOCALE)
1507                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1508                         else {
1509                             for (value = 0; value < 256; value++)
1510                                 if (isDIGIT(value))
1511                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1512                         }
1513                     }
1514                     break;
1515                 case NDIGIT:
1516                     if (flags & SCF_DO_STCLASS_AND) {
1517                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1518                         for (value = 0; value < 256; value++)
1519                             if (isDIGIT(value))
1520                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1521                     }
1522                     else {
1523                         if (data->start_class->flags & ANYOF_LOCALE)
1524                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1525                         else {
1526                             for (value = 0; value < 256; value++)
1527                                 if (!isDIGIT(value))
1528                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1529                         }
1530                     }
1531                     break;
1532                 }
1533                 if (flags & SCF_DO_STCLASS_OR)
1534                     cl_and(data->start_class, &and_with);
1535                 flags &= ~SCF_DO_STCLASS;
1536             }
1537         }
1538         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1539             data->flags |= (OP(scan) == MEOL
1540                             ? SF_BEFORE_MEOL
1541                             : SF_BEFORE_SEOL);
1542         }
1543         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1544                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1545                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1546                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1547             /* Lookahead/lookbehind */
1548             I32 deltanext, minnext, fake = 0;
1549             regnode *nscan;
1550             struct regnode_charclass_class intrnl;
1551             int f = 0;
1552
1553             data_fake.flags = 0;
1554             if (data) {         
1555                 data_fake.whilem_c = data->whilem_c;
1556                 data_fake.last_closep = data->last_closep;
1557             }
1558             else
1559                 data_fake.last_closep = &fake;
1560             if ( flags & SCF_DO_STCLASS && !scan->flags
1561                  && OP(scan) == IFMATCH ) { /* Lookahead */
1562                 cl_init(pRExC_state, &intrnl);
1563                 data_fake.start_class = &intrnl;
1564                 f |= SCF_DO_STCLASS_AND;
1565             }
1566             if (flags & SCF_WHILEM_VISITED_POS)
1567                 f |= SCF_WHILEM_VISITED_POS;
1568             next = regnext(scan);
1569             nscan = NEXTOPER(NEXTOPER(scan));
1570             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1571             if (scan->flags) {
1572                 if (deltanext) {
1573                     vFAIL("Variable length lookbehind not implemented");
1574                 }
1575                 else if (minnext > U8_MAX) {
1576                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1577                 }
1578                 scan->flags = (U8)minnext;
1579             }
1580             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1581                 pars++;
1582             if (data && (data_fake.flags & SF_HAS_EVAL))
1583                 data->flags |= SF_HAS_EVAL;
1584             if (data)
1585                 data->whilem_c = data_fake.whilem_c;
1586             if (f & SCF_DO_STCLASS_AND) {
1587                 int was = (data->start_class->flags & ANYOF_EOS);
1588
1589                 cl_and(data->start_class, &intrnl);
1590                 if (was)
1591                     data->start_class->flags |= ANYOF_EOS;
1592             }
1593         }
1594         else if (OP(scan) == OPEN) {
1595             pars++;
1596         }
1597         else if (OP(scan) == CLOSE) {
1598             if ((I32)ARG(scan) == is_par) {
1599                 next = regnext(scan);
1600
1601                 if ( next && (OP(next) != WHILEM) && next < last)
1602                     is_par = 0;         /* Disable optimization */
1603             }
1604             if (data)
1605                 *(data->last_closep) = ARG(scan);
1606         }
1607         else if (OP(scan) == EVAL) {
1608                 if (data)
1609                     data->flags |= SF_HAS_EVAL;
1610         }
1611         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1612                 if (flags & SCF_DO_SUBSTR) {
1613                     scan_commit(pRExC_state,data);
1614                     data->longest = &(data->longest_float);
1615                 }
1616                 is_inf = is_inf_internal = 1;
1617                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1618                     cl_anything(pRExC_state, data->start_class);
1619                 flags &= ~SCF_DO_STCLASS;
1620         }
1621         /* Else: zero-length, ignore. */
1622         scan = regnext(scan);
1623     }
1624
1625   finish:
1626     *scanp = scan;
1627     *deltap = is_inf_internal ? I32_MAX : delta;
1628     if (flags & SCF_DO_SUBSTR && is_inf)
1629         data->pos_delta = I32_MAX - data->pos_min;
1630     if (is_par > U8_MAX)
1631         is_par = 0;
1632     if (is_par && pars==1 && data) {
1633         data->flags |= SF_IN_PAR;
1634         data->flags &= ~SF_HAS_PAR;
1635     }
1636     else if (pars && data) {
1637         data->flags |= SF_HAS_PAR;
1638         data->flags &= ~SF_IN_PAR;
1639     }
1640     if (flags & SCF_DO_STCLASS_OR)
1641         cl_and(data->start_class, &and_with);
1642     return min;
1643 }
1644
1645 STATIC I32
1646 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1647 {
1648     if (RExC_rx->data) {
1649         Renewc(RExC_rx->data,
1650                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1651                char, struct reg_data);
1652         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1653         RExC_rx->data->count += n;
1654     }
1655     else {
1656         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1657              char, struct reg_data);
1658         New(1208, RExC_rx->data->what, n, U8);
1659         RExC_rx->data->count = n;
1660     }
1661     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1662     return RExC_rx->data->count - n;
1663 }
1664
1665 void
1666 Perl_reginitcolors(pTHX)
1667 {
1668     int i = 0;
1669     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1670         
1671     if (s) {
1672         PL_colors[0] = s = savepv(s);
1673         while (++i < 6) {
1674             s = strchr(s, '\t');
1675             if (s) {
1676                 *s = '\0';
1677                 PL_colors[i] = ++s;
1678             }
1679             else
1680                 PL_colors[i] = s = "";
1681         }
1682     } else {
1683         while (i < 6)
1684             PL_colors[i++] = "";
1685     }
1686     PL_colorset = 1;
1687 }
1688
1689
1690 /*
1691  - pregcomp - compile a regular expression into internal code
1692  *
1693  * We can't allocate space until we know how big the compiled form will be,
1694  * but we can't compile it (and thus know how big it is) until we've got a
1695  * place to put the code.  So we cheat:  we compile it twice, once with code
1696  * generation turned off and size counting turned on, and once "for real".
1697  * This also means that we don't allocate space until we are sure that the
1698  * thing really will compile successfully, and we never have to move the
1699  * code and thus invalidate pointers into it.  (Note that it has to be in
1700  * one piece because free() must be able to free it all.) [NB: not true in perl]
1701  *
1702  * Beware that the optimization-preparation code in here knows about some
1703  * of the structure of the compiled regexp.  [I'll say.]
1704  */
1705 regexp *
1706 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1707 {
1708     register regexp *r;
1709     regnode *scan;
1710     regnode *first;
1711     I32 flags;
1712     I32 minlen = 0;
1713     I32 sawplus = 0;
1714     I32 sawopen = 0;
1715     scan_data_t data;
1716     RExC_state_t RExC_state;
1717     RExC_state_t *pRExC_state = &RExC_state;
1718
1719     if (exp == NULL)
1720         FAIL("NULL regexp argument");
1721
1722     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1723
1724     RExC_precomp = exp;
1725     DEBUG_r({
1726          if (!PL_colorset) reginitcolors();
1727          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1728                        PL_colors[4],PL_colors[5],PL_colors[0],
1729                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1730     });
1731     RExC_flags = pm->op_pmflags;
1732     RExC_sawback = 0;
1733
1734     RExC_seen = 0;
1735     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1736     RExC_seen_evals = 0;
1737     RExC_extralen = 0;
1738
1739     /* First pass: determine size, legality. */
1740     RExC_parse = exp;
1741     RExC_start = exp;
1742     RExC_end = xend;
1743     RExC_naughty = 0;
1744     RExC_npar = 1;
1745     RExC_size = 0L;
1746     RExC_emit = &PL_regdummy;
1747     RExC_whilem_seen = 0;
1748 #if 0 /* REGC() is (currently) a NOP at the first pass.
1749        * Clever compilers notice this and complain. --jhi */
1750     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1751 #endif
1752     if (reg(pRExC_state, 0, &flags) == NULL) {
1753         RExC_precomp = Nullch;
1754         return(NULL);
1755     }
1756     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1757
1758     /* Small enough for pointer-storage convention?
1759        If extralen==0, this means that we will not need long jumps. */
1760     if (RExC_size >= 0x10000L && RExC_extralen)
1761         RExC_size += RExC_extralen;
1762     else
1763         RExC_extralen = 0;
1764     if (RExC_whilem_seen > 15)
1765         RExC_whilem_seen = 15;
1766
1767     /* Allocate space and initialize. */
1768     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1769          char, regexp);
1770     if (r == NULL)
1771         FAIL("Regexp out of space");
1772
1773 #ifdef DEBUGGING
1774     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1775     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1776 #endif
1777     r->refcnt = 1;
1778     r->prelen = xend - exp;
1779     r->precomp = savepvn(RExC_precomp, r->prelen);
1780     r->subbeg = NULL;
1781 #ifdef PERL_COPY_ON_WRITE
1782     r->saved_copy = Nullsv;
1783 #endif
1784     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1785     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1786
1787     r->substrs = 0;                     /* Useful during FAIL. */
1788     r->startp = 0;                      /* Useful during FAIL. */
1789     r->endp = 0;                        /* Useful during FAIL. */
1790
1791     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1792     if (r->offsets) {
1793       r->offsets[0] = RExC_size; 
1794     }
1795     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1796                           "%s %"UVuf" bytes for offset annotations.\n", 
1797                           r->offsets ? "Got" : "Couldn't get", 
1798                           (UV)((2*RExC_size+1) * sizeof(U32))));
1799
1800     RExC_rx = r;
1801
1802     /* Second pass: emit code. */
1803     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1804     RExC_parse = exp;
1805     RExC_end = xend;
1806     RExC_naughty = 0;
1807     RExC_npar = 1;
1808     RExC_emit_start = r->program;
1809     RExC_emit = r->program;
1810     /* Store the count of eval-groups for security checks: */
1811     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1812     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1813     r->data = 0;
1814     if (reg(pRExC_state, 0, &flags) == NULL)
1815         return(NULL);
1816
1817     /* Dig out information for optimizations. */
1818     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1819     pm->op_pmflags = RExC_flags;
1820     if (UTF)
1821         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1822     r->regstclass = NULL;
1823     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1824         r->reganch |= ROPT_NAUGHTY;
1825     scan = r->program + 1;              /* First BRANCH. */
1826
1827     /* XXXX To minimize changes to RE engine we always allocate
1828        3-units-long substrs field. */
1829     Newz(1004, r->substrs, 1, struct reg_substr_data);
1830
1831     StructCopy(&zero_scan_data, &data, scan_data_t);
1832     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1833     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1834         I32 fake;
1835         STRLEN longest_float_length, longest_fixed_length;
1836         struct regnode_charclass_class ch_class;
1837         int stclass_flag;
1838         I32 last_close = 0;
1839
1840         first = scan;
1841         /* Skip introductions and multiplicators >= 1. */
1842         while ((OP(first) == OPEN && (sawopen = 1)) ||
1843                /* An OR of *one* alternative - should not happen now. */
1844             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1845             (OP(first) == PLUS) ||
1846             (OP(first) == MINMOD) ||
1847                /* An {n,m} with n>0 */
1848             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1849                 if (OP(first) == PLUS)
1850                     sawplus = 1;
1851                 else
1852                     first += regarglen[(U8)OP(first)];
1853                 first = NEXTOPER(first);
1854         }
1855
1856         /* Starting-point info. */
1857       again:
1858         if (PL_regkind[(U8)OP(first)] == EXACT) {
1859             if (OP(first) == EXACT)
1860                 ;       /* Empty, get anchored substr later. */
1861             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1862                 r->regstclass = first;
1863         }
1864         else if (strchr((char*)PL_simple,OP(first)))
1865             r->regstclass = first;
1866         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1867                  PL_regkind[(U8)OP(first)] == NBOUND)
1868             r->regstclass = first;
1869         else if (PL_regkind[(U8)OP(first)] == BOL) {
1870             r->reganch |= (OP(first) == MBOL
1871                            ? ROPT_ANCH_MBOL
1872                            : (OP(first) == SBOL
1873                               ? ROPT_ANCH_SBOL
1874                               : ROPT_ANCH_BOL));
1875             first = NEXTOPER(first);
1876             goto again;
1877         }
1878         else if (OP(first) == GPOS) {
1879             r->reganch |= ROPT_ANCH_GPOS;
1880             first = NEXTOPER(first);
1881             goto again;
1882         }
1883         else if (!sawopen && (OP(first) == STAR &&
1884             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1885             !(r->reganch & ROPT_ANCH) )
1886         {
1887             /* turn .* into ^.* with an implied $*=1 */
1888             int type = OP(NEXTOPER(first));
1889
1890             if (type == REG_ANY)
1891                 type = ROPT_ANCH_MBOL;
1892             else
1893                 type = ROPT_ANCH_SBOL;
1894
1895             r->reganch |= type | ROPT_IMPLICIT;
1896             first = NEXTOPER(first);
1897             goto again;
1898         }
1899         if (sawplus && (!sawopen || !RExC_sawback)
1900             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1901             /* x+ must match at the 1st pos of run of x's */
1902             r->reganch |= ROPT_SKIP;
1903
1904         /* Scan is after the zeroth branch, first is atomic matcher. */
1905         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1906                               (IV)(first - scan + 1)));
1907         /*
1908         * If there's something expensive in the r.e., find the
1909         * longest literal string that must appear and make it the
1910         * regmust.  Resolve ties in favor of later strings, since
1911         * the regstart check works with the beginning of the r.e.
1912         * and avoiding duplication strengthens checking.  Not a
1913         * strong reason, but sufficient in the absence of others.
1914         * [Now we resolve ties in favor of the earlier string if
1915         * it happens that c_offset_min has been invalidated, since the
1916         * earlier string may buy us something the later one won't.]
1917         */
1918         minlen = 0;
1919
1920         data.longest_fixed = newSVpvn("",0);
1921         data.longest_float = newSVpvn("",0);
1922         data.last_found = newSVpvn("",0);
1923         data.longest = &(data.longest_fixed);
1924         first = scan;
1925         if (!r->regstclass) {
1926             cl_init(pRExC_state, &ch_class);
1927             data.start_class = &ch_class;
1928             stclass_flag = SCF_DO_STCLASS_AND;
1929         } else                          /* XXXX Check for BOUND? */
1930             stclass_flag = 0;
1931         data.last_closep = &last_close;
1932
1933         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1934                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1935         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1936              && data.last_start_min == 0 && data.last_end > 0
1937              && !RExC_seen_zerolen
1938              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1939             r->reganch |= ROPT_CHECK_ALL;
1940         scan_commit(pRExC_state, &data);
1941         SvREFCNT_dec(data.last_found);
1942
1943         longest_float_length = CHR_SVLEN(data.longest_float);
1944         if (longest_float_length
1945             || (data.flags & SF_FL_BEFORE_EOL
1946                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1947                     || (RExC_flags & PMf_MULTILINE)))) {
1948             int t;
1949
1950             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1951                 && data.offset_fixed == data.offset_float_min
1952                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1953                     goto remove_float;          /* As in (a)+. */
1954
1955             if (SvUTF8(data.longest_float)) {
1956                 r->float_utf8 = data.longest_float;
1957                 r->float_substr = Nullsv;
1958             } else {
1959                 r->float_substr = data.longest_float;
1960                 r->float_utf8 = Nullsv;
1961             }
1962             r->float_min_offset = data.offset_float_min;
1963             r->float_max_offset = data.offset_float_max;
1964             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1965                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1966                            || (RExC_flags & PMf_MULTILINE)));
1967             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1968         }
1969         else {
1970           remove_float:
1971             r->float_substr = r->float_utf8 = Nullsv;
1972             SvREFCNT_dec(data.longest_float);
1973             longest_float_length = 0;
1974         }
1975
1976         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1977         if (longest_fixed_length
1978             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1979                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1980                     || (RExC_flags & PMf_MULTILINE)))) {
1981             int t;
1982
1983             if (SvUTF8(data.longest_fixed)) {
1984                 r->anchored_utf8 = data.longest_fixed;
1985                 r->anchored_substr = Nullsv;
1986             } else {
1987                 r->anchored_substr = data.longest_fixed;
1988                 r->anchored_utf8 = Nullsv;
1989             }
1990             r->anchored_offset = data.offset_fixed;
1991             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993                      || (RExC_flags & PMf_MULTILINE)));
1994             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
1995         }
1996         else {
1997             r->anchored_substr = r->anchored_utf8 = Nullsv;
1998             SvREFCNT_dec(data.longest_fixed);
1999             longest_fixed_length = 0;
2000         }
2001         if (r->regstclass
2002             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2003             r->regstclass = NULL;
2004         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2005             && stclass_flag
2006             && !(data.start_class->flags & ANYOF_EOS)
2007             && !cl_is_anything(data.start_class))
2008         {
2009             I32 n = add_data(pRExC_state, 1, "f");
2010
2011             New(1006, RExC_rx->data->data[n], 1,
2012                 struct regnode_charclass_class);
2013             StructCopy(data.start_class,
2014                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2015                        struct regnode_charclass_class);
2016             r->regstclass = (regnode*)RExC_rx->data->data[n];
2017             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2018             PL_regdata = r->data; /* for regprop() */
2019             DEBUG_r({ SV *sv = sv_newmortal();
2020                       regprop(sv, (regnode*)data.start_class);
2021                       PerlIO_printf(Perl_debug_log,
2022                                     "synthetic stclass `%s'.\n",
2023                                     SvPVX(sv));});
2024         }
2025
2026         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2027         if (longest_fixed_length > longest_float_length) {
2028             r->check_substr = r->anchored_substr;
2029             r->check_utf8 = r->anchored_utf8;
2030             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2031             if (r->reganch & ROPT_ANCH_SINGLE)
2032                 r->reganch |= ROPT_NOSCAN;
2033         }
2034         else {
2035             r->check_substr = r->float_substr;
2036             r->check_utf8 = r->float_utf8;
2037             r->check_offset_min = data.offset_float_min;
2038             r->check_offset_max = data.offset_float_max;
2039         }
2040         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2041            This should be changed ASAP!  */
2042         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2043             r->reganch |= RE_USE_INTUIT;
2044             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2045                 r->reganch |= RE_INTUIT_TAIL;
2046         }
2047     }
2048     else {
2049         /* Several toplevels. Best we can is to set minlen. */
2050         I32 fake;
2051         struct regnode_charclass_class ch_class;
2052         I32 last_close = 0;
2053         
2054         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2055         scan = r->program + 1;
2056         cl_init(pRExC_state, &ch_class);
2057         data.start_class = &ch_class;
2058         data.last_closep = &last_close;
2059         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2060         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2061                 = r->float_substr = r->float_utf8 = Nullsv;
2062         if (!(data.start_class->flags & ANYOF_EOS)
2063             && !cl_is_anything(data.start_class))
2064         {
2065             I32 n = add_data(pRExC_state, 1, "f");
2066
2067             New(1006, RExC_rx->data->data[n], 1,
2068                 struct regnode_charclass_class);
2069             StructCopy(data.start_class,
2070                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2071                        struct regnode_charclass_class);
2072             r->regstclass = (regnode*)RExC_rx->data->data[n];
2073             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2074             DEBUG_r({ SV* sv = sv_newmortal();
2075                       regprop(sv, (regnode*)data.start_class);
2076                       PerlIO_printf(Perl_debug_log,
2077                                     "synthetic stclass `%s'.\n",
2078                                     SvPVX(sv));});
2079         }
2080     }
2081
2082     r->minlen = minlen;
2083     if (RExC_seen & REG_SEEN_GPOS)
2084         r->reganch |= ROPT_GPOS_SEEN;
2085     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2086         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2087     if (RExC_seen & REG_SEEN_EVAL)
2088         r->reganch |= ROPT_EVAL_SEEN;
2089     if (RExC_seen & REG_SEEN_CANY)
2090         r->reganch |= ROPT_CANY_SEEN;
2091     Newz(1002, r->startp, RExC_npar, I32);
2092     Newz(1002, r->endp, RExC_npar, I32);
2093     PL_regdata = r->data; /* for regprop() */
2094     DEBUG_r(regdump(r));
2095     return(r);
2096 }
2097
2098 /*
2099  - reg - regular expression, i.e. main body or parenthesized thing
2100  *
2101  * Caller must absorb opening parenthesis.
2102  *
2103  * Combining parenthesis handling with the base level of regular expression
2104  * is a trifle forced, but the need to tie the tails of the branches to what
2105  * follows makes it hard to avoid.
2106  */
2107 STATIC regnode *
2108 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2109     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2110 {
2111     register regnode *ret;              /* Will be the head of the group. */
2112     register regnode *br;
2113     register regnode *lastbr;
2114     register regnode *ender = 0;
2115     register I32 parno = 0;
2116     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2117
2118     /* for (?g), (?gc), and (?o) warnings; warning
2119        about (?c) will warn about (?g) -- japhy    */
2120
2121     I32 wastedflags = 0x00,
2122         wasted_o    = 0x01,
2123         wasted_g    = 0x02,
2124         wasted_gc   = 0x02 | 0x04,
2125         wasted_c    = 0x04;
2126
2127     char * parse_start = RExC_parse; /* MJD */
2128     char *oregcomp_parse = RExC_parse;
2129     char c;
2130
2131     *flagp = 0;                         /* Tentatively. */
2132
2133
2134     /* Make an OPEN node, if parenthesized. */
2135     if (paren) {
2136         if (*RExC_parse == '?') { /* (?...) */
2137             U32 posflags = 0, negflags = 0;
2138             U32 *flagsp = &posflags;
2139             int logical = 0;
2140             char *seqstart = RExC_parse;
2141
2142             RExC_parse++;
2143             paren = *RExC_parse++;
2144             ret = NULL;                 /* For look-ahead/behind. */
2145             switch (paren) {
2146             case '<':           /* (?<...) */
2147                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2148                 if (*RExC_parse == '!')
2149                     paren = ',';
2150                 if (*RExC_parse != '=' && *RExC_parse != '!')
2151                     goto unknown;
2152                 RExC_parse++;
2153             case '=':           /* (?=...) */
2154             case '!':           /* (?!...) */
2155                 RExC_seen_zerolen++;
2156             case ':':           /* (?:...) */
2157             case '>':           /* (?>...) */
2158                 break;
2159             case '$':           /* (?$...) */
2160             case '@':           /* (?@...) */
2161                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2162                 break;
2163             case '#':           /* (?#...) */
2164                 while (*RExC_parse && *RExC_parse != ')')
2165                     RExC_parse++;
2166                 if (*RExC_parse != ')')
2167                     FAIL("Sequence (?#... not terminated");
2168                 nextchar(pRExC_state);
2169                 *flagp = TRYAGAIN;
2170                 return NULL;
2171             case 'p':           /* (?p...) */
2172                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2173                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2174                 /* FALL THROUGH*/
2175             case '?':           /* (??...) */
2176                 logical = 1;
2177                 if (*RExC_parse != '{')
2178                     goto unknown;
2179                 paren = *RExC_parse++;
2180                 /* FALL THROUGH */
2181             case '{':           /* (?{...}) */
2182             {
2183                 I32 count = 1, n = 0;
2184                 char c;
2185                 char *s = RExC_parse;
2186                 SV *sv;
2187                 OP_4tree *sop, *rop;
2188
2189                 RExC_seen_zerolen++;
2190                 RExC_seen |= REG_SEEN_EVAL;
2191                 while (count && (c = *RExC_parse)) {
2192                     if (c == '\\' && RExC_parse[1])
2193                         RExC_parse++;
2194                     else if (c == '{')
2195                         count++;
2196                     else if (c == '}')
2197                         count--;
2198                     RExC_parse++;
2199                 }
2200                 if (*RExC_parse != ')')
2201                 {
2202                     RExC_parse = s;             
2203                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2204                 }
2205                 if (!SIZE_ONLY) {
2206                     PAD *pad;
2207                 
2208                     if (RExC_parse - 1 - s)
2209                         sv = newSVpvn(s, RExC_parse - 1 - s);
2210                     else
2211                         sv = newSVpvn("", 0);
2212
2213                     ENTER;
2214                     Perl_save_re_context(aTHX);
2215                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2216                     sop->op_private |= OPpREFCOUNTED;
2217                     /* re_dup will OpREFCNT_inc */
2218                     OpREFCNT_set(sop, 1);
2219                     LEAVE;
2220
2221                     n = add_data(pRExC_state, 3, "nop");
2222                     RExC_rx->data->data[n] = (void*)rop;
2223                     RExC_rx->data->data[n+1] = (void*)sop;
2224                     RExC_rx->data->data[n+2] = (void*)pad;
2225                     SvREFCNT_dec(sv);
2226                 }
2227                 else {                                          /* First pass */
2228                     if (PL_reginterp_cnt < ++RExC_seen_evals
2229                         && PL_curcop != &PL_compiling)
2230                         /* No compiled RE interpolated, has runtime
2231                            components ===> unsafe.  */
2232                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2233                     if (PL_tainting && PL_tainted)
2234                         FAIL("Eval-group in insecure regular expression");
2235                 }
2236                 
2237                 nextchar(pRExC_state);
2238                 if (logical) {
2239                     ret = reg_node(pRExC_state, LOGICAL);
2240                     if (!SIZE_ONLY)
2241                         ret->flags = 2;
2242                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2243                     /* deal with the length of this later - MJD */
2244                     return ret;
2245                 }
2246                 ret = reganode(pRExC_state, EVAL, n);
2247                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2248                 Set_Node_Offset(ret, parse_start);
2249                 return ret;
2250             }
2251             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2252             {
2253                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2254                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2255                         || RExC_parse[1] == '<'
2256                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2257                         I32 flag;
2258                         
2259                         ret = reg_node(pRExC_state, LOGICAL);
2260                         if (!SIZE_ONLY)
2261                             ret->flags = 1;
2262                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2263                         goto insert_if;
2264                     }
2265                 }
2266                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2267                     /* (?(1)...) */
2268                     parno = atoi(RExC_parse++);
2269
2270                     while (isDIGIT(*RExC_parse))
2271                         RExC_parse++;
2272                     ret = reganode(pRExC_state, GROUPP, parno);
2273                     
2274                     if ((c = *nextchar(pRExC_state)) != ')')
2275                         vFAIL("Switch condition not recognized");
2276                   insert_if:
2277                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2278                     br = regbranch(pRExC_state, &flags, 1);
2279                     if (br == NULL)
2280                         br = reganode(pRExC_state, LONGJMP, 0);
2281                     else
2282                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2283                     c = *nextchar(pRExC_state);
2284                     if (flags&HASWIDTH)
2285                         *flagp |= HASWIDTH;
2286                     if (c == '|') {
2287                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2288                         regbranch(pRExC_state, &flags, 1);
2289                         regtail(pRExC_state, ret, lastbr);
2290                         if (flags&HASWIDTH)
2291                             *flagp |= HASWIDTH;
2292                         c = *nextchar(pRExC_state);
2293                     }
2294                     else
2295                         lastbr = NULL;
2296                     if (c != ')')
2297                         vFAIL("Switch (?(condition)... contains too many branches");
2298                     ender = reg_node(pRExC_state, TAIL);
2299                     regtail(pRExC_state, br, ender);
2300                     if (lastbr) {
2301                         regtail(pRExC_state, lastbr, ender);
2302                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2303                     }
2304                     else
2305                         regtail(pRExC_state, ret, ender);
2306                     return ret;
2307                 }
2308                 else {
2309                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2310                 }
2311             }
2312             case 0:
2313                 RExC_parse--; /* for vFAIL to print correctly */
2314                 vFAIL("Sequence (? incomplete");
2315                 break;
2316             default:
2317                 --RExC_parse;
2318               parse_flags:      /* (?i) */
2319                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2320                     /* (?g), (?gc) and (?o) are useless here
2321                        and must be globally applied -- japhy */
2322
2323                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2324                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2325                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2326                             if (! (wastedflags & wflagbit) ) {
2327                                 wastedflags |= wflagbit;
2328                                 vWARN5(
2329                                     RExC_parse + 1,
2330                                     "Useless (%s%c) - %suse /%c modifier",
2331                                     flagsp == &negflags ? "?-" : "?",
2332                                     *RExC_parse,
2333                                     flagsp == &negflags ? "don't " : "",
2334                                     *RExC_parse
2335                                 );
2336                             }
2337                         }
2338                     }
2339                     else if (*RExC_parse == 'c') {
2340                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2341                             if (! (wastedflags & wasted_c) ) {
2342                                 wastedflags |= wasted_gc;
2343                                 vWARN3(
2344                                     RExC_parse + 1,
2345                                     "Useless (%sc) - %suse /gc modifier",
2346                                     flagsp == &negflags ? "?-" : "?",
2347                                     flagsp == &negflags ? "don't " : ""
2348                                 );
2349                             }
2350                         }
2351                     }
2352                     else { pmflag(flagsp, *RExC_parse); }
2353
2354                     ++RExC_parse;
2355                 }
2356                 if (*RExC_parse == '-') {
2357                     flagsp = &negflags;
2358                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2359                     ++RExC_parse;
2360                     goto parse_flags;
2361                 }
2362                 RExC_flags |= posflags;
2363                 RExC_flags &= ~negflags;
2364                 if (*RExC_parse == ':') {
2365                     RExC_parse++;
2366                     paren = ':';
2367                     break;
2368                 }               
2369               unknown:
2370                 if (*RExC_parse != ')') {
2371                     RExC_parse++;
2372                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2373                 }
2374                 nextchar(pRExC_state);
2375                 *flagp = TRYAGAIN;
2376                 return NULL;
2377             }
2378         }
2379         else {                  /* (...) */
2380             parno = RExC_npar;
2381             RExC_npar++;
2382             ret = reganode(pRExC_state, OPEN, parno);
2383             Set_Node_Length(ret, 1); /* MJD */
2384             Set_Node_Offset(ret, RExC_parse); /* MJD */
2385             open = 1;
2386         }
2387     }
2388     else                        /* ! paren */
2389         ret = NULL;
2390
2391     /* Pick up the branches, linking them together. */
2392     parse_start = RExC_parse;   /* MJD */
2393     br = regbranch(pRExC_state, &flags, 1);
2394     /*     branch_len = (paren != 0); */
2395     
2396     if (br == NULL)
2397         return(NULL);
2398     if (*RExC_parse == '|') {
2399         if (!SIZE_ONLY && RExC_extralen) {
2400             reginsert(pRExC_state, BRANCHJ, br);
2401         }
2402         else {                  /* MJD */
2403             reginsert(pRExC_state, BRANCH, br);
2404             Set_Node_Length(br, paren != 0);
2405             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2406         }
2407         have_branch = 1;
2408         if (SIZE_ONLY)
2409             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2410     }
2411     else if (paren == ':') {
2412         *flagp |= flags&SIMPLE;
2413     }
2414     if (open) {                         /* Starts with OPEN. */
2415         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2416     }
2417     else if (paren != '?')              /* Not Conditional */
2418         ret = br;
2419     *flagp |= flags & (SPSTART | HASWIDTH);
2420     lastbr = br;
2421     while (*RExC_parse == '|') {
2422         if (!SIZE_ONLY && RExC_extralen) {
2423             ender = reganode(pRExC_state, LONGJMP,0);
2424             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2425         }
2426         if (SIZE_ONLY)
2427             RExC_extralen += 2;         /* Account for LONGJMP. */
2428         nextchar(pRExC_state);
2429         br = regbranch(pRExC_state, &flags, 0);
2430         
2431         if (br == NULL)
2432             return(NULL);
2433         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2434         lastbr = br;
2435         if (flags&HASWIDTH)
2436             *flagp |= HASWIDTH;
2437         *flagp |= flags&SPSTART;
2438     }
2439
2440     if (have_branch || paren != ':') {
2441         /* Make a closing node, and hook it on the end. */
2442         switch (paren) {
2443         case ':':
2444             ender = reg_node(pRExC_state, TAIL);
2445             break;
2446         case 1:
2447             ender = reganode(pRExC_state, CLOSE, parno);
2448             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2449             Set_Node_Length(ender,1); /* MJD */
2450             break;
2451         case '<':
2452         case ',':
2453         case '=':
2454         case '!':
2455             *flagp &= ~HASWIDTH;
2456             /* FALL THROUGH */
2457         case '>':
2458             ender = reg_node(pRExC_state, SUCCEED);
2459             break;
2460         case 0:
2461             ender = reg_node(pRExC_state, END);
2462             break;
2463         }
2464         regtail(pRExC_state, lastbr, ender);
2465
2466         if (have_branch) {
2467             /* Hook the tails of the branches to the closing node. */
2468             for (br = ret; br != NULL; br = regnext(br)) {
2469                 regoptail(pRExC_state, br, ender);
2470             }
2471         }
2472     }
2473
2474     {
2475         char *p;
2476         static char parens[] = "=!<,>";
2477
2478         if (paren && (p = strchr(parens, paren))) {
2479             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2480             int flag = (p - parens) > 1;
2481
2482             if (paren == '>')
2483                 node = SUSPEND, flag = 0;
2484             reginsert(pRExC_state, node,ret);
2485             Set_Node_Offset(ret, oregcomp_parse);
2486             Set_Node_Length(ret,  RExC_parse - oregcomp_parse + 2);
2487             ret->flags = flag;
2488             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2489         }
2490     }
2491
2492     /* Check for proper termination. */
2493     if (paren) {
2494         RExC_flags = oregflags;
2495         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2496             RExC_parse = oregcomp_parse;
2497             vFAIL("Unmatched (");
2498         }
2499     }
2500     else if (!paren && RExC_parse < RExC_end) {
2501         if (*RExC_parse == ')') {
2502             RExC_parse++;
2503             vFAIL("Unmatched )");
2504         }
2505         else
2506             FAIL("Junk on end of regexp");      /* "Can't happen". */
2507         /* NOTREACHED */
2508     }
2509
2510     return(ret);
2511 }
2512
2513 /*
2514  - regbranch - one alternative of an | operator
2515  *
2516  * Implements the concatenation operator.
2517  */
2518 STATIC regnode *
2519 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2520 {
2521     register regnode *ret;
2522     register regnode *chain = NULL;
2523     register regnode *latest;
2524     I32 flags = 0, c = 0;
2525
2526     if (first)
2527         ret = NULL;
2528     else {
2529         if (!SIZE_ONLY && RExC_extralen)
2530             ret = reganode(pRExC_state, BRANCHJ,0);
2531         else {
2532             ret = reg_node(pRExC_state, BRANCH);
2533             Set_Node_Length(ret, 1);
2534         }
2535     }
2536         
2537     if (!first && SIZE_ONLY)
2538         RExC_extralen += 1;                     /* BRANCHJ */
2539
2540     *flagp = WORST;                     /* Tentatively. */
2541
2542     RExC_parse--;
2543     nextchar(pRExC_state);
2544     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2545         flags &= ~TRYAGAIN;
2546         latest = regpiece(pRExC_state, &flags);
2547         if (latest == NULL) {
2548             if (flags & TRYAGAIN)
2549                 continue;
2550             return(NULL);
2551         }
2552         else if (ret == NULL)
2553             ret = latest;
2554         *flagp |= flags&HASWIDTH;
2555         if (chain == NULL)      /* First piece. */
2556             *flagp |= flags&SPSTART;
2557         else {
2558             RExC_naughty++;
2559             regtail(pRExC_state, chain, latest);
2560         }
2561         chain = latest;
2562         c++;
2563     }
2564     if (chain == NULL) {        /* Loop ran zero times. */
2565         chain = reg_node(pRExC_state, NOTHING);
2566         if (ret == NULL)
2567             ret = chain;
2568     }
2569     if (c == 1) {
2570         *flagp |= flags&SIMPLE;
2571     }
2572
2573     return(ret);
2574 }
2575
2576 /*
2577  - regpiece - something followed by possible [*+?]
2578  *
2579  * Note that the branching code sequences used for ? and the general cases
2580  * of * and + are somewhat optimized:  they use the same NOTHING node as
2581  * both the endmarker for their branch list and the body of the last branch.
2582  * It might seem that this node could be dispensed with entirely, but the
2583  * endmarker role is not redundant.
2584  */
2585 STATIC regnode *
2586 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2587 {
2588     register regnode *ret;
2589     register char op;
2590     register char *next;
2591     I32 flags;
2592     char *origparse = RExC_parse;
2593     char *maxpos;
2594     I32 min;
2595     I32 max = REG_INFTY;
2596     char *parse_start;
2597
2598     ret = regatom(pRExC_state, &flags);
2599     if (ret == NULL) {
2600         if (flags & TRYAGAIN)
2601             *flagp |= TRYAGAIN;
2602         return(NULL);
2603     }
2604
2605     op = *RExC_parse;
2606
2607     if (op == '{' && regcurly(RExC_parse)) {
2608         parse_start = RExC_parse; /* MJD */
2609         next = RExC_parse + 1;
2610         maxpos = Nullch;
2611         while (isDIGIT(*next) || *next == ',') {
2612             if (*next == ',') {
2613                 if (maxpos)
2614                     break;
2615                 else
2616                     maxpos = next;
2617             }
2618             next++;
2619         }
2620         if (*next == '}') {             /* got one */
2621             if (!maxpos)
2622                 maxpos = next;
2623             RExC_parse++;
2624             min = atoi(RExC_parse);
2625             if (*maxpos == ',')
2626                 maxpos++;
2627             else
2628                 maxpos = RExC_parse;
2629             max = atoi(maxpos);
2630             if (!max && *maxpos != '0')
2631                 max = REG_INFTY;                /* meaning "infinity" */
2632             else if (max >= REG_INFTY)
2633                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2634             RExC_parse = next;
2635             nextchar(pRExC_state);
2636
2637         do_curly:
2638             if ((flags&SIMPLE)) {
2639                 RExC_naughty += 2 + RExC_naughty / 2;
2640                 reginsert(pRExC_state, CURLY, ret);
2641                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2642                 Set_Node_Cur_Length(ret);
2643             }
2644             else {
2645                 regnode *w = reg_node(pRExC_state, WHILEM);
2646
2647                 w->flags = 0;
2648                 regtail(pRExC_state, ret, w);
2649                 if (!SIZE_ONLY && RExC_extralen) {
2650                     reginsert(pRExC_state, LONGJMP,ret);
2651                     reginsert(pRExC_state, NOTHING,ret);
2652                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2653                 }
2654                 reginsert(pRExC_state, CURLYX,ret);
2655                                 /* MJD hk */
2656                 Set_Node_Offset(ret, parse_start+1);
2657                 Set_Node_Length(ret, 
2658                                 op == '{' ? (RExC_parse - parse_start) : 1);
2659                 
2660                 if (!SIZE_ONLY && RExC_extralen)
2661                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2662                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2663                 if (SIZE_ONLY)
2664                     RExC_whilem_seen++, RExC_extralen += 3;
2665                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2666             }
2667             ret->flags = 0;
2668
2669             if (min > 0)
2670                 *flagp = WORST;
2671             if (max > 0)
2672                 *flagp |= HASWIDTH;
2673             if (max && max < min)
2674                 vFAIL("Can't do {n,m} with n > m");
2675             if (!SIZE_ONLY) {
2676                 ARG1_SET(ret, (U16)min);
2677                 ARG2_SET(ret, (U16)max);
2678             }
2679
2680             goto nest_check;
2681         }
2682     }
2683
2684     if (!ISMULT1(op)) {
2685         *flagp = flags;
2686         return(ret);
2687     }
2688
2689 #if 0                           /* Now runtime fix should be reliable. */
2690
2691     /* if this is reinstated, don't forget to put this back into perldiag:
2692
2693             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2694
2695            (F) The part of the regexp subject to either the * or + quantifier
2696            could match an empty string. The {#} shows in the regular
2697            expression about where the problem was discovered.
2698
2699     */
2700
2701     if (!(flags&HASWIDTH) && op != '?')
2702       vFAIL("Regexp *+ operand could be empty");
2703 #endif
2704
2705     parse_start = RExC_parse;
2706     nextchar(pRExC_state);
2707
2708     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2709
2710     if (op == '*' && (flags&SIMPLE)) {
2711         reginsert(pRExC_state, STAR, ret);
2712         ret->flags = 0;
2713         RExC_naughty += 4;
2714     }
2715     else if (op == '*') {
2716         min = 0;
2717         goto do_curly;
2718     }
2719     else if (op == '+' && (flags&SIMPLE)) {
2720         reginsert(pRExC_state, PLUS, ret);
2721         ret->flags = 0;
2722         RExC_naughty += 3;
2723     }
2724     else if (op == '+') {
2725         min = 1;
2726         goto do_curly;
2727     }
2728     else if (op == '?') {
2729         min = 0; max = 1;
2730         goto do_curly;
2731     }
2732   nest_check:
2733     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2734         vWARN3(RExC_parse,
2735                "%.*s matches null string many times",
2736                RExC_parse - origparse,
2737                origparse);
2738     }
2739
2740     if (*RExC_parse == '?') {
2741         nextchar(pRExC_state);
2742         reginsert(pRExC_state, MINMOD, ret);
2743         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2744     }
2745     if (ISMULT2(RExC_parse)) {
2746         RExC_parse++;
2747         vFAIL("Nested quantifiers");
2748     }
2749
2750     return(ret);
2751 }
2752
2753 /*
2754  - regatom - the lowest level
2755  *
2756  * Optimization:  gobbles an entire sequence of ordinary characters so that
2757  * it can turn them into a single node, which is smaller to store and
2758  * faster to run.  Backslashed characters are exceptions, each becoming a
2759  * separate node; the code is simpler that way and it's not worth fixing.
2760  *
2761  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2762 STATIC regnode *
2763 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2764 {
2765     register regnode *ret = 0;
2766     I32 flags;
2767     char *parse_start = 0;
2768
2769     *flagp = WORST;             /* Tentatively. */
2770
2771 tryagain:
2772     switch (*RExC_parse) {
2773     case '^':
2774         RExC_seen_zerolen++;
2775         nextchar(pRExC_state);
2776         if (RExC_flags & PMf_MULTILINE)
2777             ret = reg_node(pRExC_state, MBOL);
2778         else if (RExC_flags & PMf_SINGLELINE)
2779             ret = reg_node(pRExC_state, SBOL);
2780         else
2781             ret = reg_node(pRExC_state, BOL);
2782         Set_Node_Length(ret, 1); /* MJD */
2783         break;
2784     case '$':
2785         nextchar(pRExC_state);
2786         if (*RExC_parse)
2787             RExC_seen_zerolen++;
2788         if (RExC_flags & PMf_MULTILINE)
2789             ret = reg_node(pRExC_state, MEOL);
2790         else if (RExC_flags & PMf_SINGLELINE)
2791             ret = reg_node(pRExC_state, SEOL);
2792         else
2793             ret = reg_node(pRExC_state, EOL);
2794         Set_Node_Length(ret, 1); /* MJD */
2795         break;
2796     case '.':
2797         nextchar(pRExC_state);
2798         if (RExC_flags & PMf_SINGLELINE)
2799             ret = reg_node(pRExC_state, SANY);
2800         else
2801             ret = reg_node(pRExC_state, REG_ANY);
2802         *flagp |= HASWIDTH|SIMPLE;
2803         RExC_naughty++;
2804         Set_Node_Length(ret, 1); /* MJD */
2805         break;
2806     case '[':
2807     {
2808         char *oregcomp_parse = ++RExC_parse;
2809         ret = regclass(pRExC_state);
2810         if (*RExC_parse != ']') {
2811             RExC_parse = oregcomp_parse;
2812             vFAIL("Unmatched [");
2813         }
2814         nextchar(pRExC_state);
2815         *flagp |= HASWIDTH|SIMPLE;
2816         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2817         break;
2818     }
2819     case '(':
2820         nextchar(pRExC_state);
2821         ret = reg(pRExC_state, 1, &flags);
2822         if (ret == NULL) {
2823                 if (flags & TRYAGAIN) {
2824                     if (RExC_parse == RExC_end) {
2825                          /* Make parent create an empty node if needed. */
2826                         *flagp |= TRYAGAIN;
2827                         return(NULL);
2828                     }
2829                     goto tryagain;
2830                 }
2831                 return(NULL);
2832         }
2833         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2834         break;
2835     case '|':
2836     case ')':
2837         if (flags & TRYAGAIN) {
2838             *flagp |= TRYAGAIN;
2839             return NULL;
2840         }
2841         vFAIL("Internal urp");
2842                                 /* Supposed to be caught earlier. */
2843         break;
2844     case '{':
2845         if (!regcurly(RExC_parse)) {
2846             RExC_parse++;
2847             goto defchar;
2848         }
2849         /* FALL THROUGH */
2850     case '?':
2851     case '+':
2852     case '*':
2853         RExC_parse++;
2854         vFAIL("Quantifier follows nothing");
2855         break;
2856     case '\\':
2857         switch (*++RExC_parse) {
2858         case 'A':
2859             RExC_seen_zerolen++;
2860             ret = reg_node(pRExC_state, SBOL);
2861             *flagp |= SIMPLE;
2862             nextchar(pRExC_state);
2863             Set_Node_Length(ret, 2); /* MJD */
2864             break;
2865         case 'G':
2866             ret = reg_node(pRExC_state, GPOS);
2867             RExC_seen |= REG_SEEN_GPOS;
2868             *flagp |= SIMPLE;
2869             nextchar(pRExC_state);
2870             Set_Node_Length(ret, 2); /* MJD */
2871             break;
2872         case 'Z':
2873             ret = reg_node(pRExC_state, SEOL);
2874             *flagp |= SIMPLE;
2875             RExC_seen_zerolen++;                /* Do not optimize RE away */
2876             nextchar(pRExC_state);
2877             break;
2878         case 'z':
2879             ret = reg_node(pRExC_state, EOS);
2880             *flagp |= SIMPLE;
2881             RExC_seen_zerolen++;                /* Do not optimize RE away */
2882             nextchar(pRExC_state);
2883             Set_Node_Length(ret, 2); /* MJD */
2884             break;
2885         case 'C':
2886             ret = reg_node(pRExC_state, CANY);
2887             RExC_seen |= REG_SEEN_CANY;
2888             *flagp |= HASWIDTH|SIMPLE;
2889             nextchar(pRExC_state);
2890             Set_Node_Length(ret, 2); /* MJD */
2891             break;
2892         case 'X':
2893             ret = reg_node(pRExC_state, CLUMP);
2894             *flagp |= HASWIDTH;
2895             nextchar(pRExC_state);
2896             Set_Node_Length(ret, 2); /* MJD */
2897             break;
2898         case 'w':
2899             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2900             *flagp |= HASWIDTH|SIMPLE;
2901             nextchar(pRExC_state);
2902             Set_Node_Length(ret, 2); /* MJD */
2903             break;
2904         case 'W':
2905             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2906             *flagp |= HASWIDTH|SIMPLE;
2907             nextchar(pRExC_state);
2908             Set_Node_Length(ret, 2); /* MJD */
2909             break;
2910         case 'b':
2911             RExC_seen_zerolen++;
2912             RExC_seen |= REG_SEEN_LOOKBEHIND;
2913             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2914             *flagp |= SIMPLE;
2915             nextchar(pRExC_state);
2916             Set_Node_Length(ret, 2); /* MJD */
2917             break;
2918         case 'B':
2919             RExC_seen_zerolen++;
2920             RExC_seen |= REG_SEEN_LOOKBEHIND;
2921             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2922             *flagp |= SIMPLE;
2923             nextchar(pRExC_state);
2924             Set_Node_Length(ret, 2); /* MJD */
2925             break;
2926         case 's':
2927             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2928             *flagp |= HASWIDTH|SIMPLE;
2929             nextchar(pRExC_state);
2930             Set_Node_Length(ret, 2); /* MJD */
2931             break;
2932         case 'S':
2933             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2934             *flagp |= HASWIDTH|SIMPLE;
2935             nextchar(pRExC_state);
2936             Set_Node_Length(ret, 2); /* MJD */
2937             break;
2938         case 'd':
2939             ret = reg_node(pRExC_state, DIGIT);
2940             *flagp |= HASWIDTH|SIMPLE;
2941             nextchar(pRExC_state);
2942             Set_Node_Length(ret, 2); /* MJD */
2943             break;
2944         case 'D':
2945             ret = reg_node(pRExC_state, NDIGIT);
2946             *flagp |= HASWIDTH|SIMPLE;
2947             nextchar(pRExC_state);
2948             Set_Node_Length(ret, 2); /* MJD */
2949             break;
2950         case 'p':
2951         case 'P':
2952             {   
2953                 char* oldregxend = RExC_end;
2954                 char* parse_start = RExC_parse - 2;
2955
2956                 if (RExC_parse[1] == '{') {
2957                   /* a lovely hack--pretend we saw [\pX] instead */
2958                     RExC_end = strchr(RExC_parse, '}');
2959                     if (!RExC_end) {
2960                         U8 c = (U8)*RExC_parse;
2961                         RExC_parse += 2;
2962                         RExC_end = oldregxend;
2963                         vFAIL2("Missing right brace on \\%c{}", c);
2964                     }
2965                     RExC_end++;
2966                 }
2967                 else {
2968                     RExC_end = RExC_parse + 2;
2969                     if (RExC_end > oldregxend)
2970                         RExC_end = oldregxend;
2971                 }
2972                 RExC_parse--;
2973
2974                 ret = regclass(pRExC_state);
2975
2976                 RExC_end = oldregxend;
2977                 RExC_parse--;
2978
2979                 Set_Node_Offset(ret, parse_start + 2);
2980                 Set_Node_Cur_Length(ret);
2981                 nextchar(pRExC_state);
2982                 *flagp |= HASWIDTH|SIMPLE;
2983             }
2984             break;
2985         case 'n':
2986         case 'r':
2987         case 't':
2988         case 'f':
2989         case 'e':
2990         case 'a':
2991         case 'x':
2992         case 'c':
2993         case '0':
2994             goto defchar;
2995         case '1': case '2': case '3': case '4':
2996         case '5': case '6': case '7': case '8': case '9':
2997             {
2998                 I32 num = atoi(RExC_parse);
2999
3000                 if (num > 9 && num >= RExC_npar)
3001                     goto defchar;
3002                 else {
3003                     char * parse_start = RExC_parse - 1; /* MJD */
3004                     while (isDIGIT(*RExC_parse))
3005                         RExC_parse++;
3006
3007                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3008                         vFAIL("Reference to nonexistent group");
3009                     RExC_sawback = 1;
3010                     ret = reganode(pRExC_state,
3011                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3012                                    num);
3013                     *flagp |= HASWIDTH;
3014                     
3015                     /* override incorrect value set in reganode MJD */
3016                     Set_Node_Offset(ret, parse_start+1); 
3017                     Set_Node_Cur_Length(ret); /* MJD */
3018                     RExC_parse--;
3019                     nextchar(pRExC_state);
3020                 }
3021             }
3022             break;
3023         case '\0':
3024             if (RExC_parse >= RExC_end)
3025                 FAIL("Trailing \\");
3026             /* FALL THROUGH */
3027         default:
3028             /* Do not generate `unrecognized' warnings here, we fall
3029                back into the quick-grab loop below */
3030             goto defchar;
3031         }
3032         break;
3033
3034     case '#':
3035         if (RExC_flags & PMf_EXTENDED) {
3036             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3037             if (RExC_parse < RExC_end)
3038                 goto tryagain;
3039         }
3040         /* FALL THROUGH */
3041
3042     default: {
3043             register STRLEN len;
3044             register UV ender;
3045             register char *p;
3046             char *oldp, *s;
3047             STRLEN numlen;
3048             STRLEN foldlen;
3049             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3050
3051             parse_start = RExC_parse - 1;
3052
3053             RExC_parse++;
3054
3055         defchar:
3056             ender = 0;
3057             ret = reg_node(pRExC_state,
3058                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3059             s = STRING(ret);
3060             for (len = 0, p = RExC_parse - 1;
3061               len < 127 && p < RExC_end;
3062               len++)
3063             {
3064                 oldp = p;
3065
3066                 if (RExC_flags & PMf_EXTENDED)
3067                     p = regwhite(p, RExC_end);
3068                 switch (*p) {
3069                 case '^':
3070                 case '$':
3071                 case '.':
3072                 case '[':
3073                 case '(':
3074                 case ')':
3075                 case '|':
3076                     goto loopdone;
3077                 case '\\':
3078                     switch (*++p) {
3079                     case 'A':
3080                     case 'C':
3081                     case 'X':
3082                     case 'G':
3083                     case 'Z':
3084                     case 'z':
3085                     case 'w':
3086                     case 'W':
3087                     case 'b':
3088                     case 'B':
3089                     case 's':
3090                     case 'S':
3091                     case 'd':
3092                     case 'D':
3093                     case 'p':
3094                     case 'P':
3095                         --p;
3096                         goto loopdone;
3097                     case 'n':
3098                         ender = '\n';
3099                         p++;
3100                         break;
3101                     case 'r':
3102                         ender = '\r';
3103                         p++;
3104                         break;
3105                     case 't':
3106                         ender = '\t';
3107                         p++;
3108                         break;
3109                     case 'f':
3110                         ender = '\f';
3111                         p++;
3112                         break;
3113                     case 'e':
3114                           ender = ASCII_TO_NATIVE('\033');
3115                         p++;
3116                         break;
3117                     case 'a':
3118                           ender = ASCII_TO_NATIVE('\007');
3119                         p++;
3120                         break;
3121                     case 'x':
3122                         if (*++p == '{') {
3123                             char* e = strchr(p, '}');
3124         
3125                             if (!e) {
3126                                 RExC_parse = p + 1;
3127                                 vFAIL("Missing right brace on \\x{}");
3128                             }
3129                             else {
3130                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3131                                     | PERL_SCAN_DISALLOW_PREFIX;
3132                                 numlen = e - p - 1;
3133                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3134                                 if (ender > 0xff)
3135                                     RExC_utf8 = 1;
3136                                 /* numlen is generous */
3137                                 if (numlen + len >= 127) {
3138                                     p--;
3139                                     goto loopdone;
3140                                 }
3141                                 p = e + 1;
3142                             }
3143                         }
3144                         else {
3145                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3146                             numlen = 2;
3147                             ender = grok_hex(p, &numlen, &flags, NULL);
3148                             p += numlen;
3149                         }
3150                         break;
3151                     case 'c':
3152                         p++;
3153                         ender = UCHARAT(p++);
3154                         ender = toCTRL(ender);
3155                         break;
3156                     case '0': case '1': case '2': case '3':case '4':
3157                     case '5': case '6': case '7': case '8':case '9':
3158                         if (*p == '0' ||
3159                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3160                             I32 flags = 0;
3161                             numlen = 3;
3162                             ender = grok_oct(p, &numlen, &flags, NULL);
3163                             p += numlen;
3164                         }
3165                         else {
3166                             --p;
3167                             goto loopdone;
3168                         }
3169                         break;
3170                     case '\0':
3171                         if (p >= RExC_end)
3172                             FAIL("Trailing \\");
3173                         /* FALL THROUGH */
3174                     default:
3175                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3176                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3177                         goto normal_default;
3178                     }
3179                     break;
3180                 default:
3181                   normal_default:
3182                     if (UTF8_IS_START(*p) && UTF) {
3183                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3184                                                &numlen, 0);
3185                         p += numlen;
3186                     }
3187                     else
3188                         ender = *p++;
3189                     break;
3190                 }
3191                 if (RExC_flags & PMf_EXTENDED)
3192                     p = regwhite(p, RExC_end);
3193                 if (UTF && FOLD) {
3194                     /* Prime the casefolded buffer. */
3195                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3196                 }
3197                 if (ISMULT2(p)) { /* Back off on ?+*. */
3198                     if (len)
3199                         p = oldp;
3200                     else if (UTF) {
3201                          STRLEN unilen;
3202
3203                          if (FOLD) {
3204                               /* Emit all the Unicode characters. */
3205                               for (foldbuf = tmpbuf;
3206                                    foldlen;
3207                                    foldlen -= numlen) {
3208                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3209                                    if (numlen > 0) {
3210                                         reguni(pRExC_state, ender, s, &unilen);
3211                                         s       += unilen;
3212                                         len     += unilen;
3213                                         /* In EBCDIC the numlen
3214                                          * and unilen can differ. */
3215                                         foldbuf += numlen;
3216                                         if (numlen >= foldlen)
3217                                              break;
3218                                    }
3219                                    else
3220                                         break; /* "Can't happen." */
3221                               }
3222                          }
3223                          else {
3224                               reguni(pRExC_state, ender, s, &unilen);
3225                               if (unilen > 0) {
3226                                    s   += unilen;
3227                                    len += unilen;
3228                               }
3229                          }
3230                     }
3231                     else {
3232                         len++;
3233                         REGC((char)ender, s++);
3234                     }
3235                     break;
3236                 }
3237                 if (UTF) {
3238                      STRLEN unilen;
3239
3240                      if (FOLD) {
3241                           /* Emit all the Unicode characters. */
3242                           for (foldbuf = tmpbuf;
3243                                foldlen;
3244                                foldlen -= numlen) {
3245                                ender = utf8_to_uvchr(foldbuf, &numlen);
3246                                if (numlen > 0) {
3247                                     reguni(pRExC_state, ender, s, &unilen);
3248                                     len     += unilen;
3249                                     s       += unilen;
3250                                     /* In EBCDIC the numlen
3251                                      * and unilen can differ. */
3252                                     foldbuf += numlen;
3253                                     if (numlen >= foldlen)
3254                                          break;
3255                                }
3256                                else
3257                                     break;
3258                           }
3259                      }
3260                      else {
3261                           reguni(pRExC_state, ender, s, &unilen);
3262                           if (unilen > 0) {
3263                                s   += unilen;
3264                                len += unilen;
3265                           }
3266                      }
3267                      len--;
3268                 }
3269                 else
3270                     REGC((char)ender, s++);
3271             }
3272         loopdone:
3273             RExC_parse = p - 1;
3274             Set_Node_Cur_Length(ret); /* MJD */
3275             nextchar(pRExC_state);
3276             {
3277                 /* len is STRLEN which is unsigned, need to copy to signed */
3278                 IV iv = len;
3279                 if (iv < 0)
3280                     vFAIL("Internal disaster");
3281             }
3282             if (len > 0)
3283                 *flagp |= HASWIDTH;
3284             if (len == 1)
3285                 *flagp |= SIMPLE;
3286             if (!SIZE_ONLY)
3287                 STR_LEN(ret) = len;
3288             if (SIZE_ONLY)
3289                 RExC_size += STR_SZ(len);
3290             else
3291                 RExC_emit += STR_SZ(len);
3292         }
3293         break;
3294     }
3295
3296     /* If the encoding pragma is in effect recode the text of
3297      * any EXACT-kind nodes. */
3298     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3299         STRLEN oldlen = STR_LEN(ret);
3300         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3301
3302         if (RExC_utf8)
3303             SvUTF8_on(sv);
3304         if (sv_utf8_downgrade(sv, TRUE)) {
3305             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3306             STRLEN newlen = SvCUR(sv);
3307
3308             if (SvUTF8(sv))
3309                 RExC_utf8 = 1;
3310             if (!SIZE_ONLY) {
3311                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3312                                       (int)oldlen, STRING(ret),
3313                                       (int)newlen, s));
3314                 Copy(s, STRING(ret), newlen, char);
3315                 STR_LEN(ret) += newlen - oldlen;
3316                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3317             } else
3318                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3319         }
3320     }
3321
3322     return(ret);
3323 }
3324
3325 STATIC char *
3326 S_regwhite(pTHX_ char *p, char *e)
3327 {
3328     while (p < e) {
3329         if (isSPACE(*p))
3330             ++p;
3331         else if (*p == '#') {
3332             do {
3333                 p++;
3334             } while (p < e && *p != '\n');
3335         }
3336         else
3337             break;
3338     }
3339     return p;
3340 }
3341
3342 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3343    Character classes ([:foo:]) can also be negated ([:^foo:]).
3344    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3345    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3346    but trigger failures because they are currently unimplemented. */
3347
3348 #define POSIXCC_DONE(c)   ((c) == ':')
3349 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3350 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3351
3352 STATIC I32
3353 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3354 {
3355     char *posixcc = 0;
3356     I32 namedclass = OOB_NAMEDCLASS;
3357
3358     if (value == '[' && RExC_parse + 1 < RExC_end &&
3359         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3360         POSIXCC(UCHARAT(RExC_parse))) {
3361         char  c = UCHARAT(RExC_parse);
3362         char* s = RExC_parse++;
3363         
3364         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3365             RExC_parse++;
3366         if (RExC_parse == RExC_end)
3367             /* Grandfather lone [:, [=, [. */
3368             RExC_parse = s;
3369         else {
3370             char* t = RExC_parse++; /* skip over the c */
3371
3372             if (UCHARAT(RExC_parse) == ']') {
3373                 RExC_parse++; /* skip over the ending ] */
3374                 posixcc = s + 1;
3375                 if (*s == ':') {
3376                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3377                     I32 skip = 5; /* the most common skip */
3378
3379                     switch (*posixcc) {
3380                     case 'a':
3381                         if (strnEQ(posixcc, "alnum", 5))
3382                             namedclass =
3383                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3384                         else if (strnEQ(posixcc, "alpha", 5))
3385                             namedclass =
3386                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3387                         else if (strnEQ(posixcc, "ascii", 5))
3388                             namedclass =
3389                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3390                         break;
3391                     case 'b':
3392                         if (strnEQ(posixcc, "blank", 5))
3393                             namedclass =
3394                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3395                         break;
3396                     case 'c':
3397                         if (strnEQ(posixcc, "cntrl", 5))
3398                             namedclass =
3399                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3400                         break;
3401                     case 'd':
3402                         if (strnEQ(posixcc, "digit", 5))
3403                             namedclass =
3404                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3405                         break;
3406                     case 'g':
3407                         if (strnEQ(posixcc, "graph", 5))
3408                             namedclass =
3409                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3410                         break;
3411                     case 'l':
3412                         if (strnEQ(posixcc, "lower", 5))
3413                             namedclass =
3414                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3415                         break;
3416                     case 'p':
3417                         if (strnEQ(posixcc, "print", 5))
3418                             namedclass =
3419                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3420                         else if (strnEQ(posixcc, "punct", 5))
3421                             namedclass =
3422                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3423                         break;
3424                     case 's':
3425                         if (strnEQ(posixcc, "space", 5))
3426                             namedclass =
3427                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3428                         break;
3429                     case 'u':
3430                         if (strnEQ(posixcc, "upper", 5))
3431                             namedclass =
3432                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3433                         break;
3434                     case 'w': /* this is not POSIX, this is the Perl \w */
3435                         if (strnEQ(posixcc, "word", 4)) {
3436                             namedclass =
3437                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3438                             skip = 4;
3439                         }
3440                         break;
3441                     case 'x':
3442                         if (strnEQ(posixcc, "xdigit", 6)) {
3443                             namedclass =
3444                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3445                             skip = 6;
3446                         }
3447                         break;
3448                     }
3449                     if (namedclass == OOB_NAMEDCLASS ||
3450                         posixcc[skip] != ':' ||
3451                         posixcc[skip+1] != ']')
3452                     {
3453                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3454                                       t - s - 1, s + 1);
3455                     }
3456                 } else if (!SIZE_ONLY) {
3457                     /* [[=foo=]] and [[.foo.]] are still future. */
3458
3459                     /* adjust RExC_parse so the warning shows after
3460                        the class closes */
3461                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3462                         RExC_parse++;
3463                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3464                 }
3465             } else {
3466                 /* Maternal grandfather:
3467                  * "[:" ending in ":" but not in ":]" */
3468                 RExC_parse = s;
3469             }
3470         }
3471     }
3472
3473     return namedclass;
3474 }
3475
3476 STATIC void
3477 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3478 {
3479     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3480         char *s = RExC_parse;
3481         char  c = *s++;
3482
3483         while(*s && isALNUM(*s))
3484             s++;
3485         if (*s && c == *s && s[1] == ']') {
3486             if (ckWARN(WARN_REGEXP))
3487                 vWARN3(s+2,
3488                         "POSIX syntax [%c %c] belongs inside character classes",
3489                         c, c);
3490
3491             /* [[=foo=]] and [[.foo.]] are still future. */
3492             if (POSIXCC_NOTYET(c)) {
3493                 /* adjust RExC_parse so the error shows after
3494                    the class closes */
3495                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3496                     ;
3497                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3498             }
3499         }
3500     }
3501 }
3502
3503 STATIC regnode *
3504 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3505 {
3506     register UV value;
3507     register UV nextvalue;
3508     register IV prevvalue = OOB_UNICODE;
3509     register IV range = 0;
3510     register regnode *ret;
3511     STRLEN numlen;
3512     IV namedclass;
3513     char *rangebegin = 0;
3514     bool need_class = 0;
3515     SV *listsv = Nullsv;
3516     register char *e;
3517     UV n;
3518     bool optimize_invert   = TRUE;
3519     AV* unicode_alternate  = 0;
3520 #ifdef EBCDIC
3521     UV literal_endpoint = 0;
3522 #endif
3523
3524     ret = reganode(pRExC_state, ANYOF, 0);
3525
3526     if (!SIZE_ONLY)
3527         ANYOF_FLAGS(ret) = 0;
3528
3529     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3530         RExC_naughty++;
3531         RExC_parse++;
3532         if (!SIZE_ONLY)
3533             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3534     }
3535
3536     if (SIZE_ONLY)
3537         RExC_size += ANYOF_SKIP;
3538     else {
3539         RExC_emit += ANYOF_SKIP;
3540         if (FOLD)
3541             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3542         if (LOC)
3543             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3544         ANYOF_BITMAP_ZERO(ret);
3545         listsv = newSVpvn("# comment\n", 10);
3546     }
3547
3548     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3549
3550     if (!SIZE_ONLY && POSIXCC(nextvalue))
3551         checkposixcc(pRExC_state);
3552
3553     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3554     if (UCHARAT(RExC_parse) == ']')
3555         goto charclassloop;
3556
3557     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3558
3559     charclassloop:
3560
3561         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3562
3563         if (!range)
3564             rangebegin = RExC_parse;
3565         if (UTF) {
3566             value = utf8n_to_uvchr((U8*)RExC_parse,
3567                                    RExC_end - RExC_parse,
3568                                    &numlen, 0);
3569             RExC_parse += numlen;
3570         }
3571         else
3572             value = UCHARAT(RExC_parse++);
3573         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3574         if (value == '[' && POSIXCC(nextvalue))
3575             namedclass = regpposixcc(pRExC_state, value);
3576         else if (value == '\\') {
3577             if (UTF) {
3578                 value = utf8n_to_uvchr((U8*)RExC_parse,
3579                                    RExC_end - RExC_parse,
3580                                    &numlen, 0);
3581                 RExC_parse += numlen;
3582             }
3583             else
3584                 value = UCHARAT(RExC_parse++);
3585             /* Some compilers cannot handle switching on 64-bit integer
3586              * values, therefore value cannot be an UV.  Yes, this will
3587              * be a problem later if we want switch on Unicode.
3588              * A similar issue a little bit later when switching on
3589              * namedclass. --jhi */
3590             switch ((I32)value) {
3591             case 'w':   namedclass = ANYOF_ALNUM;       break;
3592             case 'W':   namedclass = ANYOF_NALNUM;      break;
3593             case 's':   namedclass = ANYOF_SPACE;       break;
3594             case 'S':   namedclass = ANYOF_NSPACE;      break;
3595             case 'd':   namedclass = ANYOF_DIGIT;       break;
3596             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3597             case 'p':
3598             case 'P':
3599                 if (RExC_parse >= RExC_end)
3600                     vFAIL2("Empty \\%c{}", (U8)value);
3601                 if (*RExC_parse == '{') {
3602                     U8 c = (U8)value;
3603                     e = strchr(RExC_parse++, '}');
3604                     if (!e)
3605                         vFAIL2("Missing right brace on \\%c{}", c);
3606                     while (isSPACE(UCHARAT(RExC_parse)))
3607                         RExC_parse++;
3608                     if (e == RExC_parse)
3609                         vFAIL2("Empty \\%c{}", c);
3610                     n = e - RExC_parse;
3611                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3612                         n--;
3613                 }
3614                 else {
3615                     e = RExC_parse;
3616                     n = 1;
3617                 }
3618                 if (!SIZE_ONLY) {
3619                     if (UCHARAT(RExC_parse) == '^') {
3620                          RExC_parse++;
3621                          n--;
3622                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3623                          while (isSPACE(UCHARAT(RExC_parse))) {
3624                               RExC_parse++;
3625                               n--;
3626                          }
3627                     }
3628                     if (value == 'p')
3629                          Perl_sv_catpvf(aTHX_ listsv,
3630                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3631                     else
3632                          Perl_sv_catpvf(aTHX_ listsv,
3633                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3634                 }
3635                 RExC_parse = e + 1;
3636                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3637                 continue;
3638             case 'n':   value = '\n';                   break;
3639             case 'r':   value = '\r';                   break;
3640             case 't':   value = '\t';                   break;
3641             case 'f':   value = '\f';                   break;
3642             case 'b':   value = '\b';                   break;
3643             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3644             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3645             case 'x':
3646                 if (*RExC_parse == '{') {
3647                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3648                         | PERL_SCAN_DISALLOW_PREFIX;
3649                     e = strchr(RExC_parse++, '}');
3650                     if (!e)
3651                         vFAIL("Missing right brace on \\x{}");
3652
3653                     numlen = e - RExC_parse;
3654                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3655                     RExC_parse = e + 1;
3656                 }
3657                 else {
3658                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3659                     numlen = 2;
3660                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3661                     RExC_parse += numlen;
3662                 }
3663                 break;
3664             case 'c':
3665                 value = UCHARAT(RExC_parse++);
3666                 value = toCTRL(value);
3667                 break;
3668             case '0': case '1': case '2': case '3': case '4':
3669             case '5': case '6': case '7': case '8': case '9':
3670             {
3671                 I32 flags = 0;
3672                 numlen = 3;
3673                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3674                 RExC_parse += numlen;
3675                 break;
3676             }
3677             default:
3678                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3679                     vWARN2(RExC_parse,
3680                            "Unrecognized escape \\%c in character class passed through",
3681                            (int)value);
3682                 break;
3683             }
3684         } /* end of \blah */
3685 #ifdef EBCDIC
3686         else
3687             literal_endpoint++;
3688 #endif
3689
3690         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3691
3692             if (!SIZE_ONLY && !need_class)
3693                 ANYOF_CLASS_ZERO(ret);
3694
3695             need_class = 1;
3696
3697             /* a bad range like a-\d, a-[:digit:] ? */
3698             if (range) {
3699                 if (!SIZE_ONLY) {
3700                     if (ckWARN(WARN_REGEXP))
3701                         vWARN4(RExC_parse,
3702                                "False [] range \"%*.*s\"",
3703                                RExC_parse - rangebegin,
3704                                RExC_parse - rangebegin,
3705                                rangebegin);
3706                     if (prevvalue < 256) {
3707                         ANYOF_BITMAP_SET(ret, prevvalue);
3708                         ANYOF_BITMAP_SET(ret, '-');
3709                     }
3710                     else {
3711                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3712                         Perl_sv_catpvf(aTHX_ listsv,
3713                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3714                     }
3715                 }
3716
3717                 range = 0; /* this was not a true range */
3718             }
3719
3720             if (!SIZE_ONLY) {
3721                 if (namedclass > OOB_NAMEDCLASS)
3722                     optimize_invert = FALSE;
3723                 /* Possible truncation here but in some 64-bit environments
3724                  * the compiler gets heartburn about switch on 64-bit values.
3725                  * A similar issue a little earlier when switching on value.
3726                  * --jhi */
3727                 switch ((I32)namedclass) {
3728                 case ANYOF_ALNUM:
3729                     if (LOC)
3730                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3731                     else {
3732                         for (value = 0; value < 256; value++)
3733                             if (isALNUM(value))
3734                                 ANYOF_BITMAP_SET(ret, value);
3735                     }
3736                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3737                     break;
3738                 case ANYOF_NALNUM:
3739                     if (LOC)
3740                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3741                     else {
3742                         for (value = 0; value < 256; value++)
3743                             if (!isALNUM(value))
3744                                 ANYOF_BITMAP_SET(ret, value);
3745                     }
3746                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3747                     break;
3748                 case ANYOF_ALNUMC:
3749                     if (LOC)
3750                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3751                     else {
3752                         for (value = 0; value < 256; value++)
3753                             if (isALNUMC(value))
3754                                 ANYOF_BITMAP_SET(ret, value);
3755                     }
3756                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3757                     break;
3758                 case ANYOF_NALNUMC:
3759                     if (LOC)
3760                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3761                     else {
3762                         for (value = 0; value < 256; value++)
3763                             if (!isALNUMC(value))
3764                                 ANYOF_BITMAP_SET(ret, value);
3765                     }
3766                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3767                     break;
3768                 case ANYOF_ALPHA:
3769                     if (LOC)
3770                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3771                     else {
3772                         for (value = 0; value < 256; value++)
3773                             if (isALPHA(value))
3774                                 ANYOF_BITMAP_SET(ret, value);
3775                     }
3776                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3777                     break;
3778                 case ANYOF_NALPHA:
3779                     if (LOC)
3780                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3781                     else {
3782                         for (value = 0; value < 256; value++)
3783                             if (!isALPHA(value))
3784                                 ANYOF_BITMAP_SET(ret, value);
3785                     }
3786                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3787                     break;
3788                 case ANYOF_ASCII:
3789                     if (LOC)
3790                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3791                     else {
3792 #ifndef EBCDIC
3793                         for (value = 0; value < 128; value++)
3794                             ANYOF_BITMAP_SET(ret, value);
3795 #else  /* EBCDIC */
3796                         for (value = 0; value < 256; value++) {
3797                             if (isASCII(value))
3798                                 ANYOF_BITMAP_SET(ret, value);
3799                         }
3800 #endif /* EBCDIC */
3801                     }
3802                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3803                     break;
3804                 case ANYOF_NASCII:
3805                     if (LOC)
3806                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3807                     else {
3808 #ifndef EBCDIC
3809                         for (value = 128; value < 256; value++)
3810                             ANYOF_BITMAP_SET(ret, value);
3811 #else  /* EBCDIC */
3812                         for (value = 0; value < 256; value++) {
3813                             if (!isASCII(value))
3814                                 ANYOF_BITMAP_SET(ret, value);
3815                         }
3816 #endif /* EBCDIC */
3817                     }
3818                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3819                     break;
3820                 case ANYOF_BLANK:
3821                     if (LOC)
3822                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3823                     else {
3824                         for (value = 0; value < 256; value++)
3825                             if (isBLANK(value))
3826                                 ANYOF_BITMAP_SET(ret, value);
3827                     }
3828                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3829                     break;
3830                 case ANYOF_NBLANK:
3831                     if (LOC)
3832                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3833                     else {
3834                         for (value = 0; value < 256; value++)
3835                             if (!isBLANK(value))
3836                                 ANYOF_BITMAP_SET(ret, value);
3837                     }
3838                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3839                     break;
3840                 case ANYOF_CNTRL:
3841                     if (LOC)
3842                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3843                     else {
3844                         for (value = 0; value < 256; value++)
3845                             if (isCNTRL(value))
3846                                 ANYOF_BITMAP_SET(ret, value);
3847                     }
3848                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3849                     break;
3850                 case ANYOF_NCNTRL:
3851                     if (LOC)
3852                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3853                     else {
3854                         for (value = 0; value < 256; value++)
3855                             if (!isCNTRL(value))
3856                                 ANYOF_BITMAP_SET(ret, value);
3857                     }
3858                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3859                     break;
3860                 case ANYOF_DIGIT:
3861                     if (LOC)
3862                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3863                     else {
3864                         /* consecutive digits assumed */
3865                         for (value = '0'; value <= '9'; value++)
3866                             ANYOF_BITMAP_SET(ret, value);
3867                     }
3868                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3869                     break;
3870                 case ANYOF_NDIGIT:
3871                     if (LOC)
3872                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3873                     else {
3874                         /* consecutive digits assumed */
3875                         for (value = 0; value < '0'; value++)
3876                             ANYOF_BITMAP_SET(ret, value);
3877                         for (value = '9' + 1; value < 256; value++)
3878                             ANYOF_BITMAP_SET(ret, value);
3879                     }
3880                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3881                     break;
3882                 case ANYOF_GRAPH:
3883                     if (LOC)
3884                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3885                     else {
3886                         for (value = 0; value < 256; value++)
3887                             if (isGRAPH(value))
3888                                 ANYOF_BITMAP_SET(ret, value);
3889                     }
3890                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3891                     break;
3892                 case ANYOF_NGRAPH:
3893                     if (LOC)
3894                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3895                     else {
3896                         for (value = 0; value < 256; value++)
3897                             if (!isGRAPH(value))
3898                                 ANYOF_BITMAP_SET(ret, value);
3899                     }
3900                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3901                     break;
3902                 case ANYOF_LOWER:
3903                     if (LOC)
3904                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3905                     else {
3906                         for (value = 0; value < 256; value++)
3907                             if (isLOWER(value))
3908                                 ANYOF_BITMAP_SET(ret, value);
3909                     }
3910                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3911                     break;
3912                 case ANYOF_NLOWER:
3913                     if (LOC)
3914                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3915                     else {
3916                         for (value = 0; value < 256; value++)
3917                             if (!isLOWER(value))
3918                                 ANYOF_BITMAP_SET(ret, value);
3919                     }
3920                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3921                     break;
3922                 case ANYOF_PRINT:
3923                     if (LOC)
3924                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3925                     else {
3926                         for (value = 0; value < 256; value++)
3927                             if (isPRINT(value))
3928                                 ANYOF_BITMAP_SET(ret, value);
3929                     }
3930                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3931                     break;
3932                 case ANYOF_NPRINT:
3933                     if (LOC)
3934                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3935                     else {
3936                         for (value = 0; value < 256; value++)
3937                             if (!isPRINT(value))
3938                                 ANYOF_BITMAP_SET(ret, value);
3939                     }
3940                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3941                     break;
3942                 case ANYOF_PSXSPC:
3943                     if (LOC)
3944                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3945                     else {
3946                         for (value = 0; value < 256; value++)
3947                             if (isPSXSPC(value))
3948                                 ANYOF_BITMAP_SET(ret, value);
3949                     }
3950                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3951                     break;
3952                 case ANYOF_NPSXSPC:
3953                     if (LOC)
3954                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3955                     else {
3956                         for (value = 0; value < 256; value++)
3957                             if (!isPSXSPC(value))
3958                                 ANYOF_BITMAP_SET(ret, value);
3959                     }
3960                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3961                     break;
3962                 case ANYOF_PUNCT:
3963                     if (LOC)
3964                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3965                     else {
3966                         for (value = 0; value < 256; value++)
3967                             if (isPUNCT(value))
3968                                 ANYOF_BITMAP_SET(ret, value);
3969                     }
3970                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3971                     break;
3972                 case ANYOF_NPUNCT:
3973                     if (LOC)
3974                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3975                     else {
3976                         for (value = 0; value < 256; value++)
3977                             if (!isPUNCT(value))
3978                                 ANYOF_BITMAP_SET(ret, value);
3979                     }
3980                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3981                     break;
3982                 case ANYOF_SPACE:
3983                     if (LOC)
3984                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3985                     else {
3986                         for (value = 0; value < 256; value++)
3987                             if (isSPACE(value))
3988                                 ANYOF_BITMAP_SET(ret, value);
3989                     }
3990                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3991                     break;
3992                 case ANYOF_NSPACE:
3993                     if (LOC)
3994                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3995                     else {
3996                         for (value = 0; value < 256; value++)
3997                             if (!isSPACE(value))
3998                                 ANYOF_BITMAP_SET(ret, value);
3999                     }
4000                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4001                     break;
4002                 case ANYOF_UPPER:
4003                     if (LOC)
4004                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4005                     else {
4006                         for (value = 0; value < 256; value++)
4007                             if (isUPPER(value))
4008                                 ANYOF_BITMAP_SET(ret, value);
4009                     }
4010                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4011                     break;
4012                 case ANYOF_NUPPER:
4013                     if (LOC)
4014                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4015                     else {
4016                         for (value = 0; value < 256; value++)
4017                             if (!isUPPER(value))
4018                                 ANYOF_BITMAP_SET(ret, value);
4019                     }
4020                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4021                     break;
4022                 case ANYOF_XDIGIT:
4023                     if (LOC)
4024                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4025                     else {
4026                         for (value = 0; value < 256; value++)
4027                             if (isXDIGIT(value))
4028                                 ANYOF_BITMAP_SET(ret, value);
4029                     }
4030                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4031                     break;
4032                 case ANYOF_NXDIGIT:
4033                     if (LOC)
4034                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4035                     else {
4036                         for (value = 0; value < 256; value++)
4037                             if (!isXDIGIT(value))
4038                                 ANYOF_BITMAP_SET(ret, value);
4039                     }
4040                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4041                     break;
4042                 default:
4043                     vFAIL("Invalid [::] class");
4044                     break;
4045                 }
4046                 if (LOC)
4047                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4048                 continue;
4049             }
4050         } /* end of namedclass \blah */
4051
4052         if (range) {
4053             if (prevvalue > (IV)value) /* b-a */ {
4054                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4055                               RExC_parse - rangebegin,
4056                               RExC_parse - rangebegin,
4057                               rangebegin);
4058                 range = 0; /* not a valid range */
4059             }
4060         }
4061         else {
4062             prevvalue = value; /* save the beginning of the range */
4063             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4064                 RExC_parse[1] != ']') {
4065                 RExC_parse++;
4066
4067                 /* a bad range like \w-, [:word:]- ? */
4068                 if (namedclass > OOB_NAMEDCLASS) {
4069                     if (ckWARN(WARN_REGEXP))
4070                         vWARN4(RExC_parse,
4071                                "False [] range \"%*.*s\"",
4072                                RExC_parse - rangebegin,
4073                                RExC_parse - rangebegin,
4074                                rangebegin);
4075                     if (!SIZE_ONLY)
4076                         ANYOF_BITMAP_SET(ret, '-');
4077                 } else
4078                     range = 1;  /* yeah, it's a range! */
4079                 continue;       /* but do it the next time */
4080             }
4081         }
4082
4083         /* now is the next time */
4084         if (!SIZE_ONLY) {
4085             IV i;
4086
4087             if (prevvalue < 256) {
4088                 IV ceilvalue = value < 256 ? value : 255;
4089
4090 #ifdef EBCDIC
4091                 /* In EBCDIC [\x89-\x91] should include
4092                  * the \x8e but [i-j] should not. */
4093                 if (literal_endpoint == 2 &&
4094                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4095                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4096                 {
4097                     if (isLOWER(prevvalue)) {
4098                         for (i = prevvalue; i <= ceilvalue; i++)
4099                             if (isLOWER(i))
4100                                 ANYOF_BITMAP_SET(ret, i);
4101                     } else {
4102                         for (i = prevvalue; i <= ceilvalue; i++)
4103                             if (isUPPER(i))
4104                                 ANYOF_BITMAP_SET(ret, i);
4105                     }
4106                 }
4107                 else
4108 #endif
4109                       for (i = prevvalue; i <= ceilvalue; i++)
4110                           ANYOF_BITMAP_SET(ret, i);
4111           }
4112           if (value > 255 || UTF) {
4113                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4114                 UV natvalue      = NATIVE_TO_UNI(value);
4115
4116                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4117                 if (prevnatvalue < natvalue) { /* what about > ? */
4118                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4119                                    prevnatvalue, natvalue);
4120                 }
4121                 else if (prevnatvalue == natvalue) {
4122                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4123                     if (FOLD) {
4124                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4125                          STRLEN foldlen;
4126                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4127
4128                          /* If folding and foldable and a single
4129                           * character, insert also the folded version
4130                           * to the charclass. */
4131                          if (f != value) {
4132                               if (foldlen == (STRLEN)UNISKIP(f))
4133                                   Perl_sv_catpvf(aTHX_ listsv,
4134                                                  "%04"UVxf"\n", f);
4135                               else {
4136                                   /* Any multicharacter foldings
4137                                    * require the following transform:
4138                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4139                                    * where E folds into "pq" and F folds
4140                                    * into "rst", all other characters
4141                                    * fold to single characters.  We save
4142                                    * away these multicharacter foldings,
4143                                    * to be later saved as part of the
4144                                    * additional "s" data. */
4145                                   SV *sv;
4146
4147                                   if (!unicode_alternate)
4148                                       unicode_alternate = newAV();
4149                                   sv = newSVpvn((char*)foldbuf, foldlen);
4150                                   SvUTF8_on(sv);
4151                                   av_push(unicode_alternate, sv);
4152                               }
4153                          }
4154
4155                          /* If folding and the value is one of the Greek
4156                           * sigmas insert a few more sigmas to make the
4157                           * folding rules of the sigmas to work right.
4158                           * Note that not all the possible combinations
4159                           * are handled here: some of them are handled
4160                           * by the standard folding rules, and some of
4161                           * them (literal or EXACTF cases) are handled
4162                           * during runtime in regexec.c:S_find_byclass(). */
4163                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4164                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4165                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4166                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4167                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4168                          }
4169                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4170                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4171                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4172                     }
4173                 }
4174             }
4175 #ifdef EBCDIC
4176             literal_endpoint = 0;
4177 #endif
4178         }
4179
4180         range = 0; /* this range (if it was one) is done now */
4181     }
4182
4183     if (need_class) {
4184         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4185         if (SIZE_ONLY)
4186             RExC_size += ANYOF_CLASS_ADD_SKIP;
4187         else
4188             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4189     }
4190
4191     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4192     if (!SIZE_ONLY &&
4193          /* If the only flag is folding (plus possibly inversion). */
4194         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4195        ) {
4196         for (value = 0; value < 256; ++value) {
4197             if (ANYOF_BITMAP_TEST(ret, value)) {
4198                 UV fold = PL_fold[value];
4199
4200                 if (fold != value)
4201                     ANYOF_BITMAP_SET(ret, fold);
4202             }
4203         }
4204         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4205     }
4206
4207     /* optimize inverted simple patterns (e.g. [^a-z]) */
4208     if (!SIZE_ONLY && optimize_invert &&
4209         /* If the only flag is inversion. */
4210         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4211         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4212             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4213         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4214     }
4215
4216     if (!SIZE_ONLY) {
4217         AV *av = newAV();
4218         SV *rv;
4219
4220         /* The 0th element stores the character class description
4221          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4222          * to initialize the appropriate swash (which gets stored in
4223          * the 1st element), and also useful for dumping the regnode.
4224          * The 2nd element stores the multicharacter foldings,
4225          * used later (regexec.c:S_reginclass()). */
4226         av_store(av, 0, listsv);
4227         av_store(av, 1, NULL);
4228         av_store(av, 2, (SV*)unicode_alternate);
4229         rv = newRV_noinc((SV*)av);
4230         n = add_data(pRExC_state, 1, "s");
4231         RExC_rx->data->data[n] = (void*)rv;
4232         ARG_SET(ret, n);
4233     }
4234
4235     return ret;
4236 }
4237
4238 STATIC char*
4239 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4240 {
4241     char* retval = RExC_parse++;
4242
4243     for (;;) {
4244         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4245                 RExC_parse[2] == '#') {
4246             while (*RExC_parse && *RExC_parse != ')')
4247                 RExC_parse++;
4248             RExC_parse++;
4249             continue;
4250         }
4251         if (RExC_flags & PMf_EXTENDED) {
4252             if (isSPACE(*RExC_parse)) {
4253                 RExC_parse++;
4254                 continue;
4255             }
4256             else if (*RExC_parse == '#') {
4257                 while (*RExC_parse && *RExC_parse != '\n')
4258                     RExC_parse++;
4259                 RExC_parse++;
4260                 continue;
4261             }
4262         }
4263         return retval;
4264     }
4265 }
4266
4267 /*
4268 - reg_node - emit a node
4269 */
4270 STATIC regnode *                        /* Location. */
4271 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4272 {
4273     register regnode *ret;
4274     register regnode *ptr;
4275
4276     ret = RExC_emit;
4277     if (SIZE_ONLY) {
4278         SIZE_ALIGN(RExC_size);
4279         RExC_size += 1;
4280         return(ret);
4281     }
4282
4283     NODE_ALIGN_FILL(ret);
4284     ptr = ret;
4285     FILL_ADVANCE_NODE(ptr, op);
4286     if (RExC_offsets) {         /* MJD */
4287         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4288               "reg_node", __LINE__, 
4289               reg_name[op],
4290               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4291               ? "Overwriting end of array!\n" : "OK",
4292               RExC_emit - RExC_emit_start,
4293               RExC_parse - RExC_start,
4294               RExC_offsets[0])); 
4295         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4296     }
4297             
4298     RExC_emit = ptr;
4299
4300     return(ret);
4301 }
4302
4303 /*
4304 - reganode - emit a node with an argument
4305 */
4306 STATIC regnode *                        /* Location. */
4307 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4308 {
4309     register regnode *ret;
4310     register regnode *ptr;
4311
4312     ret = RExC_emit;
4313     if (SIZE_ONLY) {
4314         SIZE_ALIGN(RExC_size);
4315         RExC_size += 2;
4316         return(ret);
4317     }
4318
4319     NODE_ALIGN_FILL(ret);
4320     ptr = ret;
4321     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4322     if (RExC_offsets) {         /* MJD */
4323         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4324               "reganode",
4325               __LINE__,
4326               reg_name[op],
4327               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4328               "Overwriting end of array!\n" : "OK",
4329               RExC_emit - RExC_emit_start,
4330               RExC_parse - RExC_start,
4331               RExC_offsets[0])); 
4332         Set_Cur_Node_Offset;
4333     }
4334             
4335     RExC_emit = ptr;
4336
4337     return(ret);
4338 }
4339
4340 /*
4341 - reguni - emit (if appropriate) a Unicode character
4342 */
4343 STATIC void
4344 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4345 {
4346     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4347 }
4348
4349 /*
4350 - reginsert - insert an operator in front of already-emitted operand
4351 *
4352 * Means relocating the operand.
4353 */
4354 STATIC void
4355 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4356 {
4357     register regnode *src;
4358     register regnode *dst;
4359     register regnode *place;
4360     register int offset = regarglen[(U8)op];
4361
4362 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4363
4364     if (SIZE_ONLY) {
4365         RExC_size += NODE_STEP_REGNODE + offset;
4366         return;
4367     }
4368
4369     src = RExC_emit;
4370     RExC_emit += NODE_STEP_REGNODE + offset;
4371     dst = RExC_emit;
4372     while (src > opnd) {
4373         StructCopy(--src, --dst, regnode);
4374         if (RExC_offsets) {     /* MJD 20010112 */
4375             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4376                   "reg_insert",
4377                   __LINE__,
4378                   reg_name[op],
4379                   dst - RExC_emit_start > RExC_offsets[0] 
4380                   ? "Overwriting end of array!\n" : "OK",
4381                   src - RExC_emit_start,
4382                   dst - RExC_emit_start,
4383                   RExC_offsets[0])); 
4384             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4385             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4386         }
4387     }
4388     
4389
4390     place = opnd;               /* Op node, where operand used to be. */
4391     if (RExC_offsets) {         /* MJD */
4392         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4393               "reginsert",
4394               __LINE__,
4395               reg_name[op],
4396               place - RExC_emit_start > RExC_offsets[0] 
4397               ? "Overwriting end of array!\n" : "OK",
4398               place - RExC_emit_start,
4399               RExC_parse - RExC_start,
4400               RExC_offsets[0])); 
4401         Set_Node_Offset(place, RExC_parse);
4402     }
4403     src = NEXTOPER(place);
4404     FILL_ADVANCE_NODE(place, op);
4405     Zero(src, offset, regnode);
4406 }
4407
4408 /*
4409 - regtail - set the next-pointer at the end of a node chain of p to val.
4410 */
4411 STATIC void
4412 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4413 {
4414     register regnode *scan;
4415     register regnode *temp;
4416
4417     if (SIZE_ONLY)
4418         return;
4419
4420     /* Find last node. */
4421     scan = p;
4422     for (;;) {
4423         temp = regnext(scan);
4424         if (temp == NULL)
4425             break;
4426         scan = temp;
4427     }
4428
4429     if (reg_off_by_arg[OP(scan)]) {
4430         ARG_SET(scan, val - scan);
4431     }
4432     else {
4433         NEXT_OFF(scan) = val - scan;
4434     }
4435 }
4436
4437 /*
4438 - regoptail - regtail on operand of first argument; nop if operandless
4439 */
4440 STATIC void
4441 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4442 {
4443     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4444     if (p == NULL || SIZE_ONLY)
4445         return;
4446     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4447         regtail(pRExC_state, NEXTOPER(p), val);
4448     }
4449     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4450         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4451     }
4452     else
4453         return;
4454 }
4455
4456 /*
4457  - regcurly - a little FSA that accepts {\d+,?\d*}
4458  */
4459 STATIC I32
4460 S_regcurly(pTHX_ register char *s)
4461 {
4462     if (*s++ != '{')
4463         return FALSE;
4464     if (!isDIGIT(*s))
4465         return FALSE;
4466     while (isDIGIT(*s))
4467         s++;
4468     if (*s == ',')
4469         s++;
4470     while (isDIGIT(*s))
4471         s++;
4472     if (*s != '}')
4473         return FALSE;
4474     return TRUE;
4475 }
4476
4477
4478 #ifdef DEBUGGING
4479
4480 STATIC regnode *
4481 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4482 {
4483     register U8 op = EXACT;     /* Arbitrary non-END op. */
4484     register regnode *next;
4485
4486     while (op != END && (!last || node < last)) {
4487         /* While that wasn't END last time... */
4488
4489         NODE_ALIGN(node);
4490         op = OP(node);
4491         if (op == CLOSE)
4492             l--;        
4493         next = regnext(node);
4494         /* Where, what. */
4495         if (OP(node) == OPTIMIZED)
4496             goto after_print;
4497         regprop(sv, node);
4498         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4499                       (int)(2*l + 1), "", SvPVX(sv));
4500         if (next == NULL)               /* Next ptr. */
4501             PerlIO_printf(Perl_debug_log, "(0)");
4502         else
4503             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4504         (void)PerlIO_putc(Perl_debug_log, '\n');
4505       after_print:
4506         if (PL_regkind[(U8)op] == BRANCHJ) {
4507             register regnode *nnode = (OP(next) == LONGJMP
4508                                        ? regnext(next)
4509                                        : next);
4510             if (last && nnode > last)
4511                 nnode = last;
4512             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4513         }
4514         else if (PL_regkind[(U8)op] == BRANCH) {
4515             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4516         }
4517         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4518             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4519                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4520         }
4521         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4522             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4523                              next, sv, l + 1);
4524         }
4525         else if ( op == PLUS || op == STAR) {
4526             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4527         }
4528         else if (op == ANYOF) {
4529             /* arglen 1 + class block */
4530             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4531                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4532             node = NEXTOPER(node);
4533         }
4534         else if (PL_regkind[(U8)op] == EXACT) {
4535             /* Literal string, where present. */
4536             node += NODE_SZ_STR(node) - 1;
4537             node = NEXTOPER(node);
4538         }
4539         else {
4540             node = NEXTOPER(node);
4541             node += regarglen[(U8)op];
4542         }
4543         if (op == CURLYX || op == OPEN)
4544             l++;
4545         else if (op == WHILEM)
4546             l--;
4547     }
4548     return node;
4549 }
4550
4551 #endif  /* DEBUGGING */
4552
4553 /*
4554  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4555  */
4556 void
4557 Perl_regdump(pTHX_ regexp *r)
4558 {
4559 #ifdef DEBUGGING
4560     SV *sv = sv_newmortal();
4561
4562     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4563
4564     /* Header fields of interest. */
4565     if (r->anchored_substr)
4566         PerlIO_printf(Perl_debug_log,
4567                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4568                       PL_colors[0],
4569                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4570                       SvPVX(r->anchored_substr),
4571                       PL_colors[1],
4572                       SvTAIL(r->anchored_substr) ? "$" : "",
4573                       (IV)r->anchored_offset);
4574     else if (r->anchored_utf8)
4575         PerlIO_printf(Perl_debug_log,
4576                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4577                       PL_colors[0],
4578                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4579                       SvPVX(r->anchored_utf8),
4580                       PL_colors[1],
4581                       SvTAIL(r->anchored_utf8) ? "$" : "",
4582                       (IV)r->anchored_offset);
4583     if (r->float_substr)
4584         PerlIO_printf(Perl_debug_log,
4585                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4586                       PL_colors[0],
4587                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4588                       SvPVX(r->float_substr),
4589                       PL_colors[1],
4590                       SvTAIL(r->float_substr) ? "$" : "",
4591                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4592     else if (r->float_utf8)
4593         PerlIO_printf(Perl_debug_log,
4594                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4595                       PL_colors[0],
4596                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4597                       SvPVX(r->float_utf8),
4598                       PL_colors[1],
4599                       SvTAIL(r->float_utf8) ? "$" : "",
4600                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4601     if (r->check_substr || r->check_utf8)
4602         PerlIO_printf(Perl_debug_log,
4603                       r->check_substr == r->float_substr
4604                       && r->check_utf8 == r->float_utf8
4605                       ? "(checking floating" : "(checking anchored");
4606     if (r->reganch & ROPT_NOSCAN)
4607         PerlIO_printf(Perl_debug_log, " noscan");
4608     if (r->reganch & ROPT_CHECK_ALL)
4609         PerlIO_printf(Perl_debug_log, " isall");
4610     if (r->check_substr || r->check_utf8)
4611         PerlIO_printf(Perl_debug_log, ") ");
4612
4613     if (r->regstclass) {
4614         regprop(sv, r->regstclass);
4615         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4616     }
4617     if (r->reganch & ROPT_ANCH) {
4618         PerlIO_printf(Perl_debug_log, "anchored");
4619         if (r->reganch & ROPT_ANCH_BOL)
4620             PerlIO_printf(Perl_debug_log, "(BOL)");
4621         if (r->reganch & ROPT_ANCH_MBOL)
4622             PerlIO_printf(Perl_debug_log, "(MBOL)");
4623         if (r->reganch & ROPT_ANCH_SBOL)
4624             PerlIO_printf(Perl_debug_log, "(SBOL)");
4625         if (r->reganch & ROPT_ANCH_GPOS)
4626             PerlIO_printf(Perl_debug_log, "(GPOS)");
4627         PerlIO_putc(Perl_debug_log, ' ');
4628     }
4629     if (r->reganch & ROPT_GPOS_SEEN)
4630         PerlIO_printf(Perl_debug_log, "GPOS ");
4631     if (r->reganch & ROPT_SKIP)
4632         PerlIO_printf(Perl_debug_log, "plus ");
4633     if (r->reganch & ROPT_IMPLICIT)
4634         PerlIO_printf(Perl_debug_log, "implicit ");
4635     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4636     if (r->reganch & ROPT_EVAL_SEEN)
4637         PerlIO_printf(Perl_debug_log, "with eval ");
4638     PerlIO_printf(Perl_debug_log, "\n");
4639     if (r->offsets) {
4640       U32 i;
4641       U32 len = r->offsets[0];
4642       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4643       for (i = 1; i <= len; i++)
4644         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4645                       (UV)r->offsets[i*2-1], 
4646                       (UV)r->offsets[i*2]);
4647       PerlIO_printf(Perl_debug_log, "\n");
4648     }
4649 #endif  /* DEBUGGING */
4650 }
4651
4652 #ifdef DEBUGGING
4653
4654 STATIC void
4655 S_put_byte(pTHX_ SV *sv, int c)
4656 {
4657     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4658         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4659     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4660         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4661     else
4662         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4663 }
4664
4665 #endif  /* DEBUGGING */
4666
4667 /*
4668 - regprop - printable representation of opcode
4669 */
4670 void
4671 Perl_regprop(pTHX_ SV *sv, regnode *o)
4672 {
4673 #ifdef DEBUGGING
4674     register int k;
4675
4676     sv_setpvn(sv, "", 0);
4677     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4678         /* It would be nice to FAIL() here, but this may be called from
4679            regexec.c, and it would be hard to supply pRExC_state. */
4680         Perl_croak(aTHX_ "Corrupted regexp opcode");
4681     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4682
4683     k = PL_regkind[(U8)OP(o)];
4684
4685     if (k == EXACT) {
4686         SV *dsv = sv_2mortal(newSVpvn("", 0));
4687         /* Using is_utf8_string() is a crude hack but it may
4688          * be the best for now since we have no flag "this EXACTish
4689          * node was UTF-8" --jhi */
4690         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4691         char *s    = do_utf8 ?
4692           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4693                          UNI_DISPLAY_REGEX) :
4694           STRING(o);
4695         int len = do_utf8 ?
4696           strlen(s) :
4697           STR_LEN(o);
4698         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4699                        PL_colors[0],
4700                        len, s,
4701                        PL_colors[1]);
4702     }
4703     else if (k == CURLY) {
4704         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4705             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4706         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4707     }
4708     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4709         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4710     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4711         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4712     else if (k == LOGICAL)
4713         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4714     else if (k == ANYOF) {
4715         int i, rangestart = -1;
4716         U8 flags = ANYOF_FLAGS(o);
4717         const char * const anyofs[] = { /* Should be syncronized with
4718                                          * ANYOF_ #xdefines in regcomp.h */
4719             "\\w",
4720             "\\W",
4721             "\\s",
4722             "\\S",
4723             "\\d",
4724             "\\D",
4725             "[:alnum:]",
4726             "[:^alnum:]",
4727             "[:alpha:]",
4728             "[:^alpha:]",
4729             "[:ascii:]",
4730             "[:^ascii:]",
4731             "[:ctrl:]",
4732             "[:^ctrl:]",
4733             "[:graph:]",
4734             "[:^graph:]",
4735             "[:lower:]",
4736             "[:^lower:]",
4737             "[:print:]",
4738             "[:^print:]",
4739             "[:punct:]",
4740             "[:^punct:]",
4741             "[:upper:]",
4742             "[:^upper:]",
4743             "[:xdigit:]",
4744             "[:^xdigit:]",
4745             "[:space:]",
4746             "[:^space:]",
4747             "[:blank:]",
4748             "[:^blank:]"
4749         };
4750
4751         if (flags & ANYOF_LOCALE)
4752             sv_catpv(sv, "{loc}");
4753         if (flags & ANYOF_FOLD)
4754             sv_catpv(sv, "{i}");
4755         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4756         if (flags & ANYOF_INVERT)
4757             sv_catpv(sv, "^");
4758         for (i = 0; i <= 256; i++) {
4759             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4760                 if (rangestart == -1)
4761                     rangestart = i;
4762             } else if (rangestart != -1) {
4763                 if (i <= rangestart + 3)
4764                     for (; rangestart < i; rangestart++)
4765                         put_byte(sv, rangestart);
4766                 else {
4767                     put_byte(sv, rangestart);
4768                     sv_catpv(sv, "-");
4769                     put_byte(sv, i - 1);
4770                 }
4771                 rangestart = -1;
4772             }
4773         }
4774
4775         if (o->flags & ANYOF_CLASS)
4776             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4777                 if (ANYOF_CLASS_TEST(o,i))
4778                     sv_catpv(sv, anyofs[i]);
4779
4780         if (flags & ANYOF_UNICODE)
4781             sv_catpv(sv, "{unicode}");
4782         else if (flags & ANYOF_UNICODE_ALL)
4783             sv_catpv(sv, "{unicode_all}");
4784
4785         {
4786             SV *lv;
4787             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4788         
4789             if (lv) {
4790                 if (sw) {
4791                     U8 s[UTF8_MAXLEN+1];
4792                 
4793                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4794                         U8 *e = uvchr_to_utf8(s, i);
4795                         
4796                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4797                             if (rangestart == -1)
4798                                 rangestart = i;
4799                         } else if (rangestart != -1) {
4800                             U8 *p;
4801                         
4802                             if (i <= rangestart + 3)
4803                                 for (; rangestart < i; rangestart++) {
4804                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4805                                         put_byte(sv, *p);
4806                                 }
4807                             else {
4808                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4809                                     put_byte(sv, *p);
4810                                 sv_catpv(sv, "-");
4811                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4812                                         put_byte(sv, *p);
4813                                 }
4814                                 rangestart = -1;
4815                             }
4816                         }
4817                         
4818                     sv_catpv(sv, "..."); /* et cetera */
4819                 }
4820
4821                 {
4822                     char *s = savepv(SvPVX(lv));
4823                     char *origs = s;
4824                 
4825                     while(*s && *s != '\n') s++;
4826                 
4827                     if (*s == '\n') {
4828                         char *t = ++s;
4829                         
4830                         while (*s) {
4831                             if (*s == '\n')
4832                                 *s = ' ';
4833                             s++;
4834                         }
4835                         if (s[-1] == ' ')
4836                             s[-1] = 0;
4837                         
4838                         sv_catpv(sv, t);
4839                     }
4840                 
4841                     Safefree(origs);
4842                 }
4843             }
4844         }
4845
4846         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4847     }
4848     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4849         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4850 #endif  /* DEBUGGING */
4851 }
4852
4853 SV *
4854 Perl_re_intuit_string(pTHX_ regexp *prog)
4855 {                               /* Assume that RE_INTUIT is set */
4856     DEBUG_r(
4857         {   STRLEN n_a;
4858             char *s = SvPV(prog->check_substr
4859                       ? prog->check_substr : prog->check_utf8, n_a);
4860
4861             if (!PL_colorset) reginitcolors();
4862             PerlIO_printf(Perl_debug_log,
4863                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4864                       PL_colors[4],
4865                       prog->check_substr ? "" : "utf8 ",
4866                       PL_colors[5],PL_colors[0],
4867                       s,
4868                       PL_colors[1],
4869                       (strlen(s) > 60 ? "..." : ""));
4870         } );
4871
4872     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4873 }
4874
4875 void
4876 Perl_pregfree(pTHX_ struct regexp *r)
4877 {
4878 #ifdef DEBUGGING
4879     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4880 #endif
4881
4882     if (!r || (--r->refcnt > 0))
4883         return;
4884     DEBUG_r({
4885          int len;
4886          char *s;
4887
4888          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4889                 r->prelen, 60, UNI_DISPLAY_REGEX)
4890             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4891          len = SvCUR(dsv);
4892          if (!PL_colorset)
4893               reginitcolors();
4894          PerlIO_printf(Perl_debug_log,
4895                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4896                        PL_colors[4],PL_colors[5],PL_colors[0],
4897                        len, len, s,
4898                        PL_colors[1],
4899                        len > 60 ? "..." : "");
4900     });
4901
4902     if (r->precomp)
4903         Safefree(r->precomp);
4904     if (r->offsets)             /* 20010421 MJD */
4905         Safefree(r->offsets);
4906     RX_MATCH_COPY_FREE(r);
4907 #ifdef PERL_COPY_ON_WRITE
4908     if (r->saved_copy)
4909         SvREFCNT_dec(r->saved_copy);
4910 #endif
4911     if (r->substrs) {
4912         if (r->anchored_substr)
4913             SvREFCNT_dec(r->anchored_substr);
4914         if (r->anchored_utf8)
4915             SvREFCNT_dec(r->anchored_utf8);
4916         if (r->float_substr)
4917             SvREFCNT_dec(r->float_substr);
4918         if (r->float_utf8)
4919             SvREFCNT_dec(r->float_utf8);
4920         Safefree(r->substrs);
4921     }
4922     if (r->data) {
4923         int n = r->data->count;
4924         PAD* new_comppad = NULL;
4925         PAD* old_comppad;
4926
4927         while (--n >= 0) {
4928           /* If you add a ->what type here, update the comment in regcomp.h */
4929             switch (r->data->what[n]) {
4930             case 's':
4931                 SvREFCNT_dec((SV*)r->data->data[n]);
4932                 break;
4933             case 'f':
4934                 Safefree(r->data->data[n]);
4935                 break;
4936             case 'p':
4937                 new_comppad = (AV*)r->data->data[n];
4938                 break;
4939             case 'o':
4940                 if (new_comppad == NULL)
4941                     Perl_croak(aTHX_ "panic: pregfree comppad");
4942                 PAD_SAVE_LOCAL(old_comppad,
4943                     /* Watch out for global destruction's random ordering. */
4944                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4945                                 new_comppad : Null(PAD *)
4946                 );
4947                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4948                     op_free((OP_4tree*)r->data->data[n]);
4949                 }
4950
4951                 PAD_RESTORE_LOCAL(old_comppad);
4952                 SvREFCNT_dec((SV*)new_comppad);
4953                 new_comppad = NULL;
4954                 break;
4955             case 'n':
4956                 break;
4957             default:
4958                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4959             }
4960         }
4961         Safefree(r->data->what);
4962         Safefree(r->data);
4963     }
4964     Safefree(r->startp);
4965     Safefree(r->endp);
4966     Safefree(r);
4967 }
4968
4969 /*
4970  - regnext - dig the "next" pointer out of a node
4971  *
4972  * [Note, when REGALIGN is defined there are two places in regmatch()
4973  * that bypass this code for speed.]
4974  */
4975 regnode *
4976 Perl_regnext(pTHX_ register regnode *p)
4977 {
4978     register I32 offset;
4979
4980     if (p == &PL_regdummy)
4981         return(NULL);
4982
4983     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4984     if (offset == 0)
4985         return(NULL);
4986
4987     return(p+offset);
4988 }
4989
4990 STATIC void     
4991 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4992 {
4993     va_list args;
4994     STRLEN l1 = strlen(pat1);
4995     STRLEN l2 = strlen(pat2);
4996     char buf[512];
4997     SV *msv;
4998     char *message;
4999
5000     if (l1 > 510)
5001         l1 = 510;
5002     if (l1 + l2 > 510)
5003         l2 = 510 - l1;
5004     Copy(pat1, buf, l1 , char);
5005     Copy(pat2, buf + l1, l2 , char);
5006     buf[l1 + l2] = '\n';
5007     buf[l1 + l2 + 1] = '\0';
5008 #ifdef I_STDARG
5009     /* ANSI variant takes additional second argument */
5010     va_start(args, pat2);
5011 #else
5012     va_start(args);
5013 #endif
5014     msv = vmess(buf, &args);
5015     va_end(args);
5016     message = SvPV(msv,l1);
5017     if (l1 > 512)
5018         l1 = 512;
5019     Copy(message, buf, l1 , char);
5020     buf[l1] = '\0';                     /* Overwrite \n */
5021     Perl_croak(aTHX_ "%s", buf);
5022 }
5023
5024 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5025
5026 void
5027 Perl_save_re_context(pTHX)
5028 {
5029     SAVEI32(PL_reg_flags);              /* from regexec.c */
5030     SAVEPPTR(PL_bostr);
5031     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5032     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5033     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5034     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5035     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5036     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5037     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5038     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5039     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5040     PL_reg_start_tmp = 0;
5041     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5042     PL_reg_start_tmpl = 0;
5043     SAVEVPTR(PL_regdata);
5044     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5045     SAVEI32(PL_regnarrate);             /* from regexec.c */
5046     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5047     SAVEINT(PL_regindent);              /* from regexec.c */
5048     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5049     SAVEVPTR(PL_curcop);
5050     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5051     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5052     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5053     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5054     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5055     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5056     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5057     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5058     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5059     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5060     PL_reg_oldsaved = Nullch;
5061     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5062     PL_reg_oldsavedlen = 0;
5063 #ifdef PERL_COPY_ON_WRITE
5064     SAVESPTR(PL_nrs);
5065     PL_nrs = Nullsv;
5066 #endif
5067     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5068     PL_reg_maxiter = 0;
5069     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5070     PL_reg_leftiter = 0;
5071     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5072     PL_reg_poscache = Nullch;
5073     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5074     PL_reg_poscache_size = 0;
5075     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5076     SAVEI32(PL_regnpar);                /* () count. */
5077     SAVEI32(PL_regsize);                /* from regexec.c */
5078
5079     {
5080         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5081         U32 i;
5082         GV *mgv;
5083         REGEXP *rx;
5084         char digits[16];
5085
5086         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5087             for (i = 1; i <= rx->nparens; i++) {
5088                 sprintf(digits, "%lu", (long)i);
5089                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5090                     save_scalar(mgv);
5091             }
5092         }
5093     }
5094
5095 #ifdef DEBUGGING
5096     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5097 #endif
5098 }
5099
5100 static void
5101 clear_re(pTHX_ void *r)
5102 {
5103     ReREFCNT_dec((regexp *)r);
5104 }
5105