File::Temp documentation fix
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U32         flags;                  /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags      (pRExC_state->flags)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg) STMT_START {                                          \
260     char *ellipses = "";                                                \
261     IV len = RExC_end - RExC_precomp;                                   \
262                                                                         \
263     if (!SIZE_ONLY)                                                     \
264         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
265     if (len > RegexLengthToShowInErrorMessages) {                       \
266         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
267         len = RegexLengthToShowInErrorMessages - 10;                    \
268         ellipses = "...";                                               \
269     }                                                                   \
270     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
271             msg, (int)len, RExC_precomp, ellipses);                     \
272 } STMT_END
273
274 /*
275  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276  * args. Show regex, up to a maximum length. If it's too long, chop and add
277  * "...".
278  */
279 #define FAIL2(pat,msg) STMT_START {                                     \
280     char *ellipses = "";                                                \
281     IV len = RExC_end - RExC_precomp;                                   \
282                                                                         \
283     if (!SIZE_ONLY)                                                     \
284         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
285     if (len > RegexLengthToShowInErrorMessages) {                       \
286         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
287         len = RegexLengthToShowInErrorMessages - 10;                    \
288         ellipses = "...";                                               \
289     }                                                                   \
290     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
291             msg, (int)len, RExC_precomp, ellipses);                     \
292 } STMT_END
293
294
295 /*
296  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
297  */
298 #define Simple_vFAIL(m) STMT_START {                                    \
299     IV offset = RExC_parse - RExC_precomp;                              \
300     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
301             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
302 } STMT_END
303
304 /*
305  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
306  */
307 #define vFAIL(m) STMT_START {                           \
308     if (!SIZE_ONLY)                                     \
309         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
310     Simple_vFAIL(m);                                    \
311 } STMT_END
312
313 /*
314  * Like Simple_vFAIL(), but accepts two arguments.
315  */
316 #define Simple_vFAIL2(m,a1) STMT_START {                        \
317     IV offset = RExC_parse - RExC_precomp;                      \
318     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
319             (int)offset, RExC_precomp, RExC_precomp + offset);  \
320 } STMT_END
321
322 /*
323  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
324  */
325 #define vFAIL2(m,a1) STMT_START {                       \
326     if (!SIZE_ONLY)                                     \
327         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
328     Simple_vFAIL2(m, a1);                               \
329 } STMT_END
330
331
332 /*
333  * Like Simple_vFAIL(), but accepts three arguments.
334  */
335 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
336     IV offset = RExC_parse - RExC_precomp;                      \
337     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
338             (int)offset, RExC_precomp, RExC_precomp + offset);  \
339 } STMT_END
340
341 /*
342  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
343  */
344 #define vFAIL3(m,a1,a2) STMT_START {                    \
345     if (!SIZE_ONLY)                                     \
346         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
347     Simple_vFAIL3(m, a1, a2);                           \
348 } STMT_END
349
350 /*
351  * Like Simple_vFAIL(), but accepts four arguments.
352  */
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
354     IV offset = RExC_parse - RExC_precomp;                      \
355     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
356             (int)offset, RExC_precomp, RExC_precomp + offset);  \
357 } STMT_END
358
359 /*
360  * Like Simple_vFAIL(), but accepts five arguments.
361  */
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
363     IV offset = RExC_parse - RExC_precomp;                      \
364     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
365             (int)offset, RExC_precomp, RExC_precomp + offset);  \
366 } STMT_END
367
368
369 #define vWARN(loc,m) STMT_START {                                       \
370     IV offset = loc - RExC_precomp;                                     \
371     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
372             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
373 } STMT_END
374
375 #define vWARNdep(loc,m) STMT_START {                                    \
376     IV offset = loc - RExC_precomp;                                     \
377     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
378             "%s" REPORT_LOCATION,                                       \
379             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
380 } STMT_END
381
382
383 #define vWARN2(loc, m, a1) STMT_START {                                 \
384     IV offset = loc - RExC_precomp;                                     \
385     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
386             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
387 } STMT_END
388
389 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
390     IV offset = loc - RExC_precomp;                                     \
391     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
392             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
393 } STMT_END
394
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
396     IV offset = loc - RExC_precomp;                                     \
397     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
398             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399 } STMT_END
400
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
402     IV offset = loc - RExC_precomp;                                     \
403     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
404             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405 } STMT_END
406
407
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START {                  \
410     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411 } STMT_END
412
413 /* Macros for recording node offsets.   20001227 mjd@plover.com 
414  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
415  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
416  * Element 0 holds the number n.
417  */
418
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
424     if (! SIZE_ONLY) {                                                  \
425         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
426                 __LINE__, (node), (byte)));                             \
427         if((node) < 0) {                                                \
428             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429         } else {                                                        \
430             RExC_offsets[2*(node)-1] = (byte);                          \
431         }                                                               \
432     }                                                                   \
433 } STMT_END
434
435 #define Set_Node_Offset(node,byte) \
436     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
440     if (! SIZE_ONLY) {                                                  \
441         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
442                 __LINE__, (node), (len)));                              \
443         if((node) < 0) {                                                \
444             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445         } else {                                                        \
446             RExC_offsets[2*(node)] = (len);                             \
447         }                                                               \
448     }                                                                   \
449 } STMT_END
450
451 #define Set_Node_Length(node,len) \
452     Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455     Set_Node_Length(node, RExC_parse - parse_start)
456
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
461 static void clear_re(pTHX_ void *r);
462
463 /* Mark that we cannot extend a found fixed substring at this point.
464    Updata the longest found anchored substring and the longest found
465    floating substrings if needed. */
466
467 STATIC void
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
469 {
470     STRLEN l = CHR_SVLEN(data->last_found);
471     STRLEN old_l = CHR_SVLEN(*data->longest);
472
473     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474         sv_setsv(*data->longest, data->last_found);
475         if (*data->longest == data->longest_fixed) {
476             data->offset_fixed = l ? data->last_start_min : data->pos_min;
477             if (data->flags & SF_BEFORE_EOL)
478                 data->flags
479                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480             else
481                 data->flags &= ~SF_FIX_BEFORE_EOL;
482         }
483         else {
484             data->offset_float_min = l ? data->last_start_min : data->pos_min;
485             data->offset_float_max = (l
486                                       ? data->last_start_max
487                                       : data->pos_min + data->pos_delta);
488             if ((U32)data->offset_float_max > (U32)I32_MAX)
489                 data->offset_float_max = I32_MAX;
490             if (data->flags & SF_BEFORE_EOL)
491                 data->flags
492                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493             else
494                 data->flags &= ~SF_FL_BEFORE_EOL;
495         }
496     }
497     SvCUR_set(data->last_found, 0);
498     data->last_end = -1;
499     data->flags &= ~SF_BEFORE_EOL;
500 }
501
502 /* Can match anything (initialization) */
503 STATIC void
504 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
505 {
506     ANYOF_CLASS_ZERO(cl);
507     ANYOF_BITMAP_SETALL(cl);
508     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
509     if (LOC)
510         cl->flags |= ANYOF_LOCALE;
511 }
512
513 /* Can match anything (initialization) */
514 STATIC int
515 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
516 {
517     int value;
518
519     for (value = 0; value <= ANYOF_MAX; value += 2)
520         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
521             return 1;
522     if (!(cl->flags & ANYOF_UNICODE_ALL))
523         return 0;
524     if (!ANYOF_BITMAP_TESTALLSET(cl))
525         return 0;
526     return 1;
527 }
528
529 /* Can match anything (initialization) */
530 STATIC void
531 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 {
533     Zero(cl, 1, struct regnode_charclass_class);
534     cl->type = ANYOF;
535     cl_anything(pRExC_state, cl);
536 }
537
538 STATIC void
539 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 {
541     Zero(cl, 1, struct regnode_charclass_class);
542     cl->type = ANYOF;
543     cl_anything(pRExC_state, cl);
544     if (LOC)
545         cl->flags |= ANYOF_LOCALE;
546 }
547
548 /* 'And' a given class with another one.  Can create false positives */
549 /* We assume that cl is not inverted */
550 STATIC void
551 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
552          struct regnode_charclass_class *and_with)
553 {
554     if (!(and_with->flags & ANYOF_CLASS)
555         && !(cl->flags & ANYOF_CLASS)
556         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
557         && !(and_with->flags & ANYOF_FOLD)
558         && !(cl->flags & ANYOF_FOLD)) {
559         int i;
560
561         if (and_with->flags & ANYOF_INVERT)
562             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
563                 cl->bitmap[i] &= ~and_with->bitmap[i];
564         else
565             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
566                 cl->bitmap[i] &= and_with->bitmap[i];
567     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
568     if (!(and_with->flags & ANYOF_EOS))
569         cl->flags &= ~ANYOF_EOS;
570
571     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
572         cl->flags &= ~ANYOF_UNICODE_ALL;
573         cl->flags |= ANYOF_UNICODE;
574         ARG_SET(cl, ARG(and_with));
575     }
576     if (!(and_with->flags & ANYOF_UNICODE_ALL))
577         cl->flags &= ~ANYOF_UNICODE_ALL;
578     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
579         cl->flags &= ~ANYOF_UNICODE;
580 }
581
582 /* 'OR' a given class with another one.  Can create false positives */
583 /* We assume that cl is not inverted */
584 STATIC void
585 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
586 {
587     if (or_with->flags & ANYOF_INVERT) {
588         /* We do not use
589          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
590          *   <= (B1 | !B2) | (CL1 | !CL2)
591          * which is wasteful if CL2 is small, but we ignore CL2:
592          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
593          * XXXX Can we handle case-fold?  Unclear:
594          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
595          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
596          */
597         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
598              && !(or_with->flags & ANYOF_FOLD)
599              && !(cl->flags & ANYOF_FOLD) ) {
600             int i;
601
602             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
603                 cl->bitmap[i] |= ~or_with->bitmap[i];
604         } /* XXXX: logic is complicated otherwise */
605         else {
606             cl_anything(pRExC_state, cl);
607         }
608     } else {
609         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
610         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
611              && (!(or_with->flags & ANYOF_FOLD)
612                  || (cl->flags & ANYOF_FOLD)) ) {
613             int i;
614
615             /* OR char bitmap and class bitmap separately */
616             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
617                 cl->bitmap[i] |= or_with->bitmap[i];
618             if (or_with->flags & ANYOF_CLASS) {
619                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
620                     cl->classflags[i] |= or_with->classflags[i];
621                 cl->flags |= ANYOF_CLASS;
622             }
623         }
624         else { /* XXXX: logic is complicated, leave it along for a moment. */
625             cl_anything(pRExC_state, cl);
626         }
627     }
628     if (or_with->flags & ANYOF_EOS)
629         cl->flags |= ANYOF_EOS;
630
631     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
632         ARG(cl) != ARG(or_with)) {
633         cl->flags |= ANYOF_UNICODE_ALL;
634         cl->flags &= ~ANYOF_UNICODE;
635     }
636     if (or_with->flags & ANYOF_UNICODE_ALL) {
637         cl->flags |= ANYOF_UNICODE_ALL;
638         cl->flags &= ~ANYOF_UNICODE;
639     }
640 }
641
642 /*
643  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
644  * These need to be revisited when a newer toolchain becomes available.
645  */
646 #if defined(__sparc64__) && defined(__GNUC__)
647 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
648 #       undef  SPARC64_GCC_WORKAROUND
649 #       define SPARC64_GCC_WORKAROUND 1
650 #   endif
651 #endif
652
653 /* REx optimizer.  Converts nodes into quickier variants "in place".
654    Finds fixed substrings.  */
655
656 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
657    to the position after last scanned or to NULL. */
658
659 STATIC I32
660 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
661                         /* scanp: Start here (read-write). */
662                         /* deltap: Write maxlen-minlen here. */
663                         /* last: Stop before this one. */
664 {
665     I32 min = 0, pars = 0, code;
666     regnode *scan = *scanp, *next;
667     I32 delta = 0;
668     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
669     int is_inf_internal = 0;            /* The studied chunk is infinite */
670     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
671     scan_data_t data_fake;
672     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
673
674     while (scan && OP(scan) != END && scan < last) {
675         /* Peephole optimizer: */
676
677         if (PL_regkind[(U8)OP(scan)] == EXACT) {
678             /* Merge several consecutive EXACTish nodes into one. */
679             regnode *n = regnext(scan);
680             U32 stringok = 1;
681 #ifdef DEBUGGING
682             regnode *stop = scan;
683 #endif
684
685             next = scan + NODE_SZ_STR(scan);
686             /* Skip NOTHING, merge EXACT*. */
687             while (n &&
688                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
689                      (stringok && (OP(n) == OP(scan))))
690                    && NEXT_OFF(n)
691                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
692                 if (OP(n) == TAIL || n > next)
693                     stringok = 0;
694                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
695                     NEXT_OFF(scan) += NEXT_OFF(n);
696                     next = n + NODE_STEP_REGNODE;
697 #ifdef DEBUGGING
698                     if (stringok)
699                         stop = n;
700 #endif
701                     n = regnext(n);
702                 }
703                 else if (stringok) {
704                     int oldl = STR_LEN(scan);
705                     regnode *nnext = regnext(n);
706
707                     if (oldl + STR_LEN(n) > U8_MAX)
708                         break;
709                     NEXT_OFF(scan) += NEXT_OFF(n);
710                     STR_LEN(scan) += STR_LEN(n);
711                     next = n + NODE_SZ_STR(n);
712                     /* Now we can overwrite *n : */
713                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
714 #ifdef DEBUGGING
715                     stop = next - 1;
716 #endif
717                     n = nnext;
718                 }
719             }
720
721             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
722 /*
723   Two problematic code points in Unicode casefolding of EXACT nodes:
724
725    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
726    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
727
728    which casefold to
729
730    Unicode                      UTF-8
731
732    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
733    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
734
735    This means that in case-insensitive matching (or "loose matching",
736    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
737    length of the above casefolded versions) can match a target string
738    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
739    This would rather mess up the minimum length computation.
740
741    What we'll do is to look for the tail four bytes, and then peek
742    at the preceding two bytes to see whether we need to decrease
743    the minimum length by four (six minus two).
744
745    Thanks to the design of UTF-8, there cannot be false matches:
746    A sequence of valid UTF-8 bytes cannot be a subsequence of
747    another valid sequence of UTF-8 bytes.
748
749 */
750                  char *s0 = STRING(scan), *s, *t;
751                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
752                  char *t0 = "\xcc\x88\xcc\x81";
753                  char *t1 = t0 + 3;
754                  
755                  for (s = s0 + 2;
756                       s < s2 && (t = ninstr(s, s1, t0, t1));
757                       s = t + 4) {
758                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
759                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
760                            min -= 4;
761                  }
762             }
763
764 #ifdef DEBUGGING
765             /* Allow dumping */
766             n = scan + NODE_SZ_STR(scan);
767             while (n <= stop) {
768                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
769                     OP(n) = OPTIMIZED;
770                     NEXT_OFF(n) = 0;
771                 }
772                 n++;
773             }
774 #endif
775         }
776         /* Follow the next-chain of the current node and optimize
777            away all the NOTHINGs from it.  */
778         if (OP(scan) != CURLYX) {
779             int max = (reg_off_by_arg[OP(scan)]
780                        ? I32_MAX
781                        /* I32 may be smaller than U16 on CRAYs! */
782                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
783             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
784             int noff;
785             regnode *n = scan;
786         
787             /* Skip NOTHING and LONGJMP. */
788             while ((n = regnext(n))
789                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
790                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
791                    && off + noff < max)
792                 off += noff;
793             if (reg_off_by_arg[OP(scan)])
794                 ARG(scan) = off;
795             else
796                 NEXT_OFF(scan) = off;
797         }
798         /* The principal pseudo-switch.  Cannot be a switch, since we
799            look into several different things.  */
800         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
801                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
802             next = regnext(scan);
803             code = OP(scan);
804         
805             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
806                 I32 max1 = 0, min1 = I32_MAX, num = 0;
807                 struct regnode_charclass_class accum;
808                 
809                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
810                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
811                 if (flags & SCF_DO_STCLASS)
812                     cl_init_zero(pRExC_state, &accum);
813                 while (OP(scan) == code) {
814                     I32 deltanext, minnext, f = 0, fake;
815                     struct regnode_charclass_class this_class;
816
817                     num++;
818                     data_fake.flags = 0;
819                     if (data) {         
820                         data_fake.whilem_c = data->whilem_c;
821                         data_fake.last_closep = data->last_closep;
822                     }
823                     else
824                         data_fake.last_closep = &fake;
825                     next = regnext(scan);
826                     scan = NEXTOPER(scan);
827                     if (code != BRANCH)
828                         scan = NEXTOPER(scan);
829                     if (flags & SCF_DO_STCLASS) {
830                         cl_init(pRExC_state, &this_class);
831                         data_fake.start_class = &this_class;
832                         f = SCF_DO_STCLASS_AND;
833                     }           
834                     if (flags & SCF_WHILEM_VISITED_POS)
835                         f |= SCF_WHILEM_VISITED_POS;
836                     /* we suppose the run is continuous, last=next...*/
837                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
838                                           next, &data_fake, f);
839                     if (min1 > minnext)
840                         min1 = minnext;
841                     if (max1 < minnext + deltanext)
842                         max1 = minnext + deltanext;
843                     if (deltanext == I32_MAX)
844                         is_inf = is_inf_internal = 1;
845                     scan = next;
846                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
847                         pars++;
848                     if (data && (data_fake.flags & SF_HAS_EVAL))
849                         data->flags |= SF_HAS_EVAL;
850                     if (data)
851                         data->whilem_c = data_fake.whilem_c;
852                     if (flags & SCF_DO_STCLASS)
853                         cl_or(pRExC_state, &accum, &this_class);
854                     if (code == SUSPEND)
855                         break;
856                 }
857                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
858                     min1 = 0;
859                 if (flags & SCF_DO_SUBSTR) {
860                     data->pos_min += min1;
861                     data->pos_delta += max1 - min1;
862                     if (max1 != min1 || is_inf)
863                         data->longest = &(data->longest_float);
864                 }
865                 min += min1;
866                 delta += max1 - min1;
867                 if (flags & SCF_DO_STCLASS_OR) {
868                     cl_or(pRExC_state, data->start_class, &accum);
869                     if (min1) {
870                         cl_and(data->start_class, &and_with);
871                         flags &= ~SCF_DO_STCLASS;
872                     }
873                 }
874                 else if (flags & SCF_DO_STCLASS_AND) {
875                     if (min1) {
876                         cl_and(data->start_class, &accum);
877                         flags &= ~SCF_DO_STCLASS;
878                     }
879                     else {
880                         /* Switch to OR mode: cache the old value of
881                          * data->start_class */
882                         StructCopy(data->start_class, &and_with,
883                                    struct regnode_charclass_class);
884                         flags &= ~SCF_DO_STCLASS_AND;
885                         StructCopy(&accum, data->start_class,
886                                    struct regnode_charclass_class);
887                         flags |= SCF_DO_STCLASS_OR;
888                         data->start_class->flags |= ANYOF_EOS;
889                     }
890                 }
891             }
892             else if (code == BRANCHJ)   /* single branch is optimized. */
893                 scan = NEXTOPER(NEXTOPER(scan));
894             else                        /* single branch is optimized. */
895                 scan = NEXTOPER(scan);
896             continue;
897         }
898         else if (OP(scan) == EXACT) {
899             I32 l = STR_LEN(scan);
900             UV uc = *((U8*)STRING(scan));
901             if (UTF) {
902                 U8 *s = (U8*)STRING(scan);
903                 l = utf8_length(s, s + l);
904                 uc = utf8_to_uvchr(s, NULL);
905             }
906             min += l;
907             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
908                 /* The code below prefers earlier match for fixed
909                    offset, later match for variable offset.  */
910                 if (data->last_end == -1) { /* Update the start info. */
911                     data->last_start_min = data->pos_min;
912                     data->last_start_max = is_inf
913                         ? I32_MAX : data->pos_min + data->pos_delta;
914                 }
915                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
916                 if (UTF)
917                     SvUTF8_on(data->last_found);
918                 data->last_end = data->pos_min + l;
919                 data->pos_min += l; /* As in the first entry. */
920                 data->flags &= ~SF_BEFORE_EOL;
921             }
922             if (flags & SCF_DO_STCLASS_AND) {
923                 /* Check whether it is compatible with what we know already! */
924                 int compat = 1;
925
926                 if (uc >= 0x100 ||
927                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
928                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
929                     && (!(data->start_class->flags & ANYOF_FOLD)
930                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
931                     )
932                     compat = 0;
933                 ANYOF_CLASS_ZERO(data->start_class);
934                 ANYOF_BITMAP_ZERO(data->start_class);
935                 if (compat)
936                     ANYOF_BITMAP_SET(data->start_class, uc);
937                 data->start_class->flags &= ~ANYOF_EOS;
938                 if (uc < 0x100)
939                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
940             }
941             else if (flags & SCF_DO_STCLASS_OR) {
942                 /* false positive possible if the class is case-folded */
943                 if (uc < 0x100)
944                     ANYOF_BITMAP_SET(data->start_class, uc);
945                 else
946                     data->start_class->flags |= ANYOF_UNICODE_ALL;
947                 data->start_class->flags &= ~ANYOF_EOS;
948                 cl_and(data->start_class, &and_with);
949             }
950             flags &= ~SCF_DO_STCLASS;
951         }
952         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
953             I32 l = STR_LEN(scan);
954             UV uc = *((U8*)STRING(scan));
955
956             /* Search for fixed substrings supports EXACT only. */
957             if (flags & SCF_DO_SUBSTR)
958                 scan_commit(pRExC_state, data);
959             if (UTF) {
960                 U8 *s = (U8 *)STRING(scan);
961                 l = utf8_length(s, s + l);
962                 uc = utf8_to_uvchr(s, NULL);
963             }
964             min += l;
965             if (data && (flags & SCF_DO_SUBSTR))
966                 data->pos_min += l;
967             if (flags & SCF_DO_STCLASS_AND) {
968                 /* Check whether it is compatible with what we know already! */
969                 int compat = 1;
970
971                 if (uc >= 0x100 ||
972                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
973                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
974                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
975                     compat = 0;
976                 ANYOF_CLASS_ZERO(data->start_class);
977                 ANYOF_BITMAP_ZERO(data->start_class);
978                 if (compat) {
979                     ANYOF_BITMAP_SET(data->start_class, uc);
980                     data->start_class->flags &= ~ANYOF_EOS;
981                     data->start_class->flags |= ANYOF_FOLD;
982                     if (OP(scan) == EXACTFL)
983                         data->start_class->flags |= ANYOF_LOCALE;
984                 }
985             }
986             else if (flags & SCF_DO_STCLASS_OR) {
987                 if (data->start_class->flags & ANYOF_FOLD) {
988                     /* false positive possible if the class is case-folded.
989                        Assume that the locale settings are the same... */
990                     if (uc < 0x100)
991                         ANYOF_BITMAP_SET(data->start_class, uc);
992                     data->start_class->flags &= ~ANYOF_EOS;
993                 }
994                 cl_and(data->start_class, &and_with);
995             }
996             flags &= ~SCF_DO_STCLASS;
997         }
998         else if (strchr((char*)PL_varies,OP(scan))) {
999             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1000             I32 f = flags, pos_before = 0;
1001             regnode *oscan = scan;
1002             struct regnode_charclass_class this_class;
1003             struct regnode_charclass_class *oclass = NULL;
1004             I32 next_is_eval = 0;
1005
1006             switch (PL_regkind[(U8)OP(scan)]) {
1007             case WHILEM:                /* End of (?:...)* . */
1008                 scan = NEXTOPER(scan);
1009                 goto finish;
1010             case PLUS:
1011                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1012                     next = NEXTOPER(scan);
1013                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1014                         mincount = 1;
1015                         maxcount = REG_INFTY;
1016                         next = regnext(scan);
1017                         scan = NEXTOPER(scan);
1018                         goto do_curly;
1019                     }
1020                 }
1021                 if (flags & SCF_DO_SUBSTR)
1022                     data->pos_min++;
1023                 min++;
1024                 /* Fall through. */
1025             case STAR:
1026                 if (flags & SCF_DO_STCLASS) {
1027                     mincount = 0;
1028                     maxcount = REG_INFTY;
1029                     next = regnext(scan);
1030                     scan = NEXTOPER(scan);
1031                     goto do_curly;
1032                 }
1033                 is_inf = is_inf_internal = 1;
1034                 scan = regnext(scan);
1035                 if (flags & SCF_DO_SUBSTR) {
1036                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1037                     data->longest = &(data->longest_float);
1038                 }
1039                 goto optimize_curly_tail;
1040             case CURLY:
1041                 mincount = ARG1(scan);
1042                 maxcount = ARG2(scan);
1043                 next = regnext(scan);
1044                 if (OP(scan) == CURLYX) {
1045                     I32 lp = (data ? *(data->last_closep) : 0);
1046
1047                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1048                 }
1049                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1050                 next_is_eval = (OP(scan) == EVAL);
1051               do_curly:
1052                 if (flags & SCF_DO_SUBSTR) {
1053                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1054                     pos_before = data->pos_min;
1055                 }
1056                 if (data) {
1057                     fl = data->flags;
1058                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1059                     if (is_inf)
1060                         data->flags |= SF_IS_INF;
1061                 }
1062                 if (flags & SCF_DO_STCLASS) {
1063                     cl_init(pRExC_state, &this_class);
1064                     oclass = data->start_class;
1065                     data->start_class = &this_class;
1066                     f |= SCF_DO_STCLASS_AND;
1067                     f &= ~SCF_DO_STCLASS_OR;
1068                 }
1069                 /* These are the cases when once a subexpression
1070                    fails at a particular position, it cannot succeed
1071                    even after backtracking at the enclosing scope.
1072                 
1073                    XXXX what if minimal match and we are at the
1074                         initial run of {n,m}? */
1075                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1076                     f &= ~SCF_WHILEM_VISITED_POS;
1077
1078                 /* This will finish on WHILEM, setting scan, or on NULL: */
1079                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1080                                       mincount == 0
1081                                         ? (f & ~SCF_DO_SUBSTR) : f);
1082
1083                 if (flags & SCF_DO_STCLASS)
1084                     data->start_class = oclass;
1085                 if (mincount == 0 || minnext == 0) {
1086                     if (flags & SCF_DO_STCLASS_OR) {
1087                         cl_or(pRExC_state, data->start_class, &this_class);
1088                     }
1089                     else if (flags & SCF_DO_STCLASS_AND) {
1090                         /* Switch to OR mode: cache the old value of
1091                          * data->start_class */
1092                         StructCopy(data->start_class, &and_with,
1093                                    struct regnode_charclass_class);
1094                         flags &= ~SCF_DO_STCLASS_AND;
1095                         StructCopy(&this_class, data->start_class,
1096                                    struct regnode_charclass_class);
1097                         flags |= SCF_DO_STCLASS_OR;
1098                         data->start_class->flags |= ANYOF_EOS;
1099                     }
1100                 } else {                /* Non-zero len */
1101                     if (flags & SCF_DO_STCLASS_OR) {
1102                         cl_or(pRExC_state, data->start_class, &this_class);
1103                         cl_and(data->start_class, &and_with);
1104                     }
1105                     else if (flags & SCF_DO_STCLASS_AND)
1106                         cl_and(data->start_class, &this_class);
1107                     flags &= ~SCF_DO_STCLASS;
1108                 }
1109                 if (!scan)              /* It was not CURLYX, but CURLY. */
1110                     scan = next;
1111                 if (ckWARN(WARN_REGEXP)
1112                        /* ? quantifier ok, except for (?{ ... }) */
1113                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1114                     && (minnext == 0) && (deltanext == 0)
1115                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1116                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1117                 {
1118                     vWARN(RExC_parse,
1119                           "Quantifier unexpected on zero-length expression");
1120                 }
1121
1122                 min += minnext * mincount;
1123                 is_inf_internal |= ((maxcount == REG_INFTY
1124                                      && (minnext + deltanext) > 0)
1125                                     || deltanext == I32_MAX);
1126                 is_inf |= is_inf_internal;
1127                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1128
1129                 /* Try powerful optimization CURLYX => CURLYN. */
1130                 if (  OP(oscan) == CURLYX && data
1131                       && data->flags & SF_IN_PAR
1132                       && !(data->flags & SF_HAS_EVAL)
1133                       && !deltanext && minnext == 1 ) {
1134                     /* Try to optimize to CURLYN.  */
1135                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1136                     regnode *nxt1 = nxt;
1137 #ifdef DEBUGGING
1138                     regnode *nxt2;
1139 #endif
1140
1141                     /* Skip open. */
1142                     nxt = regnext(nxt);
1143                     if (!strchr((char*)PL_simple,OP(nxt))
1144                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1145                              && STR_LEN(nxt) == 1))
1146                         goto nogo;
1147 #ifdef DEBUGGING
1148                     nxt2 = nxt;
1149 #endif
1150                     nxt = regnext(nxt);
1151                     if (OP(nxt) != CLOSE)
1152                         goto nogo;
1153                     /* Now we know that nxt2 is the only contents: */
1154                     oscan->flags = (U8)ARG(nxt);
1155                     OP(oscan) = CURLYN;
1156                     OP(nxt1) = NOTHING; /* was OPEN. */
1157 #ifdef DEBUGGING
1158                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1159                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1160                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1161                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1162                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1163                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1164 #endif
1165                 }
1166               nogo:
1167
1168                 /* Try optimization CURLYX => CURLYM. */
1169                 if (  OP(oscan) == CURLYX && data
1170                       && !(data->flags & SF_HAS_PAR)
1171                       && !(data->flags & SF_HAS_EVAL)
1172                       && !deltanext  ) {
1173                     /* XXXX How to optimize if data == 0? */
1174                     /* Optimize to a simpler form.  */
1175                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1176                     regnode *nxt2;
1177
1178                     OP(oscan) = CURLYM;
1179                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1180                             && (OP(nxt2) != WHILEM))
1181                         nxt = nxt2;
1182                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1183                     /* Need to optimize away parenths. */
1184                     if (data->flags & SF_IN_PAR) {
1185                         /* Set the parenth number.  */
1186                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1187
1188                         if (OP(nxt) != CLOSE)
1189                             FAIL("Panic opt close");
1190                         oscan->flags = (U8)ARG(nxt);
1191                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1192                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1193 #ifdef DEBUGGING
1194                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1195                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1196                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1197                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1198 #endif
1199 #if 0
1200                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1201                             regnode *nnxt = regnext(nxt1);
1202                         
1203                             if (nnxt == nxt) {
1204                                 if (reg_off_by_arg[OP(nxt1)])
1205                                     ARG_SET(nxt1, nxt2 - nxt1);
1206                                 else if (nxt2 - nxt1 < U16_MAX)
1207                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1208                                 else
1209                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1210                             }
1211                             nxt1 = nnxt;
1212                         }
1213 #endif
1214                         /* Optimize again: */
1215                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1216                                     NULL, 0);
1217                     }
1218                     else
1219                         oscan->flags = 0;
1220                 }
1221                 else if ((OP(oscan) == CURLYX)
1222                          && (flags & SCF_WHILEM_VISITED_POS)
1223                          /* See the comment on a similar expression above.
1224                             However, this time it not a subexpression
1225                             we care about, but the expression itself. */
1226                          && (maxcount == REG_INFTY)
1227                          && data && ++data->whilem_c < 16) {
1228                     /* This stays as CURLYX, we can put the count/of pair. */
1229                     /* Find WHILEM (as in regexec.c) */
1230                     regnode *nxt = oscan + NEXT_OFF(oscan);
1231
1232                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1233                         nxt += ARG(nxt);
1234                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1235                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1236                 }
1237                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1238                     pars++;
1239                 if (flags & SCF_DO_SUBSTR) {
1240                     SV *last_str = Nullsv;
1241                     int counted = mincount != 0;
1242
1243                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1244 #if defined(SPARC64_GCC_WORKAROUND)
1245                         I32 b = 0;
1246                         STRLEN l = 0;
1247                         char *s = NULL;
1248                         I32 old = 0;
1249
1250                         if (pos_before >= data->last_start_min)
1251                             b = pos_before;
1252                         else
1253                             b = data->last_start_min;
1254
1255                         l = 0;
1256                         s = SvPV(data->last_found, l);
1257                         old = b - data->last_start_min;
1258
1259 #else
1260                         I32 b = pos_before >= data->last_start_min
1261                             ? pos_before : data->last_start_min;
1262                         STRLEN l;
1263                         char *s = SvPV(data->last_found, l);
1264                         I32 old = b - data->last_start_min;
1265 #endif
1266
1267                         if (UTF)
1268                             old = utf8_hop((U8*)s, old) - (U8*)s;
1269                         
1270                         l -= old;
1271                         /* Get the added string: */
1272                         last_str = newSVpvn(s  + old, l);
1273                         if (deltanext == 0 && pos_before == b) {
1274                             /* What was added is a constant string */
1275                             if (mincount > 1) {
1276                                 SvGROW(last_str, (mincount * l) + 1);
1277                                 repeatcpy(SvPVX(last_str) + l,
1278                                           SvPVX(last_str), l, mincount - 1);
1279                                 SvCUR(last_str) *= mincount;
1280                                 /* Add additional parts. */
1281                                 SvCUR_set(data->last_found,
1282                                           SvCUR(data->last_found) - l);
1283                                 sv_catsv(data->last_found, last_str);
1284                                 data->last_end += l * (mincount - 1);
1285                             }
1286                         } else {
1287                             /* start offset must point into the last copy */
1288                             data->last_start_min += minnext * (mincount - 1);
1289                             data->last_start_max += is_inf ? I32_MAX
1290                                 : (maxcount - 1) * (minnext + data->pos_delta);
1291                         }
1292                     }
1293                     /* It is counted once already... */
1294                     data->pos_min += minnext * (mincount - counted);
1295                     data->pos_delta += - counted * deltanext +
1296                         (minnext + deltanext) * maxcount - minnext * mincount;
1297                     if (mincount != maxcount) {
1298                          /* Cannot extend fixed substrings found inside
1299                             the group.  */
1300                         scan_commit(pRExC_state,data);
1301                         if (mincount && last_str) {
1302                             sv_setsv(data->last_found, last_str);
1303                             data->last_end = data->pos_min;
1304                             data->last_start_min =
1305                                 data->pos_min - CHR_SVLEN(last_str);
1306                             data->last_start_max = is_inf
1307                                 ? I32_MAX
1308                                 : data->pos_min + data->pos_delta
1309                                 - CHR_SVLEN(last_str);
1310                         }
1311                         data->longest = &(data->longest_float);
1312                     }
1313                     SvREFCNT_dec(last_str);
1314                 }
1315                 if (data && (fl & SF_HAS_EVAL))
1316                     data->flags |= SF_HAS_EVAL;
1317               optimize_curly_tail:
1318                 if (OP(oscan) != CURLYX) {
1319                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1320                            && NEXT_OFF(next))
1321                         NEXT_OFF(oscan) += NEXT_OFF(next);
1322                 }
1323                 continue;
1324             default:                    /* REF and CLUMP only? */
1325                 if (flags & SCF_DO_SUBSTR) {
1326                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1327                     data->longest = &(data->longest_float);
1328                 }
1329                 is_inf = is_inf_internal = 1;
1330                 if (flags & SCF_DO_STCLASS_OR)
1331                     cl_anything(pRExC_state, data->start_class);
1332                 flags &= ~SCF_DO_STCLASS;
1333                 break;
1334             }
1335         }
1336         else if (strchr((char*)PL_simple,OP(scan))) {
1337             int value = 0;
1338
1339             if (flags & SCF_DO_SUBSTR) {
1340                 scan_commit(pRExC_state,data);
1341                 data->pos_min++;
1342             }
1343             min++;
1344             if (flags & SCF_DO_STCLASS) {
1345                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1346
1347                 /* Some of the logic below assumes that switching
1348                    locale on will only add false positives. */
1349                 switch (PL_regkind[(U8)OP(scan)]) {
1350                 case SANY:
1351                 default:
1352                   do_default:
1353                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1354                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1355                         cl_anything(pRExC_state, data->start_class);
1356                     break;
1357                 case REG_ANY:
1358                     if (OP(scan) == SANY)
1359                         goto do_default;
1360                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1361                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1362                                  || (data->start_class->flags & ANYOF_CLASS));
1363                         cl_anything(pRExC_state, data->start_class);
1364                     }
1365                     if (flags & SCF_DO_STCLASS_AND || !value)
1366                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1367                     break;
1368                 case ANYOF:
1369                     if (flags & SCF_DO_STCLASS_AND)
1370                         cl_and(data->start_class,
1371                                (struct regnode_charclass_class*)scan);
1372                     else
1373                         cl_or(pRExC_state, data->start_class,
1374                               (struct regnode_charclass_class*)scan);
1375                     break;
1376                 case ALNUM:
1377                     if (flags & SCF_DO_STCLASS_AND) {
1378                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1379                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1380                             for (value = 0; value < 256; value++)
1381                                 if (!isALNUM(value))
1382                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1383                         }
1384                     }
1385                     else {
1386                         if (data->start_class->flags & ANYOF_LOCALE)
1387                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1388                         else {
1389                             for (value = 0; value < 256; value++)
1390                                 if (isALNUM(value))
1391                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1392                         }
1393                     }
1394                     break;
1395                 case ALNUML:
1396                     if (flags & SCF_DO_STCLASS_AND) {
1397                         if (data->start_class->flags & ANYOF_LOCALE)
1398                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1399                     }
1400                     else {
1401                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1402                         data->start_class->flags |= ANYOF_LOCALE;
1403                     }
1404                     break;
1405                 case NALNUM:
1406                     if (flags & SCF_DO_STCLASS_AND) {
1407                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1408                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1409                             for (value = 0; value < 256; value++)
1410                                 if (isALNUM(value))
1411                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1412                         }
1413                     }
1414                     else {
1415                         if (data->start_class->flags & ANYOF_LOCALE)
1416                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1417                         else {
1418                             for (value = 0; value < 256; value++)
1419                                 if (!isALNUM(value))
1420                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1421                         }
1422                     }
1423                     break;
1424                 case NALNUML:
1425                     if (flags & SCF_DO_STCLASS_AND) {
1426                         if (data->start_class->flags & ANYOF_LOCALE)
1427                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1428                     }
1429                     else {
1430                         data->start_class->flags |= ANYOF_LOCALE;
1431                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1432                     }
1433                     break;
1434                 case SPACE:
1435                     if (flags & SCF_DO_STCLASS_AND) {
1436                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1437                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1438                             for (value = 0; value < 256; value++)
1439                                 if (!isSPACE(value))
1440                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1441                         }
1442                     }
1443                     else {
1444                         if (data->start_class->flags & ANYOF_LOCALE)
1445                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1446                         else {
1447                             for (value = 0; value < 256; value++)
1448                                 if (isSPACE(value))
1449                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1450                         }
1451                     }
1452                     break;
1453                 case SPACEL:
1454                     if (flags & SCF_DO_STCLASS_AND) {
1455                         if (data->start_class->flags & ANYOF_LOCALE)
1456                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1457                     }
1458                     else {
1459                         data->start_class->flags |= ANYOF_LOCALE;
1460                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1461                     }
1462                     break;
1463                 case NSPACE:
1464                     if (flags & SCF_DO_STCLASS_AND) {
1465                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1466                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1467                             for (value = 0; value < 256; value++)
1468                                 if (isSPACE(value))
1469                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1470                         }
1471                     }
1472                     else {
1473                         if (data->start_class->flags & ANYOF_LOCALE)
1474                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1475                         else {
1476                             for (value = 0; value < 256; value++)
1477                                 if (!isSPACE(value))
1478                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1479                         }
1480                     }
1481                     break;
1482                 case NSPACEL:
1483                     if (flags & SCF_DO_STCLASS_AND) {
1484                         if (data->start_class->flags & ANYOF_LOCALE) {
1485                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1486                             for (value = 0; value < 256; value++)
1487                                 if (!isSPACE(value))
1488                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1489                         }
1490                     }
1491                     else {
1492                         data->start_class->flags |= ANYOF_LOCALE;
1493                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1494                     }
1495                     break;
1496                 case DIGIT:
1497                     if (flags & SCF_DO_STCLASS_AND) {
1498                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1499                         for (value = 0; value < 256; value++)
1500                             if (!isDIGIT(value))
1501                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1502                     }
1503                     else {
1504                         if (data->start_class->flags & ANYOF_LOCALE)
1505                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1506                         else {
1507                             for (value = 0; value < 256; value++)
1508                                 if (isDIGIT(value))
1509                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1510                         }
1511                     }
1512                     break;
1513                 case NDIGIT:
1514                     if (flags & SCF_DO_STCLASS_AND) {
1515                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1516                         for (value = 0; value < 256; value++)
1517                             if (isDIGIT(value))
1518                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1519                     }
1520                     else {
1521                         if (data->start_class->flags & ANYOF_LOCALE)
1522                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1523                         else {
1524                             for (value = 0; value < 256; value++)
1525                                 if (!isDIGIT(value))
1526                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1527                         }
1528                     }
1529                     break;
1530                 }
1531                 if (flags & SCF_DO_STCLASS_OR)
1532                     cl_and(data->start_class, &and_with);
1533                 flags &= ~SCF_DO_STCLASS;
1534             }
1535         }
1536         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1537             data->flags |= (OP(scan) == MEOL
1538                             ? SF_BEFORE_MEOL
1539                             : SF_BEFORE_SEOL);
1540         }
1541         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1542                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1543                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1544                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1545             /* Lookahead/lookbehind */
1546             I32 deltanext, minnext, fake = 0;
1547             regnode *nscan;
1548             struct regnode_charclass_class intrnl;
1549             int f = 0;
1550
1551             data_fake.flags = 0;
1552             if (data) {         
1553                 data_fake.whilem_c = data->whilem_c;
1554                 data_fake.last_closep = data->last_closep;
1555             }
1556             else
1557                 data_fake.last_closep = &fake;
1558             if ( flags & SCF_DO_STCLASS && !scan->flags
1559                  && OP(scan) == IFMATCH ) { /* Lookahead */
1560                 cl_init(pRExC_state, &intrnl);
1561                 data_fake.start_class = &intrnl;
1562                 f |= SCF_DO_STCLASS_AND;
1563             }
1564             if (flags & SCF_WHILEM_VISITED_POS)
1565                 f |= SCF_WHILEM_VISITED_POS;
1566             next = regnext(scan);
1567             nscan = NEXTOPER(NEXTOPER(scan));
1568             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1569             if (scan->flags) {
1570                 if (deltanext) {
1571                     vFAIL("Variable length lookbehind not implemented");
1572                 }
1573                 else if (minnext > U8_MAX) {
1574                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1575                 }
1576                 scan->flags = (U8)minnext;
1577             }
1578             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1579                 pars++;
1580             if (data && (data_fake.flags & SF_HAS_EVAL))
1581                 data->flags |= SF_HAS_EVAL;
1582             if (data)
1583                 data->whilem_c = data_fake.whilem_c;
1584             if (f & SCF_DO_STCLASS_AND) {
1585                 int was = (data->start_class->flags & ANYOF_EOS);
1586
1587                 cl_and(data->start_class, &intrnl);
1588                 if (was)
1589                     data->start_class->flags |= ANYOF_EOS;
1590             }
1591         }
1592         else if (OP(scan) == OPEN) {
1593             pars++;
1594         }
1595         else if (OP(scan) == CLOSE) {
1596             if ((I32)ARG(scan) == is_par) {
1597                 next = regnext(scan);
1598
1599                 if ( next && (OP(next) != WHILEM) && next < last)
1600                     is_par = 0;         /* Disable optimization */
1601             }
1602             if (data)
1603                 *(data->last_closep) = ARG(scan);
1604         }
1605         else if (OP(scan) == EVAL) {
1606                 if (data)
1607                     data->flags |= SF_HAS_EVAL;
1608         }
1609         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1610                 if (flags & SCF_DO_SUBSTR) {
1611                     scan_commit(pRExC_state,data);
1612                     data->longest = &(data->longest_float);
1613                 }
1614                 is_inf = is_inf_internal = 1;
1615                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1616                     cl_anything(pRExC_state, data->start_class);
1617                 flags &= ~SCF_DO_STCLASS;
1618         }
1619         /* Else: zero-length, ignore. */
1620         scan = regnext(scan);
1621     }
1622
1623   finish:
1624     *scanp = scan;
1625     *deltap = is_inf_internal ? I32_MAX : delta;
1626     if (flags & SCF_DO_SUBSTR && is_inf)
1627         data->pos_delta = I32_MAX - data->pos_min;
1628     if (is_par > U8_MAX)
1629         is_par = 0;
1630     if (is_par && pars==1 && data) {
1631         data->flags |= SF_IN_PAR;
1632         data->flags &= ~SF_HAS_PAR;
1633     }
1634     else if (pars && data) {
1635         data->flags |= SF_HAS_PAR;
1636         data->flags &= ~SF_IN_PAR;
1637     }
1638     if (flags & SCF_DO_STCLASS_OR)
1639         cl_and(data->start_class, &and_with);
1640     return min;
1641 }
1642
1643 STATIC I32
1644 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1645 {
1646     if (RExC_rx->data) {
1647         Renewc(RExC_rx->data,
1648                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1649                char, struct reg_data);
1650         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1651         RExC_rx->data->count += n;
1652     }
1653     else {
1654         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1655              char, struct reg_data);
1656         New(1208, RExC_rx->data->what, n, U8);
1657         RExC_rx->data->count = n;
1658     }
1659     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1660     return RExC_rx->data->count - n;
1661 }
1662
1663 void
1664 Perl_reginitcolors(pTHX)
1665 {
1666     int i = 0;
1667     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1668         
1669     if (s) {
1670         PL_colors[0] = s = savepv(s);
1671         while (++i < 6) {
1672             s = strchr(s, '\t');
1673             if (s) {
1674                 *s = '\0';
1675                 PL_colors[i] = ++s;
1676             }
1677             else
1678                 PL_colors[i] = s = "";
1679         }
1680     } else {
1681         while (i < 6)
1682             PL_colors[i++] = "";
1683     }
1684     PL_colorset = 1;
1685 }
1686
1687
1688 /*
1689  - pregcomp - compile a regular expression into internal code
1690  *
1691  * We can't allocate space until we know how big the compiled form will be,
1692  * but we can't compile it (and thus know how big it is) until we've got a
1693  * place to put the code.  So we cheat:  we compile it twice, once with code
1694  * generation turned off and size counting turned on, and once "for real".
1695  * This also means that we don't allocate space until we are sure that the
1696  * thing really will compile successfully, and we never have to move the
1697  * code and thus invalidate pointers into it.  (Note that it has to be in
1698  * one piece because free() must be able to free it all.) [NB: not true in perl]
1699  *
1700  * Beware that the optimization-preparation code in here knows about some
1701  * of the structure of the compiled regexp.  [I'll say.]
1702  */
1703 regexp *
1704 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1705 {
1706     register regexp *r;
1707     regnode *scan;
1708     regnode *first;
1709     I32 flags;
1710     I32 minlen = 0;
1711     I32 sawplus = 0;
1712     I32 sawopen = 0;
1713     scan_data_t data;
1714     RExC_state_t RExC_state;
1715     RExC_state_t *pRExC_state = &RExC_state;
1716
1717     if (exp == NULL)
1718         FAIL("NULL regexp argument");
1719
1720     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1721
1722     RExC_precomp = exp;
1723     DEBUG_r({
1724          if (!PL_colorset) reginitcolors();
1725          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1726                        PL_colors[4],PL_colors[5],PL_colors[0],
1727                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1728     });
1729     RExC_flags = pm->op_pmflags;
1730     RExC_sawback = 0;
1731
1732     RExC_seen = 0;
1733     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1734     RExC_seen_evals = 0;
1735     RExC_extralen = 0;
1736
1737     /* First pass: determine size, legality. */
1738     RExC_parse = exp;
1739     RExC_start = exp;
1740     RExC_end = xend;
1741     RExC_naughty = 0;
1742     RExC_npar = 1;
1743     RExC_size = 0L;
1744     RExC_emit = &PL_regdummy;
1745     RExC_whilem_seen = 0;
1746 #if 0 /* REGC() is (currently) a NOP at the first pass.
1747        * Clever compilers notice this and complain. --jhi */
1748     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1749 #endif
1750     if (reg(pRExC_state, 0, &flags) == NULL) {
1751         RExC_precomp = Nullch;
1752         return(NULL);
1753     }
1754     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1755
1756     /* Small enough for pointer-storage convention?
1757        If extralen==0, this means that we will not need long jumps. */
1758     if (RExC_size >= 0x10000L && RExC_extralen)
1759         RExC_size += RExC_extralen;
1760     else
1761         RExC_extralen = 0;
1762     if (RExC_whilem_seen > 15)
1763         RExC_whilem_seen = 15;
1764
1765     /* Allocate space and initialize. */
1766     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1767          char, regexp);
1768     if (r == NULL)
1769         FAIL("Regexp out of space");
1770
1771 #ifdef DEBUGGING
1772     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1773     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1774 #endif
1775     r->refcnt = 1;
1776     r->prelen = xend - exp;
1777     r->precomp = savepvn(RExC_precomp, r->prelen);
1778     r->subbeg = NULL;
1779     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1780     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1781
1782     r->substrs = 0;                     /* Useful during FAIL. */
1783     r->startp = 0;                      /* Useful during FAIL. */
1784     r->endp = 0;                        /* Useful during FAIL. */
1785
1786     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1787     if (r->offsets) {
1788       r->offsets[0] = RExC_size; 
1789     }
1790     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1791                           "%s %"UVuf" bytes for offset annotations.\n", 
1792                           r->offsets ? "Got" : "Couldn't get", 
1793                           (UV)((2*RExC_size+1) * sizeof(U32))));
1794
1795     RExC_rx = r;
1796
1797     /* Second pass: emit code. */
1798     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1799     RExC_parse = exp;
1800     RExC_end = xend;
1801     RExC_naughty = 0;
1802     RExC_npar = 1;
1803     RExC_emit_start = r->program;
1804     RExC_emit = r->program;
1805     /* Store the count of eval-groups for security checks: */
1806     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1807     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1808     r->data = 0;
1809     if (reg(pRExC_state, 0, &flags) == NULL)
1810         return(NULL);
1811
1812     /* Dig out information for optimizations. */
1813     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1814     pm->op_pmflags = RExC_flags;
1815     if (UTF)
1816         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1817     r->regstclass = NULL;
1818     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1819         r->reganch |= ROPT_NAUGHTY;
1820     scan = r->program + 1;              /* First BRANCH. */
1821
1822     /* XXXX To minimize changes to RE engine we always allocate
1823        3-units-long substrs field. */
1824     Newz(1004, r->substrs, 1, struct reg_substr_data);
1825
1826     StructCopy(&zero_scan_data, &data, scan_data_t);
1827     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1828     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1829         I32 fake;
1830         STRLEN longest_float_length, longest_fixed_length;
1831         struct regnode_charclass_class ch_class;
1832         int stclass_flag;
1833         I32 last_close = 0;
1834
1835         first = scan;
1836         /* Skip introductions and multiplicators >= 1. */
1837         while ((OP(first) == OPEN && (sawopen = 1)) ||
1838                /* An OR of *one* alternative - should not happen now. */
1839             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1840             (OP(first) == PLUS) ||
1841             (OP(first) == MINMOD) ||
1842                /* An {n,m} with n>0 */
1843             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1844                 if (OP(first) == PLUS)
1845                     sawplus = 1;
1846                 else
1847                     first += regarglen[(U8)OP(first)];
1848                 first = NEXTOPER(first);
1849         }
1850
1851         /* Starting-point info. */
1852       again:
1853         if (PL_regkind[(U8)OP(first)] == EXACT) {
1854             if (OP(first) == EXACT)
1855                 ;       /* Empty, get anchored substr later. */
1856             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1857                 r->regstclass = first;
1858         }
1859         else if (strchr((char*)PL_simple,OP(first)))
1860             r->regstclass = first;
1861         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1862                  PL_regkind[(U8)OP(first)] == NBOUND)
1863             r->regstclass = first;
1864         else if (PL_regkind[(U8)OP(first)] == BOL) {
1865             r->reganch |= (OP(first) == MBOL
1866                            ? ROPT_ANCH_MBOL
1867                            : (OP(first) == SBOL
1868                               ? ROPT_ANCH_SBOL
1869                               : ROPT_ANCH_BOL));
1870             first = NEXTOPER(first);
1871             goto again;
1872         }
1873         else if (OP(first) == GPOS) {
1874             r->reganch |= ROPT_ANCH_GPOS;
1875             first = NEXTOPER(first);
1876             goto again;
1877         }
1878         else if (!sawopen && (OP(first) == STAR &&
1879             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1880             !(r->reganch & ROPT_ANCH) )
1881         {
1882             /* turn .* into ^.* with an implied $*=1 */
1883             int type = OP(NEXTOPER(first));
1884
1885             if (type == REG_ANY)
1886                 type = ROPT_ANCH_MBOL;
1887             else
1888                 type = ROPT_ANCH_SBOL;
1889
1890             r->reganch |= type | ROPT_IMPLICIT;
1891             first = NEXTOPER(first);
1892             goto again;
1893         }
1894         if (sawplus && (!sawopen || !RExC_sawback)
1895             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1896             /* x+ must match at the 1st pos of run of x's */
1897             r->reganch |= ROPT_SKIP;
1898
1899         /* Scan is after the zeroth branch, first is atomic matcher. */
1900         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1901                               (IV)(first - scan + 1)));
1902         /*
1903         * If there's something expensive in the r.e., find the
1904         * longest literal string that must appear and make it the
1905         * regmust.  Resolve ties in favor of later strings, since
1906         * the regstart check works with the beginning of the r.e.
1907         * and avoiding duplication strengthens checking.  Not a
1908         * strong reason, but sufficient in the absence of others.
1909         * [Now we resolve ties in favor of the earlier string if
1910         * it happens that c_offset_min has been invalidated, since the
1911         * earlier string may buy us something the later one won't.]
1912         */
1913         minlen = 0;
1914
1915         data.longest_fixed = newSVpvn("",0);
1916         data.longest_float = newSVpvn("",0);
1917         data.last_found = newSVpvn("",0);
1918         data.longest = &(data.longest_fixed);
1919         first = scan;
1920         if (!r->regstclass) {
1921             cl_init(pRExC_state, &ch_class);
1922             data.start_class = &ch_class;
1923             stclass_flag = SCF_DO_STCLASS_AND;
1924         } else                          /* XXXX Check for BOUND? */
1925             stclass_flag = 0;
1926         data.last_closep = &last_close;
1927
1928         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1929                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1930         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1931              && data.last_start_min == 0 && data.last_end > 0
1932              && !RExC_seen_zerolen
1933              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1934             r->reganch |= ROPT_CHECK_ALL;
1935         scan_commit(pRExC_state, &data);
1936         SvREFCNT_dec(data.last_found);
1937
1938         longest_float_length = CHR_SVLEN(data.longest_float);
1939         if (longest_float_length
1940             || (data.flags & SF_FL_BEFORE_EOL
1941                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1942                     || (RExC_flags & PMf_MULTILINE)))) {
1943             int t;
1944
1945             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1946                 && data.offset_fixed == data.offset_float_min
1947                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1948                     goto remove_float;          /* As in (a)+. */
1949
1950             if (SvUTF8(data.longest_float)) {
1951                 r->float_utf8 = data.longest_float;
1952                 r->float_substr = Nullsv;
1953             } else {
1954                 r->float_substr = data.longest_float;
1955                 r->float_utf8 = Nullsv;
1956             }
1957             r->float_min_offset = data.offset_float_min;
1958             r->float_max_offset = data.offset_float_max;
1959             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1960                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1961                            || (RExC_flags & PMf_MULTILINE)));
1962             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1963         }
1964         else {
1965           remove_float:
1966             r->float_substr = r->float_utf8 = Nullsv;
1967             SvREFCNT_dec(data.longest_float);
1968             longest_float_length = 0;
1969         }
1970
1971         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1972         if (longest_fixed_length
1973             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1974                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1975                     || (RExC_flags & PMf_MULTILINE)))) {
1976             int t;
1977
1978             if (SvUTF8(data.longest_fixed)) {
1979                 r->anchored_utf8 = data.longest_fixed;
1980                 r->anchored_substr = Nullsv;
1981             } else {
1982                 r->anchored_substr = data.longest_fixed;
1983                 r->anchored_utf8 = Nullsv;
1984             }
1985             r->anchored_offset = data.offset_fixed;
1986             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1987                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
1988                      || (RExC_flags & PMf_MULTILINE)));
1989             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
1990         }
1991         else {
1992             r->anchored_substr = r->anchored_utf8 = Nullsv;
1993             SvREFCNT_dec(data.longest_fixed);
1994             longest_fixed_length = 0;
1995         }
1996         if (r->regstclass
1997             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
1998             r->regstclass = NULL;
1999         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2000             && stclass_flag
2001             && !(data.start_class->flags & ANYOF_EOS)
2002             && !cl_is_anything(data.start_class))
2003         {
2004             I32 n = add_data(pRExC_state, 1, "f");
2005
2006             New(1006, RExC_rx->data->data[n], 1,
2007                 struct regnode_charclass_class);
2008             StructCopy(data.start_class,
2009                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2010                        struct regnode_charclass_class);
2011             r->regstclass = (regnode*)RExC_rx->data->data[n];
2012             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2013             PL_regdata = r->data; /* for regprop() */
2014             DEBUG_r({ SV *sv = sv_newmortal();
2015                       regprop(sv, (regnode*)data.start_class);
2016                       PerlIO_printf(Perl_debug_log,
2017                                     "synthetic stclass `%s'.\n",
2018                                     SvPVX(sv));});
2019         }
2020
2021         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2022         if (longest_fixed_length > longest_float_length) {
2023             r->check_substr = r->anchored_substr;
2024             r->check_utf8 = r->anchored_utf8;
2025             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2026             if (r->reganch & ROPT_ANCH_SINGLE)
2027                 r->reganch |= ROPT_NOSCAN;
2028         }
2029         else {
2030             r->check_substr = r->float_substr;
2031             r->check_utf8 = r->float_utf8;
2032             r->check_offset_min = data.offset_float_min;
2033             r->check_offset_max = data.offset_float_max;
2034         }
2035         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2036            This should be changed ASAP!  */
2037         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2038             r->reganch |= RE_USE_INTUIT;
2039             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2040                 r->reganch |= RE_INTUIT_TAIL;
2041         }
2042     }
2043     else {
2044         /* Several toplevels. Best we can is to set minlen. */
2045         I32 fake;
2046         struct regnode_charclass_class ch_class;
2047         I32 last_close = 0;
2048         
2049         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2050         scan = r->program + 1;
2051         cl_init(pRExC_state, &ch_class);
2052         data.start_class = &ch_class;
2053         data.last_closep = &last_close;
2054         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2055         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2056                 = r->float_substr = r->float_utf8 = Nullsv;
2057         if (!(data.start_class->flags & ANYOF_EOS)
2058             && !cl_is_anything(data.start_class))
2059         {
2060             I32 n = add_data(pRExC_state, 1, "f");
2061
2062             New(1006, RExC_rx->data->data[n], 1,
2063                 struct regnode_charclass_class);
2064             StructCopy(data.start_class,
2065                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2066                        struct regnode_charclass_class);
2067             r->regstclass = (regnode*)RExC_rx->data->data[n];
2068             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2069             DEBUG_r({ SV* sv = sv_newmortal();
2070                       regprop(sv, (regnode*)data.start_class);
2071                       PerlIO_printf(Perl_debug_log,
2072                                     "synthetic stclass `%s'.\n",
2073                                     SvPVX(sv));});
2074         }
2075     }
2076
2077     r->minlen = minlen;
2078     if (RExC_seen & REG_SEEN_GPOS)
2079         r->reganch |= ROPT_GPOS_SEEN;
2080     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2081         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2082     if (RExC_seen & REG_SEEN_EVAL)
2083         r->reganch |= ROPT_EVAL_SEEN;
2084     if (RExC_seen & REG_SEEN_CANY)
2085         r->reganch |= ROPT_CANY_SEEN;
2086     Newz(1002, r->startp, RExC_npar, I32);
2087     Newz(1002, r->endp, RExC_npar, I32);
2088     PL_regdata = r->data; /* for regprop() */
2089     DEBUG_r(regdump(r));
2090     return(r);
2091 }
2092
2093 /*
2094  - reg - regular expression, i.e. main body or parenthesized thing
2095  *
2096  * Caller must absorb opening parenthesis.
2097  *
2098  * Combining parenthesis handling with the base level of regular expression
2099  * is a trifle forced, but the need to tie the tails of the branches to what
2100  * follows makes it hard to avoid.
2101  */
2102 STATIC regnode *
2103 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2104     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2105 {
2106     register regnode *ret;              /* Will be the head of the group. */
2107     register regnode *br;
2108     register regnode *lastbr;
2109     register regnode *ender = 0;
2110     register I32 parno = 0;
2111     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2112
2113     /* for (?g), (?gc), and (?o) warnings; warning
2114        about (?c) will warn about (?g) -- japhy    */
2115
2116     I32 wastedflags = 0x00,
2117         wasted_o    = 0x01,
2118         wasted_g    = 0x02,
2119         wasted_gc   = 0x02 | 0x04,
2120         wasted_c    = 0x04;
2121
2122     char * parse_start = RExC_parse; /* MJD */
2123     char *oregcomp_parse = RExC_parse;
2124     char c;
2125
2126     *flagp = 0;                         /* Tentatively. */
2127
2128
2129     /* Make an OPEN node, if parenthesized. */
2130     if (paren) {
2131         if (*RExC_parse == '?') { /* (?...) */
2132             U32 posflags = 0, negflags = 0;
2133             U32 *flagsp = &posflags;
2134             int logical = 0;
2135             char *seqstart = RExC_parse;
2136
2137             RExC_parse++;
2138             paren = *RExC_parse++;
2139             ret = NULL;                 /* For look-ahead/behind. */
2140             switch (paren) {
2141             case '<':           /* (?<...) */
2142                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2143                 if (*RExC_parse == '!')
2144                     paren = ',';
2145                 if (*RExC_parse != '=' && *RExC_parse != '!')
2146                     goto unknown;
2147                 RExC_parse++;
2148             case '=':           /* (?=...) */
2149             case '!':           /* (?!...) */
2150                 RExC_seen_zerolen++;
2151             case ':':           /* (?:...) */
2152             case '>':           /* (?>...) */
2153                 break;
2154             case '$':           /* (?$...) */
2155             case '@':           /* (?@...) */
2156                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2157                 break;
2158             case '#':           /* (?#...) */
2159                 while (*RExC_parse && *RExC_parse != ')')
2160                     RExC_parse++;
2161                 if (*RExC_parse != ')')
2162                     FAIL("Sequence (?#... not terminated");
2163                 nextchar(pRExC_state);
2164                 *flagp = TRYAGAIN;
2165                 return NULL;
2166             case 'p':           /* (?p...) */
2167                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2168                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2169                 /* FALL THROUGH*/
2170             case '?':           /* (??...) */
2171                 logical = 1;
2172                 if (*RExC_parse != '{')
2173                     goto unknown;
2174                 paren = *RExC_parse++;
2175                 /* FALL THROUGH */
2176             case '{':           /* (?{...}) */
2177             {
2178                 I32 count = 1, n = 0;
2179                 char c;
2180                 char *s = RExC_parse;
2181                 SV *sv;
2182                 OP_4tree *sop, *rop;
2183
2184                 RExC_seen_zerolen++;
2185                 RExC_seen |= REG_SEEN_EVAL;
2186                 while (count && (c = *RExC_parse)) {
2187                     if (c == '\\' && RExC_parse[1])
2188                         RExC_parse++;
2189                     else if (c == '{')
2190                         count++;
2191                     else if (c == '}')
2192                         count--;
2193                     RExC_parse++;
2194                 }
2195                 if (*RExC_parse != ')')
2196                 {
2197                     RExC_parse = s;             
2198                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2199                 }
2200                 if (!SIZE_ONLY) {
2201                     PAD *pad;
2202                 
2203                     if (RExC_parse - 1 - s)
2204                         sv = newSVpvn(s, RExC_parse - 1 - s);
2205                     else
2206                         sv = newSVpvn("", 0);
2207
2208                     ENTER;
2209                     Perl_save_re_context(aTHX);
2210                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2211                     sop->op_private |= OPpREFCOUNTED;
2212                     /* re_dup will OpREFCNT_inc */
2213                     OpREFCNT_set(sop, 1);
2214                     LEAVE;
2215
2216                     n = add_data(pRExC_state, 3, "nop");
2217                     RExC_rx->data->data[n] = (void*)rop;
2218                     RExC_rx->data->data[n+1] = (void*)sop;
2219                     RExC_rx->data->data[n+2] = (void*)pad;
2220                     SvREFCNT_dec(sv);
2221                 }
2222                 else {                                          /* First pass */
2223                     if (PL_reginterp_cnt < ++RExC_seen_evals
2224                         && PL_curcop != &PL_compiling)
2225                         /* No compiled RE interpolated, has runtime
2226                            components ===> unsafe.  */
2227                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2228                     if (PL_tainting && PL_tainted)
2229                         FAIL("Eval-group in insecure regular expression");
2230                 }
2231                 
2232                 nextchar(pRExC_state);
2233                 if (logical) {
2234                     ret = reg_node(pRExC_state, LOGICAL);
2235                     if (!SIZE_ONLY)
2236                         ret->flags = 2;
2237                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2238                     /* deal with the length of this later - MJD */
2239                     return ret;
2240                 }
2241                 ret = reganode(pRExC_state, EVAL, n);
2242                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2243                 Set_Node_Offset(ret, parse_start);
2244                 return ret;
2245             }
2246             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2247             {
2248                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2249                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2250                         || RExC_parse[1] == '<'
2251                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2252                         I32 flag;
2253                         
2254                         ret = reg_node(pRExC_state, LOGICAL);
2255                         if (!SIZE_ONLY)
2256                             ret->flags = 1;
2257                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2258                         goto insert_if;
2259                     }
2260                 }
2261                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2262                     /* (?(1)...) */
2263                     parno = atoi(RExC_parse++);
2264
2265                     while (isDIGIT(*RExC_parse))
2266                         RExC_parse++;
2267                     ret = reganode(pRExC_state, GROUPP, parno);
2268                     
2269                     if ((c = *nextchar(pRExC_state)) != ')')
2270                         vFAIL("Switch condition not recognized");
2271                   insert_if:
2272                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2273                     br = regbranch(pRExC_state, &flags, 1);
2274                     if (br == NULL)
2275                         br = reganode(pRExC_state, LONGJMP, 0);
2276                     else
2277                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2278                     c = *nextchar(pRExC_state);
2279                     if (flags&HASWIDTH)
2280                         *flagp |= HASWIDTH;
2281                     if (c == '|') {
2282                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2283                         regbranch(pRExC_state, &flags, 1);
2284                         regtail(pRExC_state, ret, lastbr);
2285                         if (flags&HASWIDTH)
2286                             *flagp |= HASWIDTH;
2287                         c = *nextchar(pRExC_state);
2288                     }
2289                     else
2290                         lastbr = NULL;
2291                     if (c != ')')
2292                         vFAIL("Switch (?(condition)... contains too many branches");
2293                     ender = reg_node(pRExC_state, TAIL);
2294                     regtail(pRExC_state, br, ender);
2295                     if (lastbr) {
2296                         regtail(pRExC_state, lastbr, ender);
2297                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2298                     }
2299                     else
2300                         regtail(pRExC_state, ret, ender);
2301                     return ret;
2302                 }
2303                 else {
2304                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2305                 }
2306             }
2307             case 0:
2308                 RExC_parse--; /* for vFAIL to print correctly */
2309                 vFAIL("Sequence (? incomplete");
2310                 break;
2311             default:
2312                 --RExC_parse;
2313               parse_flags:      /* (?i) */
2314                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2315                     /* (?g), (?gc) and (?o) are useless here
2316                        and must be globally applied -- japhy */
2317
2318                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2319                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2320                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2321                             if (! (wastedflags & wflagbit) ) {
2322                                 wastedflags |= wflagbit;
2323                                 vWARN5(
2324                                     RExC_parse + 1,
2325                                     "Useless (%s%c) - %suse /%c modifier",
2326                                     flagsp == &negflags ? "?-" : "?",
2327                                     *RExC_parse,
2328                                     flagsp == &negflags ? "don't " : "",
2329                                     *RExC_parse
2330                                 );
2331                             }
2332                         }
2333                     }
2334                     else if (*RExC_parse == 'c') {
2335                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2336                             if (! (wastedflags & wasted_c) ) {
2337                                 wastedflags |= wasted_gc;
2338                                 vWARN3(
2339                                     RExC_parse + 1,
2340                                     "Useless (%sc) - %suse /gc modifier",
2341                                     flagsp == &negflags ? "?-" : "?",
2342                                     flagsp == &negflags ? "don't " : ""
2343                                 );
2344                             }
2345                         }
2346                     }
2347                     else { pmflag(flagsp, *RExC_parse); }
2348
2349                     ++RExC_parse;
2350                 }
2351                 if (*RExC_parse == '-') {
2352                     flagsp = &negflags;
2353                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2354                     ++RExC_parse;
2355                     goto parse_flags;
2356                 }
2357                 RExC_flags |= posflags;
2358                 RExC_flags &= ~negflags;
2359                 if (*RExC_parse == ':') {
2360                     RExC_parse++;
2361                     paren = ':';
2362                     break;
2363                 }               
2364               unknown:
2365                 if (*RExC_parse != ')') {
2366                     RExC_parse++;
2367                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2368                 }
2369                 nextchar(pRExC_state);
2370                 *flagp = TRYAGAIN;
2371                 return NULL;
2372             }
2373         }
2374         else {                  /* (...) */
2375             parno = RExC_npar;
2376             RExC_npar++;
2377             ret = reganode(pRExC_state, OPEN, parno);
2378             Set_Node_Length(ret, 1); /* MJD */
2379             Set_Node_Offset(ret, RExC_parse); /* MJD */
2380             open = 1;
2381         }
2382     }
2383     else                        /* ! paren */
2384         ret = NULL;
2385
2386     /* Pick up the branches, linking them together. */
2387     parse_start = RExC_parse;   /* MJD */
2388     br = regbranch(pRExC_state, &flags, 1);
2389     /*     branch_len = (paren != 0); */
2390     
2391     if (br == NULL)
2392         return(NULL);
2393     if (*RExC_parse == '|') {
2394         if (!SIZE_ONLY && RExC_extralen) {
2395             reginsert(pRExC_state, BRANCHJ, br);
2396         }
2397         else {                  /* MJD */
2398             reginsert(pRExC_state, BRANCH, br);
2399             Set_Node_Length(br, paren != 0);
2400             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2401         }
2402         have_branch = 1;
2403         if (SIZE_ONLY)
2404             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2405     }
2406     else if (paren == ':') {
2407         *flagp |= flags&SIMPLE;
2408     }
2409     if (open) {                         /* Starts with OPEN. */
2410         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2411     }
2412     else if (paren != '?')              /* Not Conditional */
2413         ret = br;
2414     *flagp |= flags & (SPSTART | HASWIDTH);
2415     lastbr = br;
2416     while (*RExC_parse == '|') {
2417         if (!SIZE_ONLY && RExC_extralen) {
2418             ender = reganode(pRExC_state, LONGJMP,0);
2419             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2420         }
2421         if (SIZE_ONLY)
2422             RExC_extralen += 2;         /* Account for LONGJMP. */
2423         nextchar(pRExC_state);
2424         br = regbranch(pRExC_state, &flags, 0);
2425         
2426         if (br == NULL)
2427             return(NULL);
2428         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2429         lastbr = br;
2430         if (flags&HASWIDTH)
2431             *flagp |= HASWIDTH;
2432         *flagp |= flags&SPSTART;
2433     }
2434
2435     if (have_branch || paren != ':') {
2436         /* Make a closing node, and hook it on the end. */
2437         switch (paren) {
2438         case ':':
2439             ender = reg_node(pRExC_state, TAIL);
2440             break;
2441         case 1:
2442             ender = reganode(pRExC_state, CLOSE, parno);
2443             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2444             Set_Node_Length(ender,1); /* MJD */
2445             break;
2446         case '<':
2447         case ',':
2448         case '=':
2449         case '!':
2450             *flagp &= ~HASWIDTH;
2451             /* FALL THROUGH */
2452         case '>':
2453             ender = reg_node(pRExC_state, SUCCEED);
2454             break;
2455         case 0:
2456             ender = reg_node(pRExC_state, END);
2457             break;
2458         }
2459         regtail(pRExC_state, lastbr, ender);
2460
2461         if (have_branch) {
2462             /* Hook the tails of the branches to the closing node. */
2463             for (br = ret; br != NULL; br = regnext(br)) {
2464                 regoptail(pRExC_state, br, ender);
2465             }
2466         }
2467     }
2468
2469     {
2470         char *p;
2471         static char parens[] = "=!<,>";
2472
2473         if (paren && (p = strchr(parens, paren))) {
2474             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2475             int flag = (p - parens) > 1;
2476
2477             if (paren == '>')
2478                 node = SUSPEND, flag = 0;
2479             reginsert(pRExC_state, node,ret);
2480             Set_Node_Offset(ret, oregcomp_parse);
2481             Set_Node_Length(ret,  RExC_parse - oregcomp_parse + 2);
2482             ret->flags = flag;
2483             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2484         }
2485     }
2486
2487     /* Check for proper termination. */
2488     if (paren) {
2489         RExC_flags = oregflags;
2490         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2491             RExC_parse = oregcomp_parse;
2492             vFAIL("Unmatched (");
2493         }
2494     }
2495     else if (!paren && RExC_parse < RExC_end) {
2496         if (*RExC_parse == ')') {
2497             RExC_parse++;
2498             vFAIL("Unmatched )");
2499         }
2500         else
2501             FAIL("Junk on end of regexp");      /* "Can't happen". */
2502         /* NOTREACHED */
2503     }
2504
2505     return(ret);
2506 }
2507
2508 /*
2509  - regbranch - one alternative of an | operator
2510  *
2511  * Implements the concatenation operator.
2512  */
2513 STATIC regnode *
2514 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2515 {
2516     register regnode *ret;
2517     register regnode *chain = NULL;
2518     register regnode *latest;
2519     I32 flags = 0, c = 0;
2520
2521     if (first)
2522         ret = NULL;
2523     else {
2524         if (!SIZE_ONLY && RExC_extralen)
2525             ret = reganode(pRExC_state, BRANCHJ,0);
2526         else {
2527             ret = reg_node(pRExC_state, BRANCH);
2528             Set_Node_Length(ret, 1);
2529         }
2530     }
2531         
2532     if (!first && SIZE_ONLY)
2533         RExC_extralen += 1;                     /* BRANCHJ */
2534
2535     *flagp = WORST;                     /* Tentatively. */
2536
2537     RExC_parse--;
2538     nextchar(pRExC_state);
2539     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2540         flags &= ~TRYAGAIN;
2541         latest = regpiece(pRExC_state, &flags);
2542         if (latest == NULL) {
2543             if (flags & TRYAGAIN)
2544                 continue;
2545             return(NULL);
2546         }
2547         else if (ret == NULL)
2548             ret = latest;
2549         *flagp |= flags&HASWIDTH;
2550         if (chain == NULL)      /* First piece. */
2551             *flagp |= flags&SPSTART;
2552         else {
2553             RExC_naughty++;
2554             regtail(pRExC_state, chain, latest);
2555         }
2556         chain = latest;
2557         c++;
2558     }
2559     if (chain == NULL) {        /* Loop ran zero times. */
2560         chain = reg_node(pRExC_state, NOTHING);
2561         if (ret == NULL)
2562             ret = chain;
2563     }
2564     if (c == 1) {
2565         *flagp |= flags&SIMPLE;
2566     }
2567
2568     return(ret);
2569 }
2570
2571 /*
2572  - regpiece - something followed by possible [*+?]
2573  *
2574  * Note that the branching code sequences used for ? and the general cases
2575  * of * and + are somewhat optimized:  they use the same NOTHING node as
2576  * both the endmarker for their branch list and the body of the last branch.
2577  * It might seem that this node could be dispensed with entirely, but the
2578  * endmarker role is not redundant.
2579  */
2580 STATIC regnode *
2581 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2582 {
2583     register regnode *ret;
2584     register char op;
2585     register char *next;
2586     I32 flags;
2587     char *origparse = RExC_parse;
2588     char *maxpos;
2589     I32 min;
2590     I32 max = REG_INFTY;
2591     char *parse_start;
2592
2593     ret = regatom(pRExC_state, &flags);
2594     if (ret == NULL) {
2595         if (flags & TRYAGAIN)
2596             *flagp |= TRYAGAIN;
2597         return(NULL);
2598     }
2599
2600     op = *RExC_parse;
2601
2602     if (op == '{' && regcurly(RExC_parse)) {
2603         parse_start = RExC_parse; /* MJD */
2604         next = RExC_parse + 1;
2605         maxpos = Nullch;
2606         while (isDIGIT(*next) || *next == ',') {
2607             if (*next == ',') {
2608                 if (maxpos)
2609                     break;
2610                 else
2611                     maxpos = next;
2612             }
2613             next++;
2614         }
2615         if (*next == '}') {             /* got one */
2616             if (!maxpos)
2617                 maxpos = next;
2618             RExC_parse++;
2619             min = atoi(RExC_parse);
2620             if (*maxpos == ',')
2621                 maxpos++;
2622             else
2623                 maxpos = RExC_parse;
2624             max = atoi(maxpos);
2625             if (!max && *maxpos != '0')
2626                 max = REG_INFTY;                /* meaning "infinity" */
2627             else if (max >= REG_INFTY)
2628                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2629             RExC_parse = next;
2630             nextchar(pRExC_state);
2631
2632         do_curly:
2633             if ((flags&SIMPLE)) {
2634                 RExC_naughty += 2 + RExC_naughty / 2;
2635                 reginsert(pRExC_state, CURLY, ret);
2636                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2637                 Set_Node_Cur_Length(ret);
2638             }
2639             else {
2640                 regnode *w = reg_node(pRExC_state, WHILEM);
2641
2642                 w->flags = 0;
2643                 regtail(pRExC_state, ret, w);
2644                 if (!SIZE_ONLY && RExC_extralen) {
2645                     reginsert(pRExC_state, LONGJMP,ret);
2646                     reginsert(pRExC_state, NOTHING,ret);
2647                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2648                 }
2649                 reginsert(pRExC_state, CURLYX,ret);
2650                                 /* MJD hk */
2651                 Set_Node_Offset(ret, parse_start+1);
2652                 Set_Node_Length(ret, 
2653                                 op == '{' ? (RExC_parse - parse_start) : 1);
2654                 
2655                 if (!SIZE_ONLY && RExC_extralen)
2656                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2657                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2658                 if (SIZE_ONLY)
2659                     RExC_whilem_seen++, RExC_extralen += 3;
2660                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2661             }
2662             ret->flags = 0;
2663
2664             if (min > 0)
2665                 *flagp = WORST;
2666             if (max > 0)
2667                 *flagp |= HASWIDTH;
2668             if (max && max < min)
2669                 vFAIL("Can't do {n,m} with n > m");
2670             if (!SIZE_ONLY) {
2671                 ARG1_SET(ret, (U16)min);
2672                 ARG2_SET(ret, (U16)max);
2673             }
2674
2675             goto nest_check;
2676         }
2677     }
2678
2679     if (!ISMULT1(op)) {
2680         *flagp = flags;
2681         return(ret);
2682     }
2683
2684 #if 0                           /* Now runtime fix should be reliable. */
2685
2686     /* if this is reinstated, don't forget to put this back into perldiag:
2687
2688             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2689
2690            (F) The part of the regexp subject to either the * or + quantifier
2691            could match an empty string. The {#} shows in the regular
2692            expression about where the problem was discovered.
2693
2694     */
2695
2696     if (!(flags&HASWIDTH) && op != '?')
2697       vFAIL("Regexp *+ operand could be empty");
2698 #endif
2699
2700     parse_start = RExC_parse;
2701     nextchar(pRExC_state);
2702
2703     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2704
2705     if (op == '*' && (flags&SIMPLE)) {
2706         reginsert(pRExC_state, STAR, ret);
2707         ret->flags = 0;
2708         RExC_naughty += 4;
2709     }
2710     else if (op == '*') {
2711         min = 0;
2712         goto do_curly;
2713     }
2714     else if (op == '+' && (flags&SIMPLE)) {
2715         reginsert(pRExC_state, PLUS, ret);
2716         ret->flags = 0;
2717         RExC_naughty += 3;
2718     }
2719     else if (op == '+') {
2720         min = 1;
2721         goto do_curly;
2722     }
2723     else if (op == '?') {
2724         min = 0; max = 1;
2725         goto do_curly;
2726     }
2727   nest_check:
2728     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2729         vWARN3(RExC_parse,
2730                "%.*s matches null string many times",
2731                RExC_parse - origparse,
2732                origparse);
2733     }
2734
2735     if (*RExC_parse == '?') {
2736         nextchar(pRExC_state);
2737         reginsert(pRExC_state, MINMOD, ret);
2738         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2739     }
2740     if (ISMULT2(RExC_parse)) {
2741         RExC_parse++;
2742         vFAIL("Nested quantifiers");
2743     }
2744
2745     return(ret);
2746 }
2747
2748 /*
2749  - regatom - the lowest level
2750  *
2751  * Optimization:  gobbles an entire sequence of ordinary characters so that
2752  * it can turn them into a single node, which is smaller to store and
2753  * faster to run.  Backslashed characters are exceptions, each becoming a
2754  * separate node; the code is simpler that way and it's not worth fixing.
2755  *
2756  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2757 STATIC regnode *
2758 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2759 {
2760     register regnode *ret = 0;
2761     I32 flags;
2762     char *parse_start = 0;
2763
2764     *flagp = WORST;             /* Tentatively. */
2765
2766 tryagain:
2767     switch (*RExC_parse) {
2768     case '^':
2769         RExC_seen_zerolen++;
2770         nextchar(pRExC_state);
2771         if (RExC_flags & PMf_MULTILINE)
2772             ret = reg_node(pRExC_state, MBOL);
2773         else if (RExC_flags & PMf_SINGLELINE)
2774             ret = reg_node(pRExC_state, SBOL);
2775         else
2776             ret = reg_node(pRExC_state, BOL);
2777         Set_Node_Length(ret, 1); /* MJD */
2778         break;
2779     case '$':
2780         nextchar(pRExC_state);
2781         if (*RExC_parse)
2782             RExC_seen_zerolen++;
2783         if (RExC_flags & PMf_MULTILINE)
2784             ret = reg_node(pRExC_state, MEOL);
2785         else if (RExC_flags & PMf_SINGLELINE)
2786             ret = reg_node(pRExC_state, SEOL);
2787         else
2788             ret = reg_node(pRExC_state, EOL);
2789         Set_Node_Length(ret, 1); /* MJD */
2790         break;
2791     case '.':
2792         nextchar(pRExC_state);
2793         if (RExC_flags & PMf_SINGLELINE)
2794             ret = reg_node(pRExC_state, SANY);
2795         else
2796             ret = reg_node(pRExC_state, REG_ANY);
2797         *flagp |= HASWIDTH|SIMPLE;
2798         RExC_naughty++;
2799         Set_Node_Length(ret, 1); /* MJD */
2800         break;
2801     case '[':
2802     {
2803         char *oregcomp_parse = ++RExC_parse;
2804         ret = regclass(pRExC_state);
2805         if (*RExC_parse != ']') {
2806             RExC_parse = oregcomp_parse;
2807             vFAIL("Unmatched [");
2808         }
2809         nextchar(pRExC_state);
2810         *flagp |= HASWIDTH|SIMPLE;
2811         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2812         break;
2813     }
2814     case '(':
2815         nextchar(pRExC_state);
2816         ret = reg(pRExC_state, 1, &flags);
2817         if (ret == NULL) {
2818                 if (flags & TRYAGAIN) {
2819                     if (RExC_parse == RExC_end) {
2820                          /* Make parent create an empty node if needed. */
2821                         *flagp |= TRYAGAIN;
2822                         return(NULL);
2823                     }
2824                     goto tryagain;
2825                 }
2826                 return(NULL);
2827         }
2828         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2829         break;
2830     case '|':
2831     case ')':
2832         if (flags & TRYAGAIN) {
2833             *flagp |= TRYAGAIN;
2834             return NULL;
2835         }
2836         vFAIL("Internal urp");
2837                                 /* Supposed to be caught earlier. */
2838         break;
2839     case '{':
2840         if (!regcurly(RExC_parse)) {
2841             RExC_parse++;
2842             goto defchar;
2843         }
2844         /* FALL THROUGH */
2845     case '?':
2846     case '+':
2847     case '*':
2848         RExC_parse++;
2849         vFAIL("Quantifier follows nothing");
2850         break;
2851     case '\\':
2852         switch (*++RExC_parse) {
2853         case 'A':
2854             RExC_seen_zerolen++;
2855             ret = reg_node(pRExC_state, SBOL);
2856             *flagp |= SIMPLE;
2857             nextchar(pRExC_state);
2858             Set_Node_Length(ret, 2); /* MJD */
2859             break;
2860         case 'G':
2861             ret = reg_node(pRExC_state, GPOS);
2862             RExC_seen |= REG_SEEN_GPOS;
2863             *flagp |= SIMPLE;
2864             nextchar(pRExC_state);
2865             Set_Node_Length(ret, 2); /* MJD */
2866             break;
2867         case 'Z':
2868             ret = reg_node(pRExC_state, SEOL);
2869             *flagp |= SIMPLE;
2870             RExC_seen_zerolen++;                /* Do not optimize RE away */
2871             nextchar(pRExC_state);
2872             break;
2873         case 'z':
2874             ret = reg_node(pRExC_state, EOS);
2875             *flagp |= SIMPLE;
2876             RExC_seen_zerolen++;                /* Do not optimize RE away */
2877             nextchar(pRExC_state);
2878             Set_Node_Length(ret, 2); /* MJD */
2879             break;
2880         case 'C':
2881             ret = reg_node(pRExC_state, CANY);
2882             RExC_seen |= REG_SEEN_CANY;
2883             *flagp |= HASWIDTH|SIMPLE;
2884             nextchar(pRExC_state);
2885             Set_Node_Length(ret, 2); /* MJD */
2886             break;
2887         case 'X':
2888             ret = reg_node(pRExC_state, CLUMP);
2889             *flagp |= HASWIDTH;
2890             nextchar(pRExC_state);
2891             Set_Node_Length(ret, 2); /* MJD */
2892             break;
2893         case 'w':
2894             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2895             *flagp |= HASWIDTH|SIMPLE;
2896             nextchar(pRExC_state);
2897             Set_Node_Length(ret, 2); /* MJD */
2898             break;
2899         case 'W':
2900             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2901             *flagp |= HASWIDTH|SIMPLE;
2902             nextchar(pRExC_state);
2903             Set_Node_Length(ret, 2); /* MJD */
2904             break;
2905         case 'b':
2906             RExC_seen_zerolen++;
2907             RExC_seen |= REG_SEEN_LOOKBEHIND;
2908             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2909             *flagp |= SIMPLE;
2910             nextchar(pRExC_state);
2911             Set_Node_Length(ret, 2); /* MJD */
2912             break;
2913         case 'B':
2914             RExC_seen_zerolen++;
2915             RExC_seen |= REG_SEEN_LOOKBEHIND;
2916             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2917             *flagp |= SIMPLE;
2918             nextchar(pRExC_state);
2919             Set_Node_Length(ret, 2); /* MJD */
2920             break;
2921         case 's':
2922             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2923             *flagp |= HASWIDTH|SIMPLE;
2924             nextchar(pRExC_state);
2925             Set_Node_Length(ret, 2); /* MJD */
2926             break;
2927         case 'S':
2928             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2929             *flagp |= HASWIDTH|SIMPLE;
2930             nextchar(pRExC_state);
2931             Set_Node_Length(ret, 2); /* MJD */
2932             break;
2933         case 'd':
2934             ret = reg_node(pRExC_state, DIGIT);
2935             *flagp |= HASWIDTH|SIMPLE;
2936             nextchar(pRExC_state);
2937             Set_Node_Length(ret, 2); /* MJD */
2938             break;
2939         case 'D':
2940             ret = reg_node(pRExC_state, NDIGIT);
2941             *flagp |= HASWIDTH|SIMPLE;
2942             nextchar(pRExC_state);
2943             Set_Node_Length(ret, 2); /* MJD */
2944             break;
2945         case 'p':
2946         case 'P':
2947             {   
2948                 char* oldregxend = RExC_end;
2949                 char* parse_start = RExC_parse - 2;
2950
2951                 if (RExC_parse[1] == '{') {
2952                   /* a lovely hack--pretend we saw [\pX] instead */
2953                     RExC_end = strchr(RExC_parse, '}');
2954                     if (!RExC_end) {
2955                         U8 c = (U8)*RExC_parse;
2956                         RExC_parse += 2;
2957                         RExC_end = oldregxend;
2958                         vFAIL2("Missing right brace on \\%c{}", c);
2959                     }
2960                     RExC_end++;
2961                 }
2962                 else {
2963                     RExC_end = RExC_parse + 2;
2964                     if (RExC_end > oldregxend)
2965                         RExC_end = oldregxend;
2966                 }
2967                 RExC_parse--;
2968
2969                 ret = regclass(pRExC_state);
2970
2971                 RExC_end = oldregxend;
2972                 RExC_parse--;
2973
2974                 Set_Node_Offset(ret, parse_start + 2);
2975                 Set_Node_Cur_Length(ret);
2976                 nextchar(pRExC_state);
2977                 *flagp |= HASWIDTH|SIMPLE;
2978             }
2979             break;
2980         case 'n':
2981         case 'r':
2982         case 't':
2983         case 'f':
2984         case 'e':
2985         case 'a':
2986         case 'x':
2987         case 'c':
2988         case '0':
2989             goto defchar;
2990         case '1': case '2': case '3': case '4':
2991         case '5': case '6': case '7': case '8': case '9':
2992             {
2993                 I32 num = atoi(RExC_parse);
2994
2995                 if (num > 9 && num >= RExC_npar)
2996                     goto defchar;
2997                 else {
2998                     char * parse_start = RExC_parse - 1; /* MJD */
2999                     while (isDIGIT(*RExC_parse))
3000                         RExC_parse++;
3001
3002                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3003                         vFAIL("Reference to nonexistent group");
3004                     RExC_sawback = 1;
3005                     ret = reganode(pRExC_state,
3006                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3007                                    num);
3008                     *flagp |= HASWIDTH;
3009                     
3010                     /* override incorrect value set in reganode MJD */
3011                     Set_Node_Offset(ret, parse_start+1); 
3012                     Set_Node_Cur_Length(ret); /* MJD */
3013                     RExC_parse--;
3014                     nextchar(pRExC_state);
3015                 }
3016             }
3017             break;
3018         case '\0':
3019             if (RExC_parse >= RExC_end)
3020                 FAIL("Trailing \\");
3021             /* FALL THROUGH */
3022         default:
3023             /* Do not generate `unrecognized' warnings here, we fall
3024                back into the quick-grab loop below */
3025             goto defchar;
3026         }
3027         break;
3028
3029     case '#':
3030         if (RExC_flags & PMf_EXTENDED) {
3031             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3032             if (RExC_parse < RExC_end)
3033                 goto tryagain;
3034         }
3035         /* FALL THROUGH */
3036
3037     default: {
3038             register STRLEN len;
3039             register UV ender;
3040             register char *p;
3041             char *oldp, *s;
3042             STRLEN numlen;
3043             STRLEN foldlen;
3044             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3045
3046             parse_start = RExC_parse - 1;
3047
3048             RExC_parse++;
3049
3050         defchar:
3051             ender = 0;
3052             ret = reg_node(pRExC_state,
3053                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3054             s = STRING(ret);
3055             for (len = 0, p = RExC_parse - 1;
3056               len < 127 && p < RExC_end;
3057               len++)
3058             {
3059                 oldp = p;
3060
3061                 if (RExC_flags & PMf_EXTENDED)
3062                     p = regwhite(p, RExC_end);
3063                 switch (*p) {
3064                 case '^':
3065                 case '$':
3066                 case '.':
3067                 case '[':
3068                 case '(':
3069                 case ')':
3070                 case '|':
3071                     goto loopdone;
3072                 case '\\':
3073                     switch (*++p) {
3074                     case 'A':
3075                     case 'C':
3076                     case 'X':
3077                     case 'G':
3078                     case 'Z':
3079                     case 'z':
3080                     case 'w':
3081                     case 'W':
3082                     case 'b':
3083                     case 'B':
3084                     case 's':
3085                     case 'S':
3086                     case 'd':
3087                     case 'D':
3088                     case 'p':
3089                     case 'P':
3090                         --p;
3091                         goto loopdone;
3092                     case 'n':
3093                         ender = '\n';
3094                         p++;
3095                         break;
3096                     case 'r':
3097                         ender = '\r';
3098                         p++;
3099                         break;
3100                     case 't':
3101                         ender = '\t';
3102                         p++;
3103                         break;
3104                     case 'f':
3105                         ender = '\f';
3106                         p++;
3107                         break;
3108                     case 'e':
3109                           ender = ASCII_TO_NATIVE('\033');
3110                         p++;
3111                         break;
3112                     case 'a':
3113                           ender = ASCII_TO_NATIVE('\007');
3114                         p++;
3115                         break;
3116                     case 'x':
3117                         if (*++p == '{') {
3118                             char* e = strchr(p, '}');
3119         
3120                             if (!e) {
3121                                 RExC_parse = p + 1;
3122                                 vFAIL("Missing right brace on \\x{}");
3123                             }
3124                             else {
3125                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3126                                     | PERL_SCAN_DISALLOW_PREFIX;
3127                                 numlen = e - p - 1;
3128                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3129                                 if (ender > 0xff)
3130                                     RExC_utf8 = 1;
3131                                 /* numlen is generous */
3132                                 if (numlen + len >= 127) {
3133                                     p--;
3134                                     goto loopdone;
3135                                 }
3136                                 p = e + 1;
3137                             }
3138                         }
3139                         else {
3140                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3141                             numlen = 2;
3142                             ender = grok_hex(p, &numlen, &flags, NULL);
3143                             p += numlen;
3144                         }
3145                         break;
3146                     case 'c':
3147                         p++;
3148                         ender = UCHARAT(p++);
3149                         ender = toCTRL(ender);
3150                         break;
3151                     case '0': case '1': case '2': case '3':case '4':
3152                     case '5': case '6': case '7': case '8':case '9':
3153                         if (*p == '0' ||
3154                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3155                             I32 flags = 0;
3156                             numlen = 3;
3157                             ender = grok_oct(p, &numlen, &flags, NULL);
3158                             p += numlen;
3159                         }
3160                         else {
3161                             --p;
3162                             goto loopdone;
3163                         }
3164                         break;
3165                     case '\0':
3166                         if (p >= RExC_end)
3167                             FAIL("Trailing \\");
3168                         /* FALL THROUGH */
3169                     default:
3170                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3171                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3172                         goto normal_default;
3173                     }
3174                     break;
3175                 default:
3176                   normal_default:
3177                     if (UTF8_IS_START(*p) && UTF) {
3178                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3179                                                &numlen, 0);
3180                         p += numlen;
3181                     }
3182                     else
3183                         ender = *p++;
3184                     break;
3185                 }
3186                 if (RExC_flags & PMf_EXTENDED)
3187                     p = regwhite(p, RExC_end);
3188                 if (UTF && FOLD) {
3189                     /* Prime the casefolded buffer. */
3190                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3191                 }
3192                 if (ISMULT2(p)) { /* Back off on ?+*. */
3193                     if (len)
3194                         p = oldp;
3195                     else if (UTF) {
3196                          STRLEN unilen;
3197
3198                          if (FOLD) {
3199                               /* Emit all the Unicode characters. */
3200                               for (foldbuf = tmpbuf;
3201                                    foldlen;
3202                                    foldlen -= numlen) {
3203                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3204                                    if (numlen > 0) {
3205                                         reguni(pRExC_state, ender, s, &unilen);
3206                                         s       += unilen;
3207                                         len     += unilen;
3208                                         /* In EBCDIC the numlen
3209                                          * and unilen can differ. */
3210                                         foldbuf += numlen;
3211                                         if (numlen >= foldlen)
3212                                              break;
3213                                    }
3214                                    else
3215                                         break; /* "Can't happen." */
3216                               }
3217                          }
3218                          else {
3219                               reguni(pRExC_state, ender, s, &unilen);
3220                               if (unilen > 0) {
3221                                    s   += unilen;
3222                                    len += unilen;
3223                               }
3224                          }
3225                     }
3226                     else {
3227                         len++;
3228                         REGC((char)ender, s++);
3229                     }
3230                     break;
3231                 }
3232                 if (UTF) {
3233                      STRLEN unilen;
3234
3235                      if (FOLD) {
3236                           /* Emit all the Unicode characters. */
3237                           for (foldbuf = tmpbuf;
3238                                foldlen;
3239                                foldlen -= numlen) {
3240                                ender = utf8_to_uvchr(foldbuf, &numlen);
3241                                if (numlen > 0) {
3242                                     reguni(pRExC_state, ender, s, &unilen);
3243                                     len     += unilen;
3244                                     s       += unilen;
3245                                     /* In EBCDIC the numlen
3246                                      * and unilen can differ. */
3247                                     foldbuf += numlen;
3248                                     if (numlen >= foldlen)
3249                                          break;
3250                                }
3251                                else
3252                                     break;
3253                           }
3254                      }
3255                      else {
3256                           reguni(pRExC_state, ender, s, &unilen);
3257                           if (unilen > 0) {
3258                                s   += unilen;
3259                                len += unilen;
3260                           }
3261                      }
3262                      len--;
3263                 }
3264                 else
3265                     REGC((char)ender, s++);
3266             }
3267         loopdone:
3268             RExC_parse = p - 1;
3269             Set_Node_Cur_Length(ret); /* MJD */
3270             nextchar(pRExC_state);
3271             {
3272                 /* len is STRLEN which is unsigned, need to copy to signed */
3273                 IV iv = len;
3274                 if (iv < 0)
3275                     vFAIL("Internal disaster");
3276             }
3277             if (len > 0)
3278                 *flagp |= HASWIDTH;
3279             if (len == 1)
3280                 *flagp |= SIMPLE;
3281             if (!SIZE_ONLY)
3282                 STR_LEN(ret) = len;
3283             if (SIZE_ONLY)
3284                 RExC_size += STR_SZ(len);
3285             else
3286                 RExC_emit += STR_SZ(len);
3287         }
3288         break;
3289     }
3290
3291     /* If the encoding pragma is in effect recode the text of
3292      * any EXACT-kind nodes. */
3293     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3294         STRLEN oldlen = STR_LEN(ret);
3295         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3296
3297         if (RExC_utf8)
3298             SvUTF8_on(sv);
3299         if (sv_utf8_downgrade(sv, TRUE)) {
3300             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3301             STRLEN newlen = SvCUR(sv);
3302
3303             if (SvUTF8(sv))
3304                 RExC_utf8 = 1;
3305             if (!SIZE_ONLY) {
3306                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3307                                       (int)oldlen, STRING(ret),
3308                                       (int)newlen, s));
3309                 Copy(s, STRING(ret), newlen, char);
3310                 STR_LEN(ret) += newlen - oldlen;
3311                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3312             } else
3313                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3314         }
3315     }
3316
3317     return(ret);
3318 }
3319
3320 STATIC char *
3321 S_regwhite(pTHX_ char *p, char *e)
3322 {
3323     while (p < e) {
3324         if (isSPACE(*p))
3325             ++p;
3326         else if (*p == '#') {
3327             do {
3328                 p++;
3329             } while (p < e && *p != '\n');
3330         }
3331         else
3332             break;
3333     }
3334     return p;
3335 }
3336
3337 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3338    Character classes ([:foo:]) can also be negated ([:^foo:]).
3339    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3340    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3341    but trigger failures because they are currently unimplemented. */
3342
3343 #define POSIXCC_DONE(c)   ((c) == ':')
3344 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3345 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3346
3347 STATIC I32
3348 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3349 {
3350     char *posixcc = 0;
3351     I32 namedclass = OOB_NAMEDCLASS;
3352
3353     if (value == '[' && RExC_parse + 1 < RExC_end &&
3354         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3355         POSIXCC(UCHARAT(RExC_parse))) {
3356         char  c = UCHARAT(RExC_parse);
3357         char* s = RExC_parse++;
3358         
3359         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3360             RExC_parse++;
3361         if (RExC_parse == RExC_end)
3362             /* Grandfather lone [:, [=, [. */
3363             RExC_parse = s;
3364         else {
3365             char* t = RExC_parse++; /* skip over the c */
3366
3367             if (UCHARAT(RExC_parse) == ']') {
3368                 RExC_parse++; /* skip over the ending ] */
3369                 posixcc = s + 1;
3370                 if (*s == ':') {
3371                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3372                     I32 skip = 5; /* the most common skip */
3373
3374                     switch (*posixcc) {
3375                     case 'a':
3376                         if (strnEQ(posixcc, "alnum", 5))
3377                             namedclass =
3378                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3379                         else if (strnEQ(posixcc, "alpha", 5))
3380                             namedclass =
3381                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3382                         else if (strnEQ(posixcc, "ascii", 5))
3383                             namedclass =
3384                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3385                         break;
3386                     case 'b':
3387                         if (strnEQ(posixcc, "blank", 5))
3388                             namedclass =
3389                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3390                         break;
3391                     case 'c':
3392                         if (strnEQ(posixcc, "cntrl", 5))
3393                             namedclass =
3394                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3395                         break;
3396                     case 'd':
3397                         if (strnEQ(posixcc, "digit", 5))
3398                             namedclass =
3399                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3400                         break;
3401                     case 'g':
3402                         if (strnEQ(posixcc, "graph", 5))
3403                             namedclass =
3404                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3405                         break;
3406                     case 'l':
3407                         if (strnEQ(posixcc, "lower", 5))
3408                             namedclass =
3409                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3410                         break;
3411                     case 'p':
3412                         if (strnEQ(posixcc, "print", 5))
3413                             namedclass =
3414                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3415                         else if (strnEQ(posixcc, "punct", 5))
3416                             namedclass =
3417                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3418                         break;
3419                     case 's':
3420                         if (strnEQ(posixcc, "space", 5))
3421                             namedclass =
3422                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3423                         break;
3424                     case 'u':
3425                         if (strnEQ(posixcc, "upper", 5))
3426                             namedclass =
3427                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3428                         break;
3429                     case 'w': /* this is not POSIX, this is the Perl \w */
3430                         if (strnEQ(posixcc, "word", 4)) {
3431                             namedclass =
3432                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3433                             skip = 4;
3434                         }
3435                         break;
3436                     case 'x':
3437                         if (strnEQ(posixcc, "xdigit", 6)) {
3438                             namedclass =
3439                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3440                             skip = 6;
3441                         }
3442                         break;
3443                     }
3444                     if (namedclass == OOB_NAMEDCLASS ||
3445                         posixcc[skip] != ':' ||
3446                         posixcc[skip+1] != ']')
3447                     {
3448                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3449                                       t - s - 1, s + 1);
3450                     }
3451                 } else if (!SIZE_ONLY) {
3452                     /* [[=foo=]] and [[.foo.]] are still future. */
3453
3454                     /* adjust RExC_parse so the warning shows after
3455                        the class closes */
3456                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3457                         RExC_parse++;
3458                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3459                 }
3460             } else {
3461                 /* Maternal grandfather:
3462                  * "[:" ending in ":" but not in ":]" */
3463                 RExC_parse = s;
3464             }
3465         }
3466     }
3467
3468     return namedclass;
3469 }
3470
3471 STATIC void
3472 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3473 {
3474     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3475         char *s = RExC_parse;
3476         char  c = *s++;
3477
3478         while(*s && isALNUM(*s))
3479             s++;
3480         if (*s && c == *s && s[1] == ']') {
3481             if (ckWARN(WARN_REGEXP))
3482                 vWARN3(s+2,
3483                         "POSIX syntax [%c %c] belongs inside character classes",
3484                         c, c);
3485
3486             /* [[=foo=]] and [[.foo.]] are still future. */
3487             if (POSIXCC_NOTYET(c)) {
3488                 /* adjust RExC_parse so the error shows after
3489                    the class closes */
3490                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3491                     ;
3492                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3493             }
3494         }
3495     }
3496 }
3497
3498 STATIC regnode *
3499 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3500 {
3501     register UV value;
3502     register UV nextvalue;
3503     register IV prevvalue = OOB_UNICODE;
3504     register IV range = 0;
3505     register regnode *ret;
3506     STRLEN numlen;
3507     IV namedclass;
3508     char *rangebegin = 0;
3509     bool need_class = 0;
3510     SV *listsv = Nullsv;
3511     register char *e;
3512     UV n;
3513     bool optimize_invert   = TRUE;
3514     AV* unicode_alternate  = 0;
3515 #ifdef EBCDIC
3516     UV literal_endpoint = 0;
3517 #endif
3518
3519     ret = reganode(pRExC_state, ANYOF, 0);
3520
3521     if (!SIZE_ONLY)
3522         ANYOF_FLAGS(ret) = 0;
3523
3524     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3525         RExC_naughty++;
3526         RExC_parse++;
3527         if (!SIZE_ONLY)
3528             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3529     }
3530
3531     if (SIZE_ONLY)
3532         RExC_size += ANYOF_SKIP;
3533     else {
3534         RExC_emit += ANYOF_SKIP;
3535         if (FOLD)
3536             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3537         if (LOC)
3538             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3539         ANYOF_BITMAP_ZERO(ret);
3540         listsv = newSVpvn("# comment\n", 10);
3541     }
3542
3543     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3544
3545     if (!SIZE_ONLY && POSIXCC(nextvalue))
3546         checkposixcc(pRExC_state);
3547
3548     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3549     if (UCHARAT(RExC_parse) == ']')
3550         goto charclassloop;
3551
3552     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3553
3554     charclassloop:
3555
3556         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3557
3558         if (!range)
3559             rangebegin = RExC_parse;
3560         if (UTF) {
3561             value = utf8n_to_uvchr((U8*)RExC_parse,
3562                                    RExC_end - RExC_parse,
3563                                    &numlen, 0);
3564             RExC_parse += numlen;
3565         }
3566         else
3567             value = UCHARAT(RExC_parse++);
3568         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3569         if (value == '[' && POSIXCC(nextvalue))
3570             namedclass = regpposixcc(pRExC_state, value);
3571         else if (value == '\\') {
3572             if (UTF) {
3573                 value = utf8n_to_uvchr((U8*)RExC_parse,
3574                                    RExC_end - RExC_parse,
3575                                    &numlen, 0);
3576                 RExC_parse += numlen;
3577             }
3578             else
3579                 value = UCHARAT(RExC_parse++);
3580             /* Some compilers cannot handle switching on 64-bit integer
3581              * values, therefore value cannot be an UV.  Yes, this will
3582              * be a problem later if we want switch on Unicode.
3583              * A similar issue a little bit later when switching on
3584              * namedclass. --jhi */
3585             switch ((I32)value) {
3586             case 'w':   namedclass = ANYOF_ALNUM;       break;
3587             case 'W':   namedclass = ANYOF_NALNUM;      break;
3588             case 's':   namedclass = ANYOF_SPACE;       break;
3589             case 'S':   namedclass = ANYOF_NSPACE;      break;
3590             case 'd':   namedclass = ANYOF_DIGIT;       break;
3591             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3592             case 'p':
3593             case 'P':
3594                 if (RExC_parse >= RExC_end)
3595                     vFAIL2("Empty \\%c{}", (U8)value);
3596                 if (*RExC_parse == '{') {
3597                     U8 c = (U8)value;
3598                     e = strchr(RExC_parse++, '}');
3599                     if (!e)
3600                         vFAIL2("Missing right brace on \\%c{}", c);
3601                     while (isSPACE(UCHARAT(RExC_parse)))
3602                         RExC_parse++;
3603                     if (e == RExC_parse)
3604                         vFAIL2("Empty \\%c{}", c);
3605                     n = e - RExC_parse;
3606                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3607                         n--;
3608                 }
3609                 else {
3610                     e = RExC_parse;
3611                     n = 1;
3612                 }
3613                 if (!SIZE_ONLY) {
3614                     if (UCHARAT(RExC_parse) == '^') {
3615                          RExC_parse++;
3616                          n--;
3617                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3618                          while (isSPACE(UCHARAT(RExC_parse))) {
3619                               RExC_parse++;
3620                               n--;
3621                          }
3622                     }
3623                     if (value == 'p')
3624                          Perl_sv_catpvf(aTHX_ listsv,
3625                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3626                     else
3627                          Perl_sv_catpvf(aTHX_ listsv,
3628                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3629                 }
3630                 RExC_parse = e + 1;
3631                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3632                 continue;
3633             case 'n':   value = '\n';                   break;
3634             case 'r':   value = '\r';                   break;
3635             case 't':   value = '\t';                   break;
3636             case 'f':   value = '\f';                   break;
3637             case 'b':   value = '\b';                   break;
3638             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3639             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3640             case 'x':
3641                 if (*RExC_parse == '{') {
3642                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3643                         | PERL_SCAN_DISALLOW_PREFIX;
3644                     e = strchr(RExC_parse++, '}');
3645                     if (!e)
3646                         vFAIL("Missing right brace on \\x{}");
3647
3648                     numlen = e - RExC_parse;
3649                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3650                     RExC_parse = e + 1;
3651                 }
3652                 else {
3653                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3654                     numlen = 2;
3655                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3656                     RExC_parse += numlen;
3657                 }
3658                 break;
3659             case 'c':
3660                 value = UCHARAT(RExC_parse++);
3661                 value = toCTRL(value);
3662                 break;
3663             case '0': case '1': case '2': case '3': case '4':
3664             case '5': case '6': case '7': case '8': case '9':
3665             {
3666                 I32 flags = 0;
3667                 numlen = 3;
3668                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3669                 RExC_parse += numlen;
3670                 break;
3671             }
3672             default:
3673                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3674                     vWARN2(RExC_parse,
3675                            "Unrecognized escape \\%c in character class passed through",
3676                            (int)value);
3677                 break;
3678             }
3679         } /* end of \blah */
3680 #ifdef EBCDIC
3681         else
3682             literal_endpoint++;
3683 #endif
3684
3685         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3686
3687             if (!SIZE_ONLY && !need_class)
3688                 ANYOF_CLASS_ZERO(ret);
3689
3690             need_class = 1;
3691
3692             /* a bad range like a-\d, a-[:digit:] ? */
3693             if (range) {
3694                 if (!SIZE_ONLY) {
3695                     if (ckWARN(WARN_REGEXP))
3696                         vWARN4(RExC_parse,
3697                                "False [] range \"%*.*s\"",
3698                                RExC_parse - rangebegin,
3699                                RExC_parse - rangebegin,
3700                                rangebegin);
3701                     if (prevvalue < 256) {
3702                         ANYOF_BITMAP_SET(ret, prevvalue);
3703                         ANYOF_BITMAP_SET(ret, '-');
3704                     }
3705                     else {
3706                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3707                         Perl_sv_catpvf(aTHX_ listsv,
3708                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3709                     }
3710                 }
3711
3712                 range = 0; /* this was not a true range */
3713             }
3714
3715             if (!SIZE_ONLY) {
3716                 if (namedclass > OOB_NAMEDCLASS)
3717                     optimize_invert = FALSE;
3718                 /* Possible truncation here but in some 64-bit environments
3719                  * the compiler gets heartburn about switch on 64-bit values.
3720                  * A similar issue a little earlier when switching on value.
3721                  * --jhi */
3722                 switch ((I32)namedclass) {
3723                 case ANYOF_ALNUM:
3724                     if (LOC)
3725                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3726                     else {
3727                         for (value = 0; value < 256; value++)
3728                             if (isALNUM(value))
3729                                 ANYOF_BITMAP_SET(ret, value);
3730                     }
3731                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3732                     break;
3733                 case ANYOF_NALNUM:
3734                     if (LOC)
3735                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3736                     else {
3737                         for (value = 0; value < 256; value++)
3738                             if (!isALNUM(value))
3739                                 ANYOF_BITMAP_SET(ret, value);
3740                     }
3741                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3742                     break;
3743                 case ANYOF_ALNUMC:
3744                     if (LOC)
3745                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3746                     else {
3747                         for (value = 0; value < 256; value++)
3748                             if (isALNUMC(value))
3749                                 ANYOF_BITMAP_SET(ret, value);
3750                     }
3751                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3752                     break;
3753                 case ANYOF_NALNUMC:
3754                     if (LOC)
3755                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3756                     else {
3757                         for (value = 0; value < 256; value++)
3758                             if (!isALNUMC(value))
3759                                 ANYOF_BITMAP_SET(ret, value);
3760                     }
3761                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3762                     break;
3763                 case ANYOF_ALPHA:
3764                     if (LOC)
3765                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3766                     else {
3767                         for (value = 0; value < 256; value++)
3768                             if (isALPHA(value))
3769                                 ANYOF_BITMAP_SET(ret, value);
3770                     }
3771                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3772                     break;
3773                 case ANYOF_NALPHA:
3774                     if (LOC)
3775                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3776                     else {
3777                         for (value = 0; value < 256; value++)
3778                             if (!isALPHA(value))
3779                                 ANYOF_BITMAP_SET(ret, value);
3780                     }
3781                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3782                     break;
3783                 case ANYOF_ASCII:
3784                     if (LOC)
3785                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3786                     else {
3787 #ifndef EBCDIC
3788                         for (value = 0; value < 128; value++)
3789                             ANYOF_BITMAP_SET(ret, value);
3790 #else  /* EBCDIC */
3791                         for (value = 0; value < 256; value++) {
3792                             if (isASCII(value))
3793                                 ANYOF_BITMAP_SET(ret, value);
3794                         }
3795 #endif /* EBCDIC */
3796                     }
3797                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3798                     break;
3799                 case ANYOF_NASCII:
3800                     if (LOC)
3801                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3802                     else {
3803 #ifndef EBCDIC
3804                         for (value = 128; value < 256; value++)
3805                             ANYOF_BITMAP_SET(ret, value);
3806 #else  /* EBCDIC */
3807                         for (value = 0; value < 256; value++) {
3808                             if (!isASCII(value))
3809                                 ANYOF_BITMAP_SET(ret, value);
3810                         }
3811 #endif /* EBCDIC */
3812                     }
3813                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3814                     break;
3815                 case ANYOF_BLANK:
3816                     if (LOC)
3817                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3818                     else {
3819                         for (value = 0; value < 256; value++)
3820                             if (isBLANK(value))
3821                                 ANYOF_BITMAP_SET(ret, value);
3822                     }
3823                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3824                     break;
3825                 case ANYOF_NBLANK:
3826                     if (LOC)
3827                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3828                     else {
3829                         for (value = 0; value < 256; value++)
3830                             if (!isBLANK(value))
3831                                 ANYOF_BITMAP_SET(ret, value);
3832                     }
3833                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3834                     break;
3835                 case ANYOF_CNTRL:
3836                     if (LOC)
3837                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3838                     else {
3839                         for (value = 0; value < 256; value++)
3840                             if (isCNTRL(value))
3841                                 ANYOF_BITMAP_SET(ret, value);
3842                     }
3843                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3844                     break;
3845                 case ANYOF_NCNTRL:
3846                     if (LOC)
3847                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3848                     else {
3849                         for (value = 0; value < 256; value++)
3850                             if (!isCNTRL(value))
3851                                 ANYOF_BITMAP_SET(ret, value);
3852                     }
3853                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3854                     break;
3855                 case ANYOF_DIGIT:
3856                     if (LOC)
3857                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3858                     else {
3859                         /* consecutive digits assumed */
3860                         for (value = '0'; value <= '9'; value++)
3861                             ANYOF_BITMAP_SET(ret, value);
3862                     }
3863                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3864                     break;
3865                 case ANYOF_NDIGIT:
3866                     if (LOC)
3867                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3868                     else {
3869                         /* consecutive digits assumed */
3870                         for (value = 0; value < '0'; value++)
3871                             ANYOF_BITMAP_SET(ret, value);
3872                         for (value = '9' + 1; value < 256; value++)
3873                             ANYOF_BITMAP_SET(ret, value);
3874                     }
3875                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3876                     break;
3877                 case ANYOF_GRAPH:
3878                     if (LOC)
3879                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3880                     else {
3881                         for (value = 0; value < 256; value++)
3882                             if (isGRAPH(value))
3883                                 ANYOF_BITMAP_SET(ret, value);
3884                     }
3885                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3886                     break;
3887                 case ANYOF_NGRAPH:
3888                     if (LOC)
3889                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3890                     else {
3891                         for (value = 0; value < 256; value++)
3892                             if (!isGRAPH(value))
3893                                 ANYOF_BITMAP_SET(ret, value);
3894                     }
3895                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3896                     break;
3897                 case ANYOF_LOWER:
3898                     if (LOC)
3899                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3900                     else {
3901                         for (value = 0; value < 256; value++)
3902                             if (isLOWER(value))
3903                                 ANYOF_BITMAP_SET(ret, value);
3904                     }
3905                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3906                     break;
3907                 case ANYOF_NLOWER:
3908                     if (LOC)
3909                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3910                     else {
3911                         for (value = 0; value < 256; value++)
3912                             if (!isLOWER(value))
3913                                 ANYOF_BITMAP_SET(ret, value);
3914                     }
3915                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3916                     break;
3917                 case ANYOF_PRINT:
3918                     if (LOC)
3919                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3920                     else {
3921                         for (value = 0; value < 256; value++)
3922                             if (isPRINT(value))
3923                                 ANYOF_BITMAP_SET(ret, value);
3924                     }
3925                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3926                     break;
3927                 case ANYOF_NPRINT:
3928                     if (LOC)
3929                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3930                     else {
3931                         for (value = 0; value < 256; value++)
3932                             if (!isPRINT(value))
3933                                 ANYOF_BITMAP_SET(ret, value);
3934                     }
3935                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3936                     break;
3937                 case ANYOF_PSXSPC:
3938                     if (LOC)
3939                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3940                     else {
3941                         for (value = 0; value < 256; value++)
3942                             if (isPSXSPC(value))
3943                                 ANYOF_BITMAP_SET(ret, value);
3944                     }
3945                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3946                     break;
3947                 case ANYOF_NPSXSPC:
3948                     if (LOC)
3949                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3950                     else {
3951                         for (value = 0; value < 256; value++)
3952                             if (!isPSXSPC(value))
3953                                 ANYOF_BITMAP_SET(ret, value);
3954                     }
3955                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3956                     break;
3957                 case ANYOF_PUNCT:
3958                     if (LOC)
3959                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3960                     else {
3961                         for (value = 0; value < 256; value++)
3962                             if (isPUNCT(value))
3963                                 ANYOF_BITMAP_SET(ret, value);
3964                     }
3965                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3966                     break;
3967                 case ANYOF_NPUNCT:
3968                     if (LOC)
3969                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3970                     else {
3971                         for (value = 0; value < 256; value++)
3972                             if (!isPUNCT(value))
3973                                 ANYOF_BITMAP_SET(ret, value);
3974                     }
3975                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3976                     break;
3977                 case ANYOF_SPACE:
3978                     if (LOC)
3979                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3980                     else {
3981                         for (value = 0; value < 256; value++)
3982                             if (isSPACE(value))
3983                                 ANYOF_BITMAP_SET(ret, value);
3984                     }
3985                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3986                     break;
3987                 case ANYOF_NSPACE:
3988                     if (LOC)
3989                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3990                     else {
3991                         for (value = 0; value < 256; value++)
3992                             if (!isSPACE(value))
3993                                 ANYOF_BITMAP_SET(ret, value);
3994                     }
3995                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3996                     break;
3997                 case ANYOF_UPPER:
3998                     if (LOC)
3999                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4000                     else {
4001                         for (value = 0; value < 256; value++)
4002                             if (isUPPER(value))
4003                                 ANYOF_BITMAP_SET(ret, value);
4004                     }
4005                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4006                     break;
4007                 case ANYOF_NUPPER:
4008                     if (LOC)
4009                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4010                     else {
4011                         for (value = 0; value < 256; value++)
4012                             if (!isUPPER(value))
4013                                 ANYOF_BITMAP_SET(ret, value);
4014                     }
4015                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4016                     break;
4017                 case ANYOF_XDIGIT:
4018                     if (LOC)
4019                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4020                     else {
4021                         for (value = 0; value < 256; value++)
4022                             if (isXDIGIT(value))
4023                                 ANYOF_BITMAP_SET(ret, value);
4024                     }
4025                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4026                     break;
4027                 case ANYOF_NXDIGIT:
4028                     if (LOC)
4029                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4030                     else {
4031                         for (value = 0; value < 256; value++)
4032                             if (!isXDIGIT(value))
4033                                 ANYOF_BITMAP_SET(ret, value);
4034                     }
4035                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4036                     break;
4037                 default:
4038                     vFAIL("Invalid [::] class");
4039                     break;
4040                 }
4041                 if (LOC)
4042                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4043                 continue;
4044             }
4045         } /* end of namedclass \blah */
4046
4047         if (range) {
4048             if (prevvalue > (IV)value) /* b-a */ {
4049                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4050                               RExC_parse - rangebegin,
4051                               RExC_parse - rangebegin,
4052                               rangebegin);
4053                 range = 0; /* not a valid range */
4054             }
4055         }
4056         else {
4057             prevvalue = value; /* save the beginning of the range */
4058             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4059                 RExC_parse[1] != ']') {
4060                 RExC_parse++;
4061
4062                 /* a bad range like \w-, [:word:]- ? */
4063                 if (namedclass > OOB_NAMEDCLASS) {
4064                     if (ckWARN(WARN_REGEXP))
4065                         vWARN4(RExC_parse,
4066                                "False [] range \"%*.*s\"",
4067                                RExC_parse - rangebegin,
4068                                RExC_parse - rangebegin,
4069                                rangebegin);
4070                     if (!SIZE_ONLY)
4071                         ANYOF_BITMAP_SET(ret, '-');
4072                 } else
4073                     range = 1;  /* yeah, it's a range! */
4074                 continue;       /* but do it the next time */
4075             }
4076         }
4077
4078         /* now is the next time */
4079         if (!SIZE_ONLY) {
4080             IV i;
4081
4082             if (prevvalue < 256) {
4083                 IV ceilvalue = value < 256 ? value : 255;
4084
4085 #ifdef EBCDIC
4086                 /* In EBCDIC [\x89-\x91] should include
4087                  * the \x8e but [i-j] should not. */
4088                 if (literal_endpoint == 2 &&
4089                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4090                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4091                 {
4092                     if (isLOWER(prevvalue)) {
4093                         for (i = prevvalue; i <= ceilvalue; i++)
4094                             if (isLOWER(i))
4095                                 ANYOF_BITMAP_SET(ret, i);
4096                     } else {
4097                         for (i = prevvalue; i <= ceilvalue; i++)
4098                             if (isUPPER(i))
4099                                 ANYOF_BITMAP_SET(ret, i);
4100                     }
4101                 }
4102                 else
4103 #endif
4104                       for (i = prevvalue; i <= ceilvalue; i++)
4105                           ANYOF_BITMAP_SET(ret, i);
4106           }
4107           if (value > 255 || UTF) {
4108                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4109                 UV natvalue      = NATIVE_TO_UNI(value);
4110
4111                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4112                 if (prevnatvalue < natvalue) { /* what about > ? */
4113                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4114                                    prevnatvalue, natvalue);
4115                 }
4116                 else if (prevnatvalue == natvalue) {
4117                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4118                     if (FOLD) {
4119                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4120                          STRLEN foldlen;
4121                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4122
4123                          /* If folding and foldable and a single
4124                           * character, insert also the folded version
4125                           * to the charclass. */
4126                          if (f != value) {
4127                               if (foldlen == (STRLEN)UNISKIP(f))
4128                                   Perl_sv_catpvf(aTHX_ listsv,
4129                                                  "%04"UVxf"\n", f);
4130                               else {
4131                                   /* Any multicharacter foldings
4132                                    * require the following transform:
4133                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4134                                    * where E folds into "pq" and F folds
4135                                    * into "rst", all other characters
4136                                    * fold to single characters.  We save
4137                                    * away these multicharacter foldings,
4138                                    * to be later saved as part of the
4139                                    * additional "s" data. */
4140                                   SV *sv;
4141
4142                                   if (!unicode_alternate)
4143                                       unicode_alternate = newAV();
4144                                   sv = newSVpvn((char*)foldbuf, foldlen);
4145                                   SvUTF8_on(sv);
4146                                   av_push(unicode_alternate, sv);
4147                               }
4148                          }
4149
4150                          /* If folding and the value is one of the Greek
4151                           * sigmas insert a few more sigmas to make the
4152                           * folding rules of the sigmas to work right.
4153                           * Note that not all the possible combinations
4154                           * are handled here: some of them are handled
4155                           * by the standard folding rules, and some of
4156                           * them (literal or EXACTF cases) are handled
4157                           * during runtime in regexec.c:S_find_byclass(). */
4158                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4159                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4160                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4161                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4162                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4163                          }
4164                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4165                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4166                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4167                     }
4168                 }
4169             }
4170 #ifdef EBCDIC
4171             literal_endpoint = 0;
4172 #endif
4173         }
4174
4175         range = 0; /* this range (if it was one) is done now */
4176     }
4177
4178     if (need_class) {
4179         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4180         if (SIZE_ONLY)
4181             RExC_size += ANYOF_CLASS_ADD_SKIP;
4182         else
4183             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4184     }
4185
4186     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4187     if (!SIZE_ONLY &&
4188          /* If the only flag is folding (plus possibly inversion). */
4189         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4190        ) {
4191         for (value = 0; value < 256; ++value) {
4192             if (ANYOF_BITMAP_TEST(ret, value)) {
4193                 UV fold = PL_fold[value];
4194
4195                 if (fold != value)
4196                     ANYOF_BITMAP_SET(ret, fold);
4197             }
4198         }
4199         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4200     }
4201
4202     /* optimize inverted simple patterns (e.g. [^a-z]) */
4203     if (!SIZE_ONLY && optimize_invert &&
4204         /* If the only flag is inversion. */
4205         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4206         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4207             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4208         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4209     }
4210
4211     if (!SIZE_ONLY) {
4212         AV *av = newAV();
4213         SV *rv;
4214
4215         /* The 0th element stores the character class description
4216          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4217          * to initialize the appropriate swash (which gets stored in
4218          * the 1st element), and also useful for dumping the regnode.
4219          * The 2nd element stores the multicharacter foldings,
4220          * used later (regexec.c:S_reginclass()). */
4221         av_store(av, 0, listsv);
4222         av_store(av, 1, NULL);
4223         av_store(av, 2, (SV*)unicode_alternate);
4224         rv = newRV_noinc((SV*)av);
4225         n = add_data(pRExC_state, 1, "s");
4226         RExC_rx->data->data[n] = (void*)rv;
4227         ARG_SET(ret, n);
4228     }
4229
4230     return ret;
4231 }
4232
4233 STATIC char*
4234 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4235 {
4236     char* retval = RExC_parse++;
4237
4238     for (;;) {
4239         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4240                 RExC_parse[2] == '#') {
4241             while (*RExC_parse && *RExC_parse != ')')
4242                 RExC_parse++;
4243             RExC_parse++;
4244             continue;
4245         }
4246         if (RExC_flags & PMf_EXTENDED) {
4247             if (isSPACE(*RExC_parse)) {
4248                 RExC_parse++;
4249                 continue;
4250             }
4251             else if (*RExC_parse == '#') {
4252                 while (*RExC_parse && *RExC_parse != '\n')
4253                     RExC_parse++;
4254                 RExC_parse++;
4255                 continue;
4256             }
4257         }
4258         return retval;
4259     }
4260 }
4261
4262 /*
4263 - reg_node - emit a node
4264 */
4265 STATIC regnode *                        /* Location. */
4266 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4267 {
4268     register regnode *ret;
4269     register regnode *ptr;
4270
4271     ret = RExC_emit;
4272     if (SIZE_ONLY) {
4273         SIZE_ALIGN(RExC_size);
4274         RExC_size += 1;
4275         return(ret);
4276     }
4277
4278     NODE_ALIGN_FILL(ret);
4279     ptr = ret;
4280     FILL_ADVANCE_NODE(ptr, op);
4281     if (RExC_offsets) {         /* MJD */
4282         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4283               "reg_node", __LINE__, 
4284               reg_name[op],
4285               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4286               ? "Overwriting end of array!\n" : "OK",
4287               RExC_emit - RExC_emit_start,
4288               RExC_parse - RExC_start,
4289               RExC_offsets[0])); 
4290         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4291     }
4292             
4293     RExC_emit = ptr;
4294
4295     return(ret);
4296 }
4297
4298 /*
4299 - reganode - emit a node with an argument
4300 */
4301 STATIC regnode *                        /* Location. */
4302 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4303 {
4304     register regnode *ret;
4305     register regnode *ptr;
4306
4307     ret = RExC_emit;
4308     if (SIZE_ONLY) {
4309         SIZE_ALIGN(RExC_size);
4310         RExC_size += 2;
4311         return(ret);
4312     }
4313
4314     NODE_ALIGN_FILL(ret);
4315     ptr = ret;
4316     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4317     if (RExC_offsets) {         /* MJD */
4318         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4319               "reganode",
4320               __LINE__,
4321               reg_name[op],
4322               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4323               "Overwriting end of array!\n" : "OK",
4324               RExC_emit - RExC_emit_start,
4325               RExC_parse - RExC_start,
4326               RExC_offsets[0])); 
4327         Set_Cur_Node_Offset;
4328     }
4329             
4330     RExC_emit = ptr;
4331
4332     return(ret);
4333 }
4334
4335 /*
4336 - reguni - emit (if appropriate) a Unicode character
4337 */
4338 STATIC void
4339 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4340 {
4341     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4342 }
4343
4344 /*
4345 - reginsert - insert an operator in front of already-emitted operand
4346 *
4347 * Means relocating the operand.
4348 */
4349 STATIC void
4350 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4351 {
4352     register regnode *src;
4353     register regnode *dst;
4354     register regnode *place;
4355     register int offset = regarglen[(U8)op];
4356
4357 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4358
4359     if (SIZE_ONLY) {
4360         RExC_size += NODE_STEP_REGNODE + offset;
4361         return;
4362     }
4363
4364     src = RExC_emit;
4365     RExC_emit += NODE_STEP_REGNODE + offset;
4366     dst = RExC_emit;
4367     while (src > opnd) {
4368         StructCopy(--src, --dst, regnode);
4369         if (RExC_offsets) {     /* MJD 20010112 */
4370             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4371                   "reg_insert",
4372                   __LINE__,
4373                   reg_name[op],
4374                   dst - RExC_emit_start > RExC_offsets[0] 
4375                   ? "Overwriting end of array!\n" : "OK",
4376                   src - RExC_emit_start,
4377                   dst - RExC_emit_start,
4378                   RExC_offsets[0])); 
4379             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4380             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4381         }
4382     }
4383     
4384
4385     place = opnd;               /* Op node, where operand used to be. */
4386     if (RExC_offsets) {         /* MJD */
4387         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4388               "reginsert",
4389               __LINE__,
4390               reg_name[op],
4391               place - RExC_emit_start > RExC_offsets[0] 
4392               ? "Overwriting end of array!\n" : "OK",
4393               place - RExC_emit_start,
4394               RExC_parse - RExC_start,
4395               RExC_offsets[0])); 
4396         Set_Node_Offset(place, RExC_parse);
4397     }
4398     src = NEXTOPER(place);
4399     FILL_ADVANCE_NODE(place, op);
4400     Zero(src, offset, regnode);
4401 }
4402
4403 /*
4404 - regtail - set the next-pointer at the end of a node chain of p to val.
4405 */
4406 STATIC void
4407 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4408 {
4409     register regnode *scan;
4410     register regnode *temp;
4411
4412     if (SIZE_ONLY)
4413         return;
4414
4415     /* Find last node. */
4416     scan = p;
4417     for (;;) {
4418         temp = regnext(scan);
4419         if (temp == NULL)
4420             break;
4421         scan = temp;
4422     }
4423
4424     if (reg_off_by_arg[OP(scan)]) {
4425         ARG_SET(scan, val - scan);
4426     }
4427     else {
4428         NEXT_OFF(scan) = val - scan;
4429     }
4430 }
4431
4432 /*
4433 - regoptail - regtail on operand of first argument; nop if operandless
4434 */
4435 STATIC void
4436 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4437 {
4438     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4439     if (p == NULL || SIZE_ONLY)
4440         return;
4441     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4442         regtail(pRExC_state, NEXTOPER(p), val);
4443     }
4444     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4445         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4446     }
4447     else
4448         return;
4449 }
4450
4451 /*
4452  - regcurly - a little FSA that accepts {\d+,?\d*}
4453  */
4454 STATIC I32
4455 S_regcurly(pTHX_ register char *s)
4456 {
4457     if (*s++ != '{')
4458         return FALSE;
4459     if (!isDIGIT(*s))
4460         return FALSE;
4461     while (isDIGIT(*s))
4462         s++;
4463     if (*s == ',')
4464         s++;
4465     while (isDIGIT(*s))
4466         s++;
4467     if (*s != '}')
4468         return FALSE;
4469     return TRUE;
4470 }
4471
4472
4473 #ifdef DEBUGGING
4474
4475 STATIC regnode *
4476 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4477 {
4478     register U8 op = EXACT;     /* Arbitrary non-END op. */
4479     register regnode *next;
4480
4481     while (op != END && (!last || node < last)) {
4482         /* While that wasn't END last time... */
4483
4484         NODE_ALIGN(node);
4485         op = OP(node);
4486         if (op == CLOSE)
4487             l--;        
4488         next = regnext(node);
4489         /* Where, what. */
4490         if (OP(node) == OPTIMIZED)
4491             goto after_print;
4492         regprop(sv, node);
4493         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4494                       (int)(2*l + 1), "", SvPVX(sv));
4495         if (next == NULL)               /* Next ptr. */
4496             PerlIO_printf(Perl_debug_log, "(0)");
4497         else
4498             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4499         (void)PerlIO_putc(Perl_debug_log, '\n');
4500       after_print:
4501         if (PL_regkind[(U8)op] == BRANCHJ) {
4502             register regnode *nnode = (OP(next) == LONGJMP
4503                                        ? regnext(next)
4504                                        : next);
4505             if (last && nnode > last)
4506                 nnode = last;
4507             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4508         }
4509         else if (PL_regkind[(U8)op] == BRANCH) {
4510             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4511         }
4512         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4513             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4514                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4515         }
4516         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4517             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4518                              next, sv, l + 1);
4519         }
4520         else if ( op == PLUS || op == STAR) {
4521             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4522         }
4523         else if (op == ANYOF) {
4524             /* arglen 1 + class block */
4525             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4526                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4527             node = NEXTOPER(node);
4528         }
4529         else if (PL_regkind[(U8)op] == EXACT) {
4530             /* Literal string, where present. */
4531             node += NODE_SZ_STR(node) - 1;
4532             node = NEXTOPER(node);
4533         }
4534         else {
4535             node = NEXTOPER(node);
4536             node += regarglen[(U8)op];
4537         }
4538         if (op == CURLYX || op == OPEN)
4539             l++;
4540         else if (op == WHILEM)
4541             l--;
4542     }
4543     return node;
4544 }
4545
4546 #endif  /* DEBUGGING */
4547
4548 /*
4549  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4550  */
4551 void
4552 Perl_regdump(pTHX_ regexp *r)
4553 {
4554 #ifdef DEBUGGING
4555     SV *sv = sv_newmortal();
4556
4557     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4558
4559     /* Header fields of interest. */
4560     if (r->anchored_substr)
4561         PerlIO_printf(Perl_debug_log,
4562                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4563                       PL_colors[0],
4564                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4565                       SvPVX(r->anchored_substr),
4566                       PL_colors[1],
4567                       SvTAIL(r->anchored_substr) ? "$" : "",
4568                       (IV)r->anchored_offset);
4569     else if (r->anchored_utf8)
4570         PerlIO_printf(Perl_debug_log,
4571                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4572                       PL_colors[0],
4573                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4574                       SvPVX(r->anchored_utf8),
4575                       PL_colors[1],
4576                       SvTAIL(r->anchored_utf8) ? "$" : "",
4577                       (IV)r->anchored_offset);
4578     if (r->float_substr)
4579         PerlIO_printf(Perl_debug_log,
4580                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4581                       PL_colors[0],
4582                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4583                       SvPVX(r->float_substr),
4584                       PL_colors[1],
4585                       SvTAIL(r->float_substr) ? "$" : "",
4586                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4587     else if (r->float_utf8)
4588         PerlIO_printf(Perl_debug_log,
4589                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4590                       PL_colors[0],
4591                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4592                       SvPVX(r->float_utf8),
4593                       PL_colors[1],
4594                       SvTAIL(r->float_utf8) ? "$" : "",
4595                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4596     if (r->check_substr || r->check_utf8)
4597         PerlIO_printf(Perl_debug_log,
4598                       r->check_substr == r->float_substr
4599                       && r->check_utf8 == r->float_utf8
4600                       ? "(checking floating" : "(checking anchored");
4601     if (r->reganch & ROPT_NOSCAN)
4602         PerlIO_printf(Perl_debug_log, " noscan");
4603     if (r->reganch & ROPT_CHECK_ALL)
4604         PerlIO_printf(Perl_debug_log, " isall");
4605     if (r->check_substr || r->check_utf8)
4606         PerlIO_printf(Perl_debug_log, ") ");
4607
4608     if (r->regstclass) {
4609         regprop(sv, r->regstclass);
4610         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4611     }
4612     if (r->reganch & ROPT_ANCH) {
4613         PerlIO_printf(Perl_debug_log, "anchored");
4614         if (r->reganch & ROPT_ANCH_BOL)
4615             PerlIO_printf(Perl_debug_log, "(BOL)");
4616         if (r->reganch & ROPT_ANCH_MBOL)
4617             PerlIO_printf(Perl_debug_log, "(MBOL)");
4618         if (r->reganch & ROPT_ANCH_SBOL)
4619             PerlIO_printf(Perl_debug_log, "(SBOL)");
4620         if (r->reganch & ROPT_ANCH_GPOS)
4621             PerlIO_printf(Perl_debug_log, "(GPOS)");
4622         PerlIO_putc(Perl_debug_log, ' ');
4623     }
4624     if (r->reganch & ROPT_GPOS_SEEN)
4625         PerlIO_printf(Perl_debug_log, "GPOS ");
4626     if (r->reganch & ROPT_SKIP)
4627         PerlIO_printf(Perl_debug_log, "plus ");
4628     if (r->reganch & ROPT_IMPLICIT)
4629         PerlIO_printf(Perl_debug_log, "implicit ");
4630     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4631     if (r->reganch & ROPT_EVAL_SEEN)
4632         PerlIO_printf(Perl_debug_log, "with eval ");
4633     PerlIO_printf(Perl_debug_log, "\n");
4634     if (r->offsets) {
4635       U32 i;
4636       U32 len = r->offsets[0];
4637       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4638       for (i = 1; i <= len; i++)
4639         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4640                       (UV)r->offsets[i*2-1], 
4641                       (UV)r->offsets[i*2]);
4642       PerlIO_printf(Perl_debug_log, "\n");
4643     }
4644 #endif  /* DEBUGGING */
4645 }
4646
4647 #ifdef DEBUGGING
4648
4649 STATIC void
4650 S_put_byte(pTHX_ SV *sv, int c)
4651 {
4652     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4653         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4654     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4655         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4656     else
4657         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4658 }
4659
4660 #endif  /* DEBUGGING */
4661
4662 /*
4663 - regprop - printable representation of opcode
4664 */
4665 void
4666 Perl_regprop(pTHX_ SV *sv, regnode *o)
4667 {
4668 #ifdef DEBUGGING
4669     register int k;
4670
4671     sv_setpvn(sv, "", 0);
4672     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4673         /* It would be nice to FAIL() here, but this may be called from
4674            regexec.c, and it would be hard to supply pRExC_state. */
4675         Perl_croak(aTHX_ "Corrupted regexp opcode");
4676     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4677
4678     k = PL_regkind[(U8)OP(o)];
4679
4680     if (k == EXACT) {
4681         SV *dsv = sv_2mortal(newSVpvn("", 0));
4682         /* Using is_utf8_string() is a crude hack but it may
4683          * be the best for now since we have no flag "this EXACTish
4684          * node was UTF-8" --jhi */
4685         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4686         char *s    = do_utf8 ?
4687           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4688                          UNI_DISPLAY_REGEX) :
4689           STRING(o);
4690         int len = do_utf8 ?
4691           strlen(s) :
4692           STR_LEN(o);
4693         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4694                        PL_colors[0],
4695                        len, s,
4696                        PL_colors[1]);
4697     }
4698     else if (k == CURLY) {
4699         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4700             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4701         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4702     }
4703     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4704         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4705     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4706         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4707     else if (k == LOGICAL)
4708         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4709     else if (k == ANYOF) {
4710         int i, rangestart = -1;
4711         U8 flags = ANYOF_FLAGS(o);
4712         const char * const anyofs[] = { /* Should be syncronized with
4713                                          * ANYOF_ #xdefines in regcomp.h */
4714             "\\w",
4715             "\\W",
4716             "\\s",
4717             "\\S",
4718             "\\d",
4719             "\\D",
4720             "[:alnum:]",
4721             "[:^alnum:]",
4722             "[:alpha:]",
4723             "[:^alpha:]",
4724             "[:ascii:]",
4725             "[:^ascii:]",
4726             "[:ctrl:]",
4727             "[:^ctrl:]",
4728             "[:graph:]",
4729             "[:^graph:]",
4730             "[:lower:]",
4731             "[:^lower:]",
4732             "[:print:]",
4733             "[:^print:]",
4734             "[:punct:]",
4735             "[:^punct:]",
4736             "[:upper:]",
4737             "[:^upper:]",
4738             "[:xdigit:]",
4739             "[:^xdigit:]",
4740             "[:space:]",
4741             "[:^space:]",
4742             "[:blank:]",
4743             "[:^blank:]"
4744         };
4745
4746         if (flags & ANYOF_LOCALE)
4747             sv_catpv(sv, "{loc}");
4748         if (flags & ANYOF_FOLD)
4749             sv_catpv(sv, "{i}");
4750         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4751         if (flags & ANYOF_INVERT)
4752             sv_catpv(sv, "^");
4753         for (i = 0; i <= 256; i++) {
4754             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4755                 if (rangestart == -1)
4756                     rangestart = i;
4757             } else if (rangestart != -1) {
4758                 if (i <= rangestart + 3)
4759                     for (; rangestart < i; rangestart++)
4760                         put_byte(sv, rangestart);
4761                 else {
4762                     put_byte(sv, rangestart);
4763                     sv_catpv(sv, "-");
4764                     put_byte(sv, i - 1);
4765                 }
4766                 rangestart = -1;
4767             }
4768         }
4769
4770         if (o->flags & ANYOF_CLASS)
4771             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4772                 if (ANYOF_CLASS_TEST(o,i))
4773                     sv_catpv(sv, anyofs[i]);
4774
4775         if (flags & ANYOF_UNICODE)
4776             sv_catpv(sv, "{unicode}");
4777         else if (flags & ANYOF_UNICODE_ALL)
4778             sv_catpv(sv, "{unicode_all}");
4779
4780         {
4781             SV *lv;
4782             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4783         
4784             if (lv) {
4785                 if (sw) {
4786                     U8 s[UTF8_MAXLEN+1];
4787                 
4788                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4789                         U8 *e = uvchr_to_utf8(s, i);
4790                         
4791                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4792                             if (rangestart == -1)
4793                                 rangestart = i;
4794                         } else if (rangestart != -1) {
4795                             U8 *p;
4796                         
4797                             if (i <= rangestart + 3)
4798                                 for (; rangestart < i; rangestart++) {
4799                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4800                                         put_byte(sv, *p);
4801                                 }
4802                             else {
4803                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4804                                     put_byte(sv, *p);
4805                                 sv_catpv(sv, "-");
4806                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4807                                         put_byte(sv, *p);
4808                                 }
4809                                 rangestart = -1;
4810                             }
4811                         }
4812                         
4813                     sv_catpv(sv, "..."); /* et cetera */
4814                 }
4815
4816                 {
4817                     char *s = savepv(SvPVX(lv));
4818                     char *origs = s;
4819                 
4820                     while(*s && *s != '\n') s++;
4821                 
4822                     if (*s == '\n') {
4823                         char *t = ++s;
4824                         
4825                         while (*s) {
4826                             if (*s == '\n')
4827                                 *s = ' ';
4828                             s++;
4829                         }
4830                         if (s[-1] == ' ')
4831                             s[-1] = 0;
4832                         
4833                         sv_catpv(sv, t);
4834                     }
4835                 
4836                     Safefree(origs);
4837                 }
4838             }
4839         }
4840
4841         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4842     }
4843     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4844         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4845 #endif  /* DEBUGGING */
4846 }
4847
4848 SV *
4849 Perl_re_intuit_string(pTHX_ regexp *prog)
4850 {                               /* Assume that RE_INTUIT is set */
4851     DEBUG_r(
4852         {   STRLEN n_a;
4853             char *s = SvPV(prog->check_substr
4854                       ? prog->check_substr : prog->check_utf8, n_a);
4855
4856             if (!PL_colorset) reginitcolors();
4857             PerlIO_printf(Perl_debug_log,
4858                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4859                       PL_colors[4],
4860                       prog->check_substr ? "" : "utf8 ",
4861                       PL_colors[5],PL_colors[0],
4862                       s,
4863                       PL_colors[1],
4864                       (strlen(s) > 60 ? "..." : ""));
4865         } );
4866
4867     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4868 }
4869
4870 void
4871 Perl_pregfree(pTHX_ struct regexp *r)
4872 {
4873 #ifdef DEBUGGING
4874     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4875 #endif
4876
4877     if (!r || (--r->refcnt > 0))
4878         return;
4879     DEBUG_r({
4880          int len;
4881          char *s;
4882
4883          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4884                 r->prelen, 60, UNI_DISPLAY_REGEX)
4885             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4886          len = SvCUR(dsv);
4887          if (!PL_colorset)
4888               reginitcolors();
4889          PerlIO_printf(Perl_debug_log,
4890                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4891                        PL_colors[4],PL_colors[5],PL_colors[0],
4892                        len, len, s,
4893                        PL_colors[1],
4894                        len > 60 ? "..." : "");
4895     });
4896
4897     if (r->precomp)
4898         Safefree(r->precomp);
4899     if (r->offsets)             /* 20010421 MJD */
4900         Safefree(r->offsets);
4901     if (RX_MATCH_COPIED(r))
4902         Safefree(r->subbeg);
4903     if (r->substrs) {
4904         if (r->anchored_substr)
4905             SvREFCNT_dec(r->anchored_substr);
4906         if (r->anchored_utf8)
4907             SvREFCNT_dec(r->anchored_utf8);
4908         if (r->float_substr)
4909             SvREFCNT_dec(r->float_substr);
4910         if (r->float_utf8)
4911             SvREFCNT_dec(r->float_utf8);
4912         Safefree(r->substrs);
4913     }
4914     if (r->data) {
4915         int n = r->data->count;
4916         PAD* new_comppad = NULL;
4917         PAD* old_comppad;
4918
4919         while (--n >= 0) {
4920           /* If you add a ->what type here, update the comment in regcomp.h */
4921             switch (r->data->what[n]) {
4922             case 's':
4923                 SvREFCNT_dec((SV*)r->data->data[n]);
4924                 break;
4925             case 'f':
4926                 Safefree(r->data->data[n]);
4927                 break;
4928             case 'p':
4929                 new_comppad = (AV*)r->data->data[n];
4930                 break;
4931             case 'o':
4932                 if (new_comppad == NULL)
4933                     Perl_croak(aTHX_ "panic: pregfree comppad");
4934                 PAD_SAVE_LOCAL(old_comppad,
4935                     /* Watch out for global destruction's random ordering. */
4936                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4937                                 new_comppad : Null(PAD *)
4938                 );
4939                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4940                     op_free((OP_4tree*)r->data->data[n]);
4941                 }
4942
4943                 PAD_RESTORE_LOCAL(old_comppad);
4944                 SvREFCNT_dec((SV*)new_comppad);
4945                 new_comppad = NULL;
4946                 break;
4947             case 'n':
4948                 break;
4949             default:
4950                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4951             }
4952         }
4953         Safefree(r->data->what);
4954         Safefree(r->data);
4955     }
4956     Safefree(r->startp);
4957     Safefree(r->endp);
4958     Safefree(r);
4959 }
4960
4961 /*
4962  - regnext - dig the "next" pointer out of a node
4963  *
4964  * [Note, when REGALIGN is defined there are two places in regmatch()
4965  * that bypass this code for speed.]
4966  */
4967 regnode *
4968 Perl_regnext(pTHX_ register regnode *p)
4969 {
4970     register I32 offset;
4971
4972     if (p == &PL_regdummy)
4973         return(NULL);
4974
4975     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4976     if (offset == 0)
4977         return(NULL);
4978
4979     return(p+offset);
4980 }
4981
4982 STATIC void     
4983 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4984 {
4985     va_list args;
4986     STRLEN l1 = strlen(pat1);
4987     STRLEN l2 = strlen(pat2);
4988     char buf[512];
4989     SV *msv;
4990     char *message;
4991
4992     if (l1 > 510)
4993         l1 = 510;
4994     if (l1 + l2 > 510)
4995         l2 = 510 - l1;
4996     Copy(pat1, buf, l1 , char);
4997     Copy(pat2, buf + l1, l2 , char);
4998     buf[l1 + l2] = '\n';
4999     buf[l1 + l2 + 1] = '\0';
5000 #ifdef I_STDARG
5001     /* ANSI variant takes additional second argument */
5002     va_start(args, pat2);
5003 #else
5004     va_start(args);
5005 #endif
5006     msv = vmess(buf, &args);
5007     va_end(args);
5008     message = SvPV(msv,l1);
5009     if (l1 > 512)
5010         l1 = 512;
5011     Copy(message, buf, l1 , char);
5012     buf[l1] = '\0';                     /* Overwrite \n */
5013     Perl_croak(aTHX_ "%s", buf);
5014 }
5015
5016 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5017
5018 void
5019 Perl_save_re_context(pTHX)
5020 {
5021 #if 0
5022     SAVEPPTR(RExC_precomp);             /* uncompiled string. */
5023     SAVEI32(RExC_npar);         /* () count. */
5024     SAVEI32(RExC_size);         /* Code size. */
5025     SAVEI32(RExC_flags);                /* are we folding, multilining? */
5026     SAVEVPTR(RExC_rx);          /* from regcomp.c */
5027     SAVEI32(RExC_seen);         /* from regcomp.c */
5028     SAVEI32(RExC_sawback);              /* Did we see \1, ...? */
5029     SAVEI32(RExC_naughty);              /* How bad is this pattern? */
5030     SAVEVPTR(RExC_emit);                /* Code-emit pointer; &regdummy = don't */
5031     SAVEPPTR(RExC_end);         /* End of input for compile */
5032     SAVEPPTR(RExC_parse);               /* Input-scan pointer. */
5033 #endif
5034
5035     SAVEI32(PL_reg_flags);              /* from regexec.c */
5036     SAVEPPTR(PL_bostr);
5037     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5038     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5039     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5040     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5041     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5042     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5043     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5044     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5045     PL_reg_start_tmp = 0;
5046     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5047     PL_reg_start_tmpl = 0;
5048     SAVEVPTR(PL_regdata);
5049     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5050     SAVEI32(PL_regnarrate);             /* from regexec.c */
5051     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5052     SAVEINT(PL_regindent);              /* from regexec.c */
5053     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5054     SAVEVPTR(PL_curcop);
5055     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5056     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5057     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5058     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5059     SAVEI8(PL_reg_match_utf8);          /* from regexec.c */
5060     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5061     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5062     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5063     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5064     SAVEI32(PL_regnpar);                /* () count. */
5065     SAVEI32(PL_regsize);                /* from regexec.c */
5066 #ifdef DEBUGGING
5067     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5068 #endif
5069 }
5070
5071 static void
5072 clear_re(pTHX_ void *r)
5073 {
5074     ReREFCNT_dec((regexp *)r);
5075 }
5076