Slightly terser code in S_regpposixcc (names inside [:*here*:])
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* This file contains functions for compiling a regular expression.  See
9  * also regexec.c which funnily enough, contains functions for executing
10  * a regular expression.
11  *
12  * This file is also copied at build time to ext/re/re_comp.c, where
13  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14  * This causes the main functions to be compiled under new names and with
15  * debugging support added, which makes "use re 'debug'" work.
16  */
17
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19  * confused with the original package (see point 3 below).  Thanks, Henry!
20  */
21
22 /* Additional note: this code is very heavily munged from Henry's version
23  * in places.  In some spots I've traded clarity for efficiency, so don't
24  * blame Henry for some of the lack of readability.
25  */
26
27 /* The names of the functions have been changed from regcomp and
28  * regexec to  pregcomp and pregexec in order to avoid conflicts
29  * with the POSIX routines of the same names.
30 */
31
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 #  ifndef PERL_IN_XSUB_RE
35 #    define PERL_IN_XSUB_RE
36 #  endif
37 /* need access to debugger hooks */
38 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
39 #    define DEBUGGING
40 #  endif
41 #endif
42
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 #  define Perl_pregcomp my_regcomp
46 #  define Perl_regdump my_regdump
47 #  define Perl_regprop my_regprop
48 #  define Perl_pregfree my_regfree
49 #  define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 #  define Perl_regnext my_regnext
52 #  define Perl_save_re_context my_save_re_context
53 #  define Perl_reginitcolors my_reginitcolors
54
55 #  define PERL_NO_GET_CONTEXT
56 #endif
57
58 /*SUPPRESS 112*/
59 /*
60  * pregcomp and pregexec -- regsub and regerror are not used in perl
61  *
62  *      Copyright (c) 1986 by University of Toronto.
63  *      Written by Henry Spencer.  Not derived from licensed software.
64  *
65  *      Permission is granted to anyone to use this software for any
66  *      purpose on any computer system, and to redistribute it freely,
67  *      subject to the following restrictions:
68  *
69  *      1. The author is not responsible for the consequences of use of
70  *              this software, no matter how awful, even if they arise
71  *              from defects in it.
72  *
73  *      2. The origin of this software must not be misrepresented, either
74  *              by explicit claim or by omission.
75  *
76  *      3. Altered versions must be plainly marked as such, and must not
77  *              be misrepresented as being the original software.
78  *
79  *
80  ****    Alterations to Henry's code are...
81  ****
82  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83  ****    2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84  ****
85  ****    You may distribute under the terms of either the GNU General Public
86  ****    License or the Artistic License, as specified in the README file.
87
88  *
89  * Beware that some of this code is subtly aware of the way operator
90  * precedence is structured in regular expressions.  Serious changes in
91  * regular-expression syntax might require a total rethink.
92  */
93 #include "EXTERN.h"
94 #define PERL_IN_REGCOMP_C
95 #include "perl.h"
96
97 #ifndef PERL_IN_XSUB_RE
98 #  include "INTERN.h"
99 #endif
100
101 #define REG_COMP_C
102 #include "regcomp.h"
103
104 #ifdef op
105 #undef op
106 #endif /* op */
107
108 #ifdef MSDOS
109 #  if defined(BUGGY_MSC6)
110  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 #    pragma optimize("a",off)
112  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 #    pragma optimize("w",on )
114 #  endif /* BUGGY_MSC6 */
115 #endif /* MSDOS */
116
117 #ifndef STATIC
118 #define STATIC  static
119 #endif
120
121 typedef struct RExC_state_t {
122     U32         flags;                  /* are we folding, multilining? */
123     char        *precomp;               /* uncompiled string. */
124     regexp      *rx;
125     char        *start;                 /* Start of input for compile */
126     char        *end;                   /* End of input for compile */
127     char        *parse;                 /* Input-scan pointer. */
128     I32         whilem_seen;            /* number of WHILEM in this expr */
129     regnode     *emit_start;            /* Start of emitted-code area */
130     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
131     I32         naughty;                /* How bad is this pattern? */
132     I32         sawback;                /* Did we see \1, ...? */
133     U32         seen;
134     I32         size;                   /* Code size. */
135     I32         npar;                   /* () count. */
136     I32         extralen;
137     I32         seen_zerolen;
138     I32         seen_evals;
139     I32         utf8;
140 #if ADD_TO_REGEXEC
141     char        *starttry;              /* -Dr: where regtry was called. */
142 #define RExC_starttry   (pRExC_state->starttry)
143 #endif
144 } RExC_state_t;
145
146 #define RExC_flags      (pRExC_state->flags)
147 #define RExC_precomp    (pRExC_state->precomp)
148 #define RExC_rx         (pRExC_state->rx)
149 #define RExC_start      (pRExC_state->start)
150 #define RExC_end        (pRExC_state->end)
151 #define RExC_parse      (pRExC_state->parse)
152 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
153 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit       (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty    (pRExC_state->naughty)
157 #define RExC_sawback    (pRExC_state->sawback)
158 #define RExC_seen       (pRExC_state->seen)
159 #define RExC_size       (pRExC_state->size)
160 #define RExC_npar       (pRExC_state->npar)
161 #define RExC_extralen   (pRExC_state->extralen)
162 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8       (pRExC_state->utf8)
165
166 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168         ((*s) == '{' && regcurly(s)))
169
170 #ifdef SPSTART
171 #undef SPSTART          /* dratted cpp namespace... */
172 #endif
173 /*
174  * Flags to be passed up and down.
175  */
176 #define WORST           0       /* Worst case. */
177 #define HASWIDTH        0x1     /* Known to match non-null strings. */
178 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART         0x4     /* Starts with * or +. */
180 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
181
182 /* Length of a variant. */
183
184 typedef struct scan_data_t {
185     I32 len_min;
186     I32 len_delta;
187     I32 pos_min;
188     I32 pos_delta;
189     SV *last_found;
190     I32 last_end;                       /* min value, <0 unless valid. */
191     I32 last_start_min;
192     I32 last_start_max;
193     SV **longest;                       /* Either &l_fixed, or &l_float. */
194     SV *longest_fixed;
195     I32 offset_fixed;
196     SV *longest_float;
197     I32 offset_float_min;
198     I32 offset_float_max;
199     I32 flags;
200     I32 whilem_c;
201     I32 *last_closep;
202     struct regnode_charclass_class *start_class;
203 } scan_data_t;
204
205 /*
206  * Forward declarations for pregcomp()'s friends.
207  */
208
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210                                       0, 0, 0, 0, 0, 0};
211
212 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL          0x1
214 #define SF_BEFORE_MEOL          0x2
215 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
218 #ifdef NO_UNARY_PLUS
219 #  define SF_FIX_SHIFT_EOL      (0+2)
220 #  define SF_FL_SHIFT_EOL               (0+4)
221 #else
222 #  define SF_FIX_SHIFT_EOL      (+2)
223 #  define SF_FL_SHIFT_EOL               (+4)
224 #endif
225
226 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF               0x40
232 #define SF_HAS_PAR              0x80
233 #define SF_IN_PAR               0x100
234 #define SF_HAS_EVAL             0x200
235 #define SCF_DO_SUBSTR           0x400
236 #define SCF_DO_STCLASS_AND      0x0800
237 #define SCF_DO_STCLASS_OR       0x1000
238 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS  0x2000
240
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244
245 #define OOB_UNICODE             12345678
246 #define OOB_NAMEDCLASS          -1
247
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
251
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
254
255 /*
256  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258  * op/pragma/warn/regcomp.
259  */
260 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
262
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
264
265 /*
266  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267  * arg. Show regex, up to a maximum length. If it's too long, chop and add
268  * "...".
269  */
270 #define FAIL(msg) STMT_START {                                          \
271     char *ellipses = "";                                                \
272     IV len = RExC_end - RExC_precomp;                                   \
273                                                                         \
274     if (!SIZE_ONLY)                                                     \
275         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
276     if (len > RegexLengthToShowInErrorMessages) {                       \
277         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
278         len = RegexLengthToShowInErrorMessages - 10;                    \
279         ellipses = "...";                                               \
280     }                                                                   \
281     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
282             msg, (int)len, RExC_precomp, ellipses);                     \
283 } STMT_END
284
285 /*
286  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287  * args. Show regex, up to a maximum length. If it's too long, chop and add
288  * "...".
289  */
290 #define FAIL2(pat,msg) STMT_START {                                     \
291     char *ellipses = "";                                                \
292     IV len = RExC_end - RExC_precomp;                                   \
293                                                                         \
294     if (!SIZE_ONLY)                                                     \
295         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
296     if (len > RegexLengthToShowInErrorMessages) {                       \
297         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
298         len = RegexLengthToShowInErrorMessages - 10;                    \
299         ellipses = "...";                                               \
300     }                                                                   \
301     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
302             msg, (int)len, RExC_precomp, ellipses);                     \
303 } STMT_END
304
305
306 /*
307  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308  */
309 #define Simple_vFAIL(m) STMT_START {                                    \
310     IV offset = RExC_parse - RExC_precomp;                              \
311     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
312             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
313 } STMT_END
314
315 /*
316  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317  */
318 #define vFAIL(m) STMT_START {                           \
319     if (!SIZE_ONLY)                                     \
320         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
321     Simple_vFAIL(m);                                    \
322 } STMT_END
323
324 /*
325  * Like Simple_vFAIL(), but accepts two arguments.
326  */
327 #define Simple_vFAIL2(m,a1) STMT_START {                        \
328     IV offset = RExC_parse - RExC_precomp;                      \
329     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
330             (int)offset, RExC_precomp, RExC_precomp + offset);  \
331 } STMT_END
332
333 /*
334  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335  */
336 #define vFAIL2(m,a1) STMT_START {                       \
337     if (!SIZE_ONLY)                                     \
338         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
339     Simple_vFAIL2(m, a1);                               \
340 } STMT_END
341
342
343 /*
344  * Like Simple_vFAIL(), but accepts three arguments.
345  */
346 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
347     IV offset = RExC_parse - RExC_precomp;                      \
348     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
349             (int)offset, RExC_precomp, RExC_precomp + offset);  \
350 } STMT_END
351
352 /*
353  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354  */
355 #define vFAIL3(m,a1,a2) STMT_START {                    \
356     if (!SIZE_ONLY)                                     \
357         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
358     Simple_vFAIL3(m, a1, a2);                           \
359 } STMT_END
360
361 /*
362  * Like Simple_vFAIL(), but accepts four arguments.
363  */
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
365     IV offset = RExC_parse - RExC_precomp;                      \
366     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
367             (int)offset, RExC_precomp, RExC_precomp + offset);  \
368 } STMT_END
369
370 /*
371  * Like Simple_vFAIL(), but accepts five arguments.
372  */
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
374     IV offset = RExC_parse - RExC_precomp;                      \
375     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
376             (int)offset, RExC_precomp, RExC_precomp + offset);  \
377 } STMT_END
378
379
380 #define vWARN(loc,m) STMT_START {                                       \
381     IV offset = loc - RExC_precomp;                                     \
382     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
383             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
384 } STMT_END
385
386 #define vWARNdep(loc,m) STMT_START {                                    \
387     IV offset = loc - RExC_precomp;                                     \
388     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
389             "%s" REPORT_LOCATION,                                       \
390             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
391 } STMT_END
392
393
394 #define vWARN2(loc, m, a1) STMT_START {                                 \
395     IV offset = loc - RExC_precomp;                                     \
396     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
397             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
398 } STMT_END
399
400 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
401     IV offset = loc - RExC_precomp;                                     \
402     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
403             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
404 } STMT_END
405
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
407     IV offset = loc - RExC_precomp;                                     \
408     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
409             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410 } STMT_END
411
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
413     IV offset = loc - RExC_precomp;                                     \
414     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
415             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416 } STMT_END
417
418
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START {                  \
421     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422 } STMT_END
423
424 /* Macros for recording node offsets.   20001227 mjd@plover.com 
425  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
426  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
427  * Element 0 holds the number n.
428  */
429
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
432
433
434 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
435     if (! SIZE_ONLY) {                                                  \
436         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
437                 __LINE__, (node), (byte)));                             \
438         if((node) < 0) {                                                \
439             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440         } else {                                                        \
441             RExC_offsets[2*(node)-1] = (byte);                          \
442         }                                                               \
443     }                                                                   \
444 } STMT_END
445
446 #define Set_Node_Offset(node,byte) \
447     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
451     if (! SIZE_ONLY) {                                                  \
452         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
453                 __LINE__, (node), (len)));                              \
454         if((node) < 0) {                                                \
455             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456         } else {                                                        \
457             RExC_offsets[2*(node)] = (len);                             \
458         }                                                               \
459     }                                                                   \
460 } STMT_END
461
462 #define Set_Node_Length(node,len) \
463     Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466     Set_Node_Length(node, RExC_parse - parse_start)
467
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
472 static void clear_re(pTHX_ void *r);
473
474 /* Mark that we cannot extend a found fixed substring at this point.
475    Updata the longest found anchored substring and the longest found
476    floating substrings if needed. */
477
478 STATIC void
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
480 {
481     STRLEN l = CHR_SVLEN(data->last_found);
482     STRLEN old_l = CHR_SVLEN(*data->longest);
483
484     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485         SvSetMagicSV(*data->longest, data->last_found);
486         if (*data->longest == data->longest_fixed) {
487             data->offset_fixed = l ? data->last_start_min : data->pos_min;
488             if (data->flags & SF_BEFORE_EOL)
489                 data->flags
490                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491             else
492                 data->flags &= ~SF_FIX_BEFORE_EOL;
493         }
494         else {
495             data->offset_float_min = l ? data->last_start_min : data->pos_min;
496             data->offset_float_max = (l
497                                       ? data->last_start_max
498                                       : data->pos_min + data->pos_delta);
499             if ((U32)data->offset_float_max > (U32)I32_MAX)
500                 data->offset_float_max = I32_MAX;
501             if (data->flags & SF_BEFORE_EOL)
502                 data->flags
503                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504             else
505                 data->flags &= ~SF_FL_BEFORE_EOL;
506         }
507     }
508     SvCUR_set(data->last_found, 0);
509     {
510         SV * sv = data->last_found;
511         MAGIC *mg =
512             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513         if (mg && mg->mg_len > 0)
514             mg->mg_len = 0;
515     }
516     data->last_end = -1;
517     data->flags &= ~SF_BEFORE_EOL;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC void
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
523 {
524     ANYOF_CLASS_ZERO(cl);
525     ANYOF_BITMAP_SETALL(cl);
526     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
527     if (LOC)
528         cl->flags |= ANYOF_LOCALE;
529 }
530
531 /* Can match anything (initialization) */
532 STATIC int
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534 {
535     int value;
536
537     for (value = 0; value <= ANYOF_MAX; value += 2)
538         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539             return 1;
540     if (!(cl->flags & ANYOF_UNICODE_ALL))
541         return 0;
542     if (!ANYOF_BITMAP_TESTALLSET(cl))
543         return 0;
544     return 1;
545 }
546
547 /* Can match anything (initialization) */
548 STATIC void
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
550 {
551     Zero(cl, 1, struct regnode_charclass_class);
552     cl->type = ANYOF;
553     cl_anything(pRExC_state, cl);
554 }
555
556 STATIC void
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
558 {
559     Zero(cl, 1, struct regnode_charclass_class);
560     cl->type = ANYOF;
561     cl_anything(pRExC_state, cl);
562     if (LOC)
563         cl->flags |= ANYOF_LOCALE;
564 }
565
566 /* 'And' a given class with another one.  Can create false positives */
567 /* We assume that cl is not inverted */
568 STATIC void
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570          struct regnode_charclass_class *and_with)
571 {
572     if (!(and_with->flags & ANYOF_CLASS)
573         && !(cl->flags & ANYOF_CLASS)
574         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575         && !(and_with->flags & ANYOF_FOLD)
576         && !(cl->flags & ANYOF_FOLD)) {
577         int i;
578
579         if (and_with->flags & ANYOF_INVERT)
580             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581                 cl->bitmap[i] &= ~and_with->bitmap[i];
582         else
583             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584                 cl->bitmap[i] &= and_with->bitmap[i];
585     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586     if (!(and_with->flags & ANYOF_EOS))
587         cl->flags &= ~ANYOF_EOS;
588
589     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590         !(and_with->flags & ANYOF_INVERT)) {
591         cl->flags &= ~ANYOF_UNICODE_ALL;
592         cl->flags |= ANYOF_UNICODE;
593         ARG_SET(cl, ARG(and_with));
594     }
595     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596         !(and_with->flags & ANYOF_INVERT))
597         cl->flags &= ~ANYOF_UNICODE_ALL;
598     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599         !(and_with->flags & ANYOF_INVERT))
600         cl->flags &= ~ANYOF_UNICODE;
601 }
602
603 /* 'OR' a given class with another one.  Can create false positives */
604 /* We assume that cl is not inverted */
605 STATIC void
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
607 {
608     if (or_with->flags & ANYOF_INVERT) {
609         /* We do not use
610          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611          *   <= (B1 | !B2) | (CL1 | !CL2)
612          * which is wasteful if CL2 is small, but we ignore CL2:
613          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614          * XXXX Can we handle case-fold?  Unclear:
615          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617          */
618         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619              && !(or_with->flags & ANYOF_FOLD)
620              && !(cl->flags & ANYOF_FOLD) ) {
621             int i;
622
623             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624                 cl->bitmap[i] |= ~or_with->bitmap[i];
625         } /* XXXX: logic is complicated otherwise */
626         else {
627             cl_anything(pRExC_state, cl);
628         }
629     } else {
630         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632              && (!(or_with->flags & ANYOF_FOLD)
633                  || (cl->flags & ANYOF_FOLD)) ) {
634             int i;
635
636             /* OR char bitmap and class bitmap separately */
637             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638                 cl->bitmap[i] |= or_with->bitmap[i];
639             if (or_with->flags & ANYOF_CLASS) {
640                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641                     cl->classflags[i] |= or_with->classflags[i];
642                 cl->flags |= ANYOF_CLASS;
643             }
644         }
645         else { /* XXXX: logic is complicated, leave it along for a moment. */
646             cl_anything(pRExC_state, cl);
647         }
648     }
649     if (or_with->flags & ANYOF_EOS)
650         cl->flags |= ANYOF_EOS;
651
652     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653         ARG(cl) != ARG(or_with)) {
654         cl->flags |= ANYOF_UNICODE_ALL;
655         cl->flags &= ~ANYOF_UNICODE;
656     }
657     if (or_with->flags & ANYOF_UNICODE_ALL) {
658         cl->flags |= ANYOF_UNICODE_ALL;
659         cl->flags &= ~ANYOF_UNICODE;
660     }
661 }
662
663 /*
664  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
665  * These need to be revisited when a newer toolchain becomes available.
666  */
667 #if defined(__sparc64__) && defined(__GNUC__)
668 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
669 #       undef  SPARC64_GCC_WORKAROUND
670 #       define SPARC64_GCC_WORKAROUND 1
671 #   endif
672 #endif
673
674 /* REx optimizer.  Converts nodes into quickier variants "in place".
675    Finds fixed substrings.  */
676
677 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
678    to the position after last scanned or to NULL. */
679
680 STATIC I32
681 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
682                         /* scanp: Start here (read-write). */
683                         /* deltap: Write maxlen-minlen here. */
684                         /* last: Stop before this one. */
685 {
686     I32 min = 0, pars = 0, code;
687     regnode *scan = *scanp, *next;
688     I32 delta = 0;
689     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
690     int is_inf_internal = 0;            /* The studied chunk is infinite */
691     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
692     scan_data_t data_fake;
693     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
694
695     while (scan && OP(scan) != END && scan < last) {
696         /* Peephole optimizer: */
697
698         if (PL_regkind[(U8)OP(scan)] == EXACT) {
699             /* Merge several consecutive EXACTish nodes into one. */
700             regnode *n = regnext(scan);
701             U32 stringok = 1;
702 #ifdef DEBUGGING
703             regnode *stop = scan;
704 #endif
705
706             next = scan + NODE_SZ_STR(scan);
707             /* Skip NOTHING, merge EXACT*. */
708             while (n &&
709                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
710                      (stringok && (OP(n) == OP(scan))))
711                    && NEXT_OFF(n)
712                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
713                 if (OP(n) == TAIL || n > next)
714                     stringok = 0;
715                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
716                     NEXT_OFF(scan) += NEXT_OFF(n);
717                     next = n + NODE_STEP_REGNODE;
718 #ifdef DEBUGGING
719                     if (stringok)
720                         stop = n;
721 #endif
722                     n = regnext(n);
723                 }
724                 else if (stringok) {
725                     int oldl = STR_LEN(scan);
726                     regnode *nnext = regnext(n);
727
728                     if (oldl + STR_LEN(n) > U8_MAX)
729                         break;
730                     NEXT_OFF(scan) += NEXT_OFF(n);
731                     STR_LEN(scan) += STR_LEN(n);
732                     next = n + NODE_SZ_STR(n);
733                     /* Now we can overwrite *n : */
734                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
735 #ifdef DEBUGGING
736                     stop = next - 1;
737 #endif
738                     n = nnext;
739                 }
740             }
741
742             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
743 /*
744   Two problematic code points in Unicode casefolding of EXACT nodes:
745
746    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
747    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
748
749    which casefold to
750
751    Unicode                      UTF-8
752
753    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
754    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
755
756    This means that in case-insensitive matching (or "loose matching",
757    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
758    length of the above casefolded versions) can match a target string
759    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
760    This would rather mess up the minimum length computation.
761
762    What we'll do is to look for the tail four bytes, and then peek
763    at the preceding two bytes to see whether we need to decrease
764    the minimum length by four (six minus two).
765
766    Thanks to the design of UTF-8, there cannot be false matches:
767    A sequence of valid UTF-8 bytes cannot be a subsequence of
768    another valid sequence of UTF-8 bytes.
769
770 */
771                  char *s0 = STRING(scan), *s, *t;
772                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
773                  char *t0 = "\xcc\x88\xcc\x81";
774                  char *t1 = t0 + 3;
775                  
776                  for (s = s0 + 2;
777                       s < s2 && (t = ninstr(s, s1, t0, t1));
778                       s = t + 4) {
779                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
780                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
781                            min -= 4;
782                  }
783             }
784
785 #ifdef DEBUGGING
786             /* Allow dumping */
787             n = scan + NODE_SZ_STR(scan);
788             while (n <= stop) {
789                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
790                     OP(n) = OPTIMIZED;
791                     NEXT_OFF(n) = 0;
792                 }
793                 n++;
794             }
795 #endif
796         }
797         /* Follow the next-chain of the current node and optimize
798            away all the NOTHINGs from it.  */
799         if (OP(scan) != CURLYX) {
800             int max = (reg_off_by_arg[OP(scan)]
801                        ? I32_MAX
802                        /* I32 may be smaller than U16 on CRAYs! */
803                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
804             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805             int noff;
806             regnode *n = scan;
807         
808             /* Skip NOTHING and LONGJMP. */
809             while ((n = regnext(n))
810                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
811                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
812                    && off + noff < max)
813                 off += noff;
814             if (reg_off_by_arg[OP(scan)])
815                 ARG(scan) = off;
816             else
817                 NEXT_OFF(scan) = off;
818         }
819         /* The principal pseudo-switch.  Cannot be a switch, since we
820            look into several different things.  */
821         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
822                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
823             next = regnext(scan);
824             code = OP(scan);
825         
826             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
827                 I32 max1 = 0, min1 = I32_MAX, num = 0;
828                 struct regnode_charclass_class accum;
829                 
830                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
831                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
832                 if (flags & SCF_DO_STCLASS)
833                     cl_init_zero(pRExC_state, &accum);
834                 while (OP(scan) == code) {
835                     I32 deltanext, minnext, f = 0, fake;
836                     struct regnode_charclass_class this_class;
837
838                     num++;
839                     data_fake.flags = 0;
840                     if (data) {         
841                         data_fake.whilem_c = data->whilem_c;
842                         data_fake.last_closep = data->last_closep;
843                     }
844                     else
845                         data_fake.last_closep = &fake;
846                     next = regnext(scan);
847                     scan = NEXTOPER(scan);
848                     if (code != BRANCH)
849                         scan = NEXTOPER(scan);
850                     if (flags & SCF_DO_STCLASS) {
851                         cl_init(pRExC_state, &this_class);
852                         data_fake.start_class = &this_class;
853                         f = SCF_DO_STCLASS_AND;
854                     }           
855                     if (flags & SCF_WHILEM_VISITED_POS)
856                         f |= SCF_WHILEM_VISITED_POS;
857                     /* we suppose the run is continuous, last=next...*/
858                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
859                                           next, &data_fake, f);
860                     if (min1 > minnext)
861                         min1 = minnext;
862                     if (max1 < minnext + deltanext)
863                         max1 = minnext + deltanext;
864                     if (deltanext == I32_MAX)
865                         is_inf = is_inf_internal = 1;
866                     scan = next;
867                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
868                         pars++;
869                     if (data && (data_fake.flags & SF_HAS_EVAL))
870                         data->flags |= SF_HAS_EVAL;
871                     if (data)
872                         data->whilem_c = data_fake.whilem_c;
873                     if (flags & SCF_DO_STCLASS)
874                         cl_or(pRExC_state, &accum, &this_class);
875                     if (code == SUSPEND)
876                         break;
877                 }
878                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
879                     min1 = 0;
880                 if (flags & SCF_DO_SUBSTR) {
881                     data->pos_min += min1;
882                     data->pos_delta += max1 - min1;
883                     if (max1 != min1 || is_inf)
884                         data->longest = &(data->longest_float);
885                 }
886                 min += min1;
887                 delta += max1 - min1;
888                 if (flags & SCF_DO_STCLASS_OR) {
889                     cl_or(pRExC_state, data->start_class, &accum);
890                     if (min1) {
891                         cl_and(data->start_class, &and_with);
892                         flags &= ~SCF_DO_STCLASS;
893                     }
894                 }
895                 else if (flags & SCF_DO_STCLASS_AND) {
896                     if (min1) {
897                         cl_and(data->start_class, &accum);
898                         flags &= ~SCF_DO_STCLASS;
899                     }
900                     else {
901                         /* Switch to OR mode: cache the old value of
902                          * data->start_class */
903                         StructCopy(data->start_class, &and_with,
904                                    struct regnode_charclass_class);
905                         flags &= ~SCF_DO_STCLASS_AND;
906                         StructCopy(&accum, data->start_class,
907                                    struct regnode_charclass_class);
908                         flags |= SCF_DO_STCLASS_OR;
909                         data->start_class->flags |= ANYOF_EOS;
910                     }
911                 }
912             }
913             else if (code == BRANCHJ)   /* single branch is optimized. */
914                 scan = NEXTOPER(NEXTOPER(scan));
915             else                        /* single branch is optimized. */
916                 scan = NEXTOPER(scan);
917             continue;
918         }
919         else if (OP(scan) == EXACT) {
920             I32 l = STR_LEN(scan);
921             UV uc = *((U8*)STRING(scan));
922             if (UTF) {
923                 U8 *s = (U8*)STRING(scan);
924                 l = utf8_length(s, s + l);
925                 uc = utf8_to_uvchr(s, NULL);
926             }
927             min += l;
928             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
929                 /* The code below prefers earlier match for fixed
930                    offset, later match for variable offset.  */
931                 if (data->last_end == -1) { /* Update the start info. */
932                     data->last_start_min = data->pos_min;
933                     data->last_start_max = is_inf
934                         ? I32_MAX : data->pos_min + data->pos_delta;
935                 }
936                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
937                 {
938                     SV * sv = data->last_found;
939                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
940                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
941                     if (mg && mg->mg_len >= 0)
942                         mg->mg_len += utf8_length((U8*)STRING(scan),
943                                                   (U8*)STRING(scan)+STR_LEN(scan));
944                 }
945                 if (UTF)
946                     SvUTF8_on(data->last_found);
947                 data->last_end = data->pos_min + l;
948                 data->pos_min += l; /* As in the first entry. */
949                 data->flags &= ~SF_BEFORE_EOL;
950             }
951             if (flags & SCF_DO_STCLASS_AND) {
952                 /* Check whether it is compatible with what we know already! */
953                 int compat = 1;
954
955                 if (uc >= 0x100 ||
956                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
957                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
958                     && (!(data->start_class->flags & ANYOF_FOLD)
959                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
960                     )
961                     compat = 0;
962                 ANYOF_CLASS_ZERO(data->start_class);
963                 ANYOF_BITMAP_ZERO(data->start_class);
964                 if (compat)
965                     ANYOF_BITMAP_SET(data->start_class, uc);
966                 data->start_class->flags &= ~ANYOF_EOS;
967                 if (uc < 0x100)
968                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
969             }
970             else if (flags & SCF_DO_STCLASS_OR) {
971                 /* false positive possible if the class is case-folded */
972                 if (uc < 0x100)
973                     ANYOF_BITMAP_SET(data->start_class, uc);
974                 else
975                     data->start_class->flags |= ANYOF_UNICODE_ALL;
976                 data->start_class->flags &= ~ANYOF_EOS;
977                 cl_and(data->start_class, &and_with);
978             }
979             flags &= ~SCF_DO_STCLASS;
980         }
981         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
982             I32 l = STR_LEN(scan);
983             UV uc = *((U8*)STRING(scan));
984
985             /* Search for fixed substrings supports EXACT only. */
986             if (flags & SCF_DO_SUBSTR)
987                 scan_commit(pRExC_state, data);
988             if (UTF) {
989                 U8 *s = (U8 *)STRING(scan);
990                 l = utf8_length(s, s + l);
991                 uc = utf8_to_uvchr(s, NULL);
992             }
993             min += l;
994             if (data && (flags & SCF_DO_SUBSTR))
995                 data->pos_min += l;
996             if (flags & SCF_DO_STCLASS_AND) {
997                 /* Check whether it is compatible with what we know already! */
998                 int compat = 1;
999
1000                 if (uc >= 0x100 ||
1001                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1002                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
1003                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
1004                     compat = 0;
1005                 ANYOF_CLASS_ZERO(data->start_class);
1006                 ANYOF_BITMAP_ZERO(data->start_class);
1007                 if (compat) {
1008                     ANYOF_BITMAP_SET(data->start_class, uc);
1009                     data->start_class->flags &= ~ANYOF_EOS;
1010                     data->start_class->flags |= ANYOF_FOLD;
1011                     if (OP(scan) == EXACTFL)
1012                         data->start_class->flags |= ANYOF_LOCALE;
1013                 }
1014             }
1015             else if (flags & SCF_DO_STCLASS_OR) {
1016                 if (data->start_class->flags & ANYOF_FOLD) {
1017                     /* false positive possible if the class is case-folded.
1018                        Assume that the locale settings are the same... */
1019                     if (uc < 0x100)
1020                         ANYOF_BITMAP_SET(data->start_class, uc);
1021                     data->start_class->flags &= ~ANYOF_EOS;
1022                 }
1023                 cl_and(data->start_class, &and_with);
1024             }
1025             flags &= ~SCF_DO_STCLASS;
1026         }
1027         else if (strchr((char*)PL_varies,OP(scan))) {
1028             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1029             I32 f = flags, pos_before = 0;
1030             regnode *oscan = scan;
1031             struct regnode_charclass_class this_class;
1032             struct regnode_charclass_class *oclass = NULL;
1033             I32 next_is_eval = 0;
1034
1035             switch (PL_regkind[(U8)OP(scan)]) {
1036             case WHILEM:                /* End of (?:...)* . */
1037                 scan = NEXTOPER(scan);
1038                 goto finish;
1039             case PLUS:
1040                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1041                     next = NEXTOPER(scan);
1042                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1043                         mincount = 1;
1044                         maxcount = REG_INFTY;
1045                         next = regnext(scan);
1046                         scan = NEXTOPER(scan);
1047                         goto do_curly;
1048                     }
1049                 }
1050                 if (flags & SCF_DO_SUBSTR)
1051                     data->pos_min++;
1052                 min++;
1053                 /* Fall through. */
1054             case STAR:
1055                 if (flags & SCF_DO_STCLASS) {
1056                     mincount = 0;
1057                     maxcount = REG_INFTY;
1058                     next = regnext(scan);
1059                     scan = NEXTOPER(scan);
1060                     goto do_curly;
1061                 }
1062                 is_inf = is_inf_internal = 1;
1063                 scan = regnext(scan);
1064                 if (flags & SCF_DO_SUBSTR) {
1065                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1066                     data->longest = &(data->longest_float);
1067                 }
1068                 goto optimize_curly_tail;
1069             case CURLY:
1070                 mincount = ARG1(scan);
1071                 maxcount = ARG2(scan);
1072                 next = regnext(scan);
1073                 if (OP(scan) == CURLYX) {
1074                     I32 lp = (data ? *(data->last_closep) : 0);
1075
1076                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1077                 }
1078                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1079                 next_is_eval = (OP(scan) == EVAL);
1080               do_curly:
1081                 if (flags & SCF_DO_SUBSTR) {
1082                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1083                     pos_before = data->pos_min;
1084                 }
1085                 if (data) {
1086                     fl = data->flags;
1087                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1088                     if (is_inf)
1089                         data->flags |= SF_IS_INF;
1090                 }
1091                 if (flags & SCF_DO_STCLASS) {
1092                     cl_init(pRExC_state, &this_class);
1093                     oclass = data->start_class;
1094                     data->start_class = &this_class;
1095                     f |= SCF_DO_STCLASS_AND;
1096                     f &= ~SCF_DO_STCLASS_OR;
1097                 }
1098                 /* These are the cases when once a subexpression
1099                    fails at a particular position, it cannot succeed
1100                    even after backtracking at the enclosing scope.
1101                 
1102                    XXXX what if minimal match and we are at the
1103                         initial run of {n,m}? */
1104                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1105                     f &= ~SCF_WHILEM_VISITED_POS;
1106
1107                 /* This will finish on WHILEM, setting scan, or on NULL: */
1108                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1109                                       mincount == 0
1110                                         ? (f & ~SCF_DO_SUBSTR) : f);
1111
1112                 if (flags & SCF_DO_STCLASS)
1113                     data->start_class = oclass;
1114                 if (mincount == 0 || minnext == 0) {
1115                     if (flags & SCF_DO_STCLASS_OR) {
1116                         cl_or(pRExC_state, data->start_class, &this_class);
1117                     }
1118                     else if (flags & SCF_DO_STCLASS_AND) {
1119                         /* Switch to OR mode: cache the old value of
1120                          * data->start_class */
1121                         StructCopy(data->start_class, &and_with,
1122                                    struct regnode_charclass_class);
1123                         flags &= ~SCF_DO_STCLASS_AND;
1124                         StructCopy(&this_class, data->start_class,
1125                                    struct regnode_charclass_class);
1126                         flags |= SCF_DO_STCLASS_OR;
1127                         data->start_class->flags |= ANYOF_EOS;
1128                     }
1129                 } else {                /* Non-zero len */
1130                     if (flags & SCF_DO_STCLASS_OR) {
1131                         cl_or(pRExC_state, data->start_class, &this_class);
1132                         cl_and(data->start_class, &and_with);
1133                     }
1134                     else if (flags & SCF_DO_STCLASS_AND)
1135                         cl_and(data->start_class, &this_class);
1136                     flags &= ~SCF_DO_STCLASS;
1137                 }
1138                 if (!scan)              /* It was not CURLYX, but CURLY. */
1139                     scan = next;
1140                 if (ckWARN(WARN_REGEXP)
1141                        /* ? quantifier ok, except for (?{ ... }) */
1142                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1143                     && (minnext == 0) && (deltanext == 0)
1144                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1145                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1146                 {
1147                     vWARN(RExC_parse,
1148                           "Quantifier unexpected on zero-length expression");
1149                 }
1150
1151                 min += minnext * mincount;
1152                 is_inf_internal |= ((maxcount == REG_INFTY
1153                                      && (minnext + deltanext) > 0)
1154                                     || deltanext == I32_MAX);
1155                 is_inf |= is_inf_internal;
1156                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1157
1158                 /* Try powerful optimization CURLYX => CURLYN. */
1159                 if (  OP(oscan) == CURLYX && data
1160                       && data->flags & SF_IN_PAR
1161                       && !(data->flags & SF_HAS_EVAL)
1162                       && !deltanext && minnext == 1 ) {
1163                     /* Try to optimize to CURLYN.  */
1164                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1165                     regnode *nxt1 = nxt;
1166 #ifdef DEBUGGING
1167                     regnode *nxt2;
1168 #endif
1169
1170                     /* Skip open. */
1171                     nxt = regnext(nxt);
1172                     if (!strchr((char*)PL_simple,OP(nxt))
1173                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1174                              && STR_LEN(nxt) == 1))
1175                         goto nogo;
1176 #ifdef DEBUGGING
1177                     nxt2 = nxt;
1178 #endif
1179                     nxt = regnext(nxt);
1180                     if (OP(nxt) != CLOSE)
1181                         goto nogo;
1182                     /* Now we know that nxt2 is the only contents: */
1183                     oscan->flags = (U8)ARG(nxt);
1184                     OP(oscan) = CURLYN;
1185                     OP(nxt1) = NOTHING; /* was OPEN. */
1186 #ifdef DEBUGGING
1187                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1188                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1189                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1190                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1191                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1192                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1193 #endif
1194                 }
1195               nogo:
1196
1197                 /* Try optimization CURLYX => CURLYM. */
1198                 if (  OP(oscan) == CURLYX && data
1199                       && !(data->flags & SF_HAS_PAR)
1200                       && !(data->flags & SF_HAS_EVAL)
1201                       && !deltanext     /* atom is fixed width */
1202                       && minnext != 0   /* CURLYM can't handle zero width */
1203                 ) {
1204                     /* XXXX How to optimize if data == 0? */
1205                     /* Optimize to a simpler form.  */
1206                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1207                     regnode *nxt2;
1208
1209                     OP(oscan) = CURLYM;
1210                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1211                             && (OP(nxt2) != WHILEM))
1212                         nxt = nxt2;
1213                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1214                     /* Need to optimize away parenths. */
1215                     if (data->flags & SF_IN_PAR) {
1216                         /* Set the parenth number.  */
1217                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1218
1219                         if (OP(nxt) != CLOSE)
1220                             FAIL("Panic opt close");
1221                         oscan->flags = (U8)ARG(nxt);
1222                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1223                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1224 #ifdef DEBUGGING
1225                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1226                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1227                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1228                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1229 #endif
1230 #if 0
1231                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1232                             regnode *nnxt = regnext(nxt1);
1233                         
1234                             if (nnxt == nxt) {
1235                                 if (reg_off_by_arg[OP(nxt1)])
1236                                     ARG_SET(nxt1, nxt2 - nxt1);
1237                                 else if (nxt2 - nxt1 < U16_MAX)
1238                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1239                                 else
1240                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1241                             }
1242                             nxt1 = nnxt;
1243                         }
1244 #endif
1245                         /* Optimize again: */
1246                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1247                                     NULL, 0);
1248                     }
1249                     else
1250                         oscan->flags = 0;
1251                 }
1252                 else if ((OP(oscan) == CURLYX)
1253                          && (flags & SCF_WHILEM_VISITED_POS)
1254                          /* See the comment on a similar expression above.
1255                             However, this time it not a subexpression
1256                             we care about, but the expression itself. */
1257                          && (maxcount == REG_INFTY)
1258                          && data && ++data->whilem_c < 16) {
1259                     /* This stays as CURLYX, we can put the count/of pair. */
1260                     /* Find WHILEM (as in regexec.c) */
1261                     regnode *nxt = oscan + NEXT_OFF(oscan);
1262
1263                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1264                         nxt += ARG(nxt);
1265                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1266                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1267                 }
1268                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1269                     pars++;
1270                 if (flags & SCF_DO_SUBSTR) {
1271                     SV *last_str = Nullsv;
1272                     int counted = mincount != 0;
1273
1274                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1275 #if defined(SPARC64_GCC_WORKAROUND)
1276                         I32 b = 0;
1277                         STRLEN l = 0;
1278                         char *s = NULL;
1279                         I32 old = 0;
1280
1281                         if (pos_before >= data->last_start_min)
1282                             b = pos_before;
1283                         else
1284                             b = data->last_start_min;
1285
1286                         l = 0;
1287                         s = SvPV(data->last_found, l);
1288                         old = b - data->last_start_min;
1289
1290 #else
1291                         I32 b = pos_before >= data->last_start_min
1292                             ? pos_before : data->last_start_min;
1293                         STRLEN l;
1294                         char *s = SvPV(data->last_found, l);
1295                         I32 old = b - data->last_start_min;
1296 #endif
1297
1298                         if (UTF)
1299                             old = utf8_hop((U8*)s, old) - (U8*)s;
1300                         
1301                         l -= old;
1302                         /* Get the added string: */
1303                         last_str = newSVpvn(s  + old, l);
1304                         if (UTF)
1305                             SvUTF8_on(last_str);
1306                         if (deltanext == 0 && pos_before == b) {
1307                             /* What was added is a constant string */
1308                             if (mincount > 1) {
1309                                 SvGROW(last_str, (mincount * l) + 1);
1310                                 repeatcpy(SvPVX(last_str) + l,
1311                                           SvPVX(last_str), l, mincount - 1);
1312                                 SvCUR(last_str) *= mincount;
1313                                 /* Add additional parts. */
1314                                 SvCUR_set(data->last_found,
1315                                           SvCUR(data->last_found) - l);
1316                                 sv_catsv(data->last_found, last_str);
1317                                 {
1318                                     SV * sv = data->last_found;
1319                                     MAGIC *mg =
1320                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1321                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1322                                     if (mg && mg->mg_len >= 0)
1323                                         mg->mg_len += CHR_SVLEN(last_str);
1324                                 }
1325                                 data->last_end += l * (mincount - 1);
1326                             }
1327                         } else {
1328                             /* start offset must point into the last copy */
1329                             data->last_start_min += minnext * (mincount - 1);
1330                             data->last_start_max += is_inf ? I32_MAX
1331                                 : (maxcount - 1) * (minnext + data->pos_delta);
1332                         }
1333                     }
1334                     /* It is counted once already... */
1335                     data->pos_min += minnext * (mincount - counted);
1336                     data->pos_delta += - counted * deltanext +
1337                         (minnext + deltanext) * maxcount - minnext * mincount;
1338                     if (mincount != maxcount) {
1339                          /* Cannot extend fixed substrings found inside
1340                             the group.  */
1341                         scan_commit(pRExC_state,data);
1342                         if (mincount && last_str) {
1343                             sv_setsv(data->last_found, last_str);
1344                             data->last_end = data->pos_min;
1345                             data->last_start_min =
1346                                 data->pos_min - CHR_SVLEN(last_str);
1347                             data->last_start_max = is_inf
1348                                 ? I32_MAX
1349                                 : data->pos_min + data->pos_delta
1350                                 - CHR_SVLEN(last_str);
1351                         }
1352                         data->longest = &(data->longest_float);
1353                     }
1354                     SvREFCNT_dec(last_str);
1355                 }
1356                 if (data && (fl & SF_HAS_EVAL))
1357                     data->flags |= SF_HAS_EVAL;
1358               optimize_curly_tail:
1359                 if (OP(oscan) != CURLYX) {
1360                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1361                            && NEXT_OFF(next))
1362                         NEXT_OFF(oscan) += NEXT_OFF(next);
1363                 }
1364                 continue;
1365             default:                    /* REF and CLUMP only? */
1366                 if (flags & SCF_DO_SUBSTR) {
1367                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1368                     data->longest = &(data->longest_float);
1369                 }
1370                 is_inf = is_inf_internal = 1;
1371                 if (flags & SCF_DO_STCLASS_OR)
1372                     cl_anything(pRExC_state, data->start_class);
1373                 flags &= ~SCF_DO_STCLASS;
1374                 break;
1375             }
1376         }
1377         else if (strchr((char*)PL_simple,OP(scan))) {
1378             int value = 0;
1379
1380             if (flags & SCF_DO_SUBSTR) {
1381                 scan_commit(pRExC_state,data);
1382                 data->pos_min++;
1383             }
1384             min++;
1385             if (flags & SCF_DO_STCLASS) {
1386                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1387
1388                 /* Some of the logic below assumes that switching
1389                    locale on will only add false positives. */
1390                 switch (PL_regkind[(U8)OP(scan)]) {
1391                 case SANY:
1392                 default:
1393                   do_default:
1394                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1395                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1396                         cl_anything(pRExC_state, data->start_class);
1397                     break;
1398                 case REG_ANY:
1399                     if (OP(scan) == SANY)
1400                         goto do_default;
1401                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1402                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1403                                  || (data->start_class->flags & ANYOF_CLASS));
1404                         cl_anything(pRExC_state, data->start_class);
1405                     }
1406                     if (flags & SCF_DO_STCLASS_AND || !value)
1407                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1408                     break;
1409                 case ANYOF:
1410                     if (flags & SCF_DO_STCLASS_AND)
1411                         cl_and(data->start_class,
1412                                (struct regnode_charclass_class*)scan);
1413                     else
1414                         cl_or(pRExC_state, data->start_class,
1415                               (struct regnode_charclass_class*)scan);
1416                     break;
1417                 case ALNUM:
1418                     if (flags & SCF_DO_STCLASS_AND) {
1419                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1420                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421                             for (value = 0; value < 256; value++)
1422                                 if (!isALNUM(value))
1423                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1424                         }
1425                     }
1426                     else {
1427                         if (data->start_class->flags & ANYOF_LOCALE)
1428                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1429                         else {
1430                             for (value = 0; value < 256; value++)
1431                                 if (isALNUM(value))
1432                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1433                         }
1434                     }
1435                     break;
1436                 case ALNUML:
1437                     if (flags & SCF_DO_STCLASS_AND) {
1438                         if (data->start_class->flags & ANYOF_LOCALE)
1439                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1440                     }
1441                     else {
1442                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1443                         data->start_class->flags |= ANYOF_LOCALE;
1444                     }
1445                     break;
1446                 case NALNUM:
1447                     if (flags & SCF_DO_STCLASS_AND) {
1448                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1449                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450                             for (value = 0; value < 256; value++)
1451                                 if (isALNUM(value))
1452                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1453                         }
1454                     }
1455                     else {
1456                         if (data->start_class->flags & ANYOF_LOCALE)
1457                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1458                         else {
1459                             for (value = 0; value < 256; value++)
1460                                 if (!isALNUM(value))
1461                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1462                         }
1463                     }
1464                     break;
1465                 case NALNUML:
1466                     if (flags & SCF_DO_STCLASS_AND) {
1467                         if (data->start_class->flags & ANYOF_LOCALE)
1468                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1469                     }
1470                     else {
1471                         data->start_class->flags |= ANYOF_LOCALE;
1472                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1473                     }
1474                     break;
1475                 case SPACE:
1476                     if (flags & SCF_DO_STCLASS_AND) {
1477                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1478                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479                             for (value = 0; value < 256; value++)
1480                                 if (!isSPACE(value))
1481                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1482                         }
1483                     }
1484                     else {
1485                         if (data->start_class->flags & ANYOF_LOCALE)
1486                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1487                         else {
1488                             for (value = 0; value < 256; value++)
1489                                 if (isSPACE(value))
1490                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1491                         }
1492                     }
1493                     break;
1494                 case SPACEL:
1495                     if (flags & SCF_DO_STCLASS_AND) {
1496                         if (data->start_class->flags & ANYOF_LOCALE)
1497                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1498                     }
1499                     else {
1500                         data->start_class->flags |= ANYOF_LOCALE;
1501                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1502                     }
1503                     break;
1504                 case NSPACE:
1505                     if (flags & SCF_DO_STCLASS_AND) {
1506                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1507                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508                             for (value = 0; value < 256; value++)
1509                                 if (isSPACE(value))
1510                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1511                         }
1512                     }
1513                     else {
1514                         if (data->start_class->flags & ANYOF_LOCALE)
1515                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1516                         else {
1517                             for (value = 0; value < 256; value++)
1518                                 if (!isSPACE(value))
1519                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1520                         }
1521                     }
1522                     break;
1523                 case NSPACEL:
1524                     if (flags & SCF_DO_STCLASS_AND) {
1525                         if (data->start_class->flags & ANYOF_LOCALE) {
1526                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1527                             for (value = 0; value < 256; value++)
1528                                 if (!isSPACE(value))
1529                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1530                         }
1531                     }
1532                     else {
1533                         data->start_class->flags |= ANYOF_LOCALE;
1534                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1535                     }
1536                     break;
1537                 case DIGIT:
1538                     if (flags & SCF_DO_STCLASS_AND) {
1539                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1540                         for (value = 0; value < 256; value++)
1541                             if (!isDIGIT(value))
1542                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1543                     }
1544                     else {
1545                         if (data->start_class->flags & ANYOF_LOCALE)
1546                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1547                         else {
1548                             for (value = 0; value < 256; value++)
1549                                 if (isDIGIT(value))
1550                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1551                         }
1552                     }
1553                     break;
1554                 case NDIGIT:
1555                     if (flags & SCF_DO_STCLASS_AND) {
1556                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1557                         for (value = 0; value < 256; value++)
1558                             if (isDIGIT(value))
1559                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1560                     }
1561                     else {
1562                         if (data->start_class->flags & ANYOF_LOCALE)
1563                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1564                         else {
1565                             for (value = 0; value < 256; value++)
1566                                 if (!isDIGIT(value))
1567                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1568                         }
1569                     }
1570                     break;
1571                 }
1572                 if (flags & SCF_DO_STCLASS_OR)
1573                     cl_and(data->start_class, &and_with);
1574                 flags &= ~SCF_DO_STCLASS;
1575             }
1576         }
1577         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1578             data->flags |= (OP(scan) == MEOL
1579                             ? SF_BEFORE_MEOL
1580                             : SF_BEFORE_SEOL);
1581         }
1582         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1583                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1584                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1585                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1586             /* Lookahead/lookbehind */
1587             I32 deltanext, minnext, fake = 0;
1588             regnode *nscan;
1589             struct regnode_charclass_class intrnl;
1590             int f = 0;
1591
1592             data_fake.flags = 0;
1593             if (data) {         
1594                 data_fake.whilem_c = data->whilem_c;
1595                 data_fake.last_closep = data->last_closep;
1596             }
1597             else
1598                 data_fake.last_closep = &fake;
1599             if ( flags & SCF_DO_STCLASS && !scan->flags
1600                  && OP(scan) == IFMATCH ) { /* Lookahead */
1601                 cl_init(pRExC_state, &intrnl);
1602                 data_fake.start_class = &intrnl;
1603                 f |= SCF_DO_STCLASS_AND;
1604             }
1605             if (flags & SCF_WHILEM_VISITED_POS)
1606                 f |= SCF_WHILEM_VISITED_POS;
1607             next = regnext(scan);
1608             nscan = NEXTOPER(NEXTOPER(scan));
1609             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1610             if (scan->flags) {
1611                 if (deltanext) {
1612                     vFAIL("Variable length lookbehind not implemented");
1613                 }
1614                 else if (minnext > U8_MAX) {
1615                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1616                 }
1617                 scan->flags = (U8)minnext;
1618             }
1619             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1620                 pars++;
1621             if (data && (data_fake.flags & SF_HAS_EVAL))
1622                 data->flags |= SF_HAS_EVAL;
1623             if (data)
1624                 data->whilem_c = data_fake.whilem_c;
1625             if (f & SCF_DO_STCLASS_AND) {
1626                 int was = (data->start_class->flags & ANYOF_EOS);
1627
1628                 cl_and(data->start_class, &intrnl);
1629                 if (was)
1630                     data->start_class->flags |= ANYOF_EOS;
1631             }
1632         }
1633         else if (OP(scan) == OPEN) {
1634             pars++;
1635         }
1636         else if (OP(scan) == CLOSE) {
1637             if ((I32)ARG(scan) == is_par) {
1638                 next = regnext(scan);
1639
1640                 if ( next && (OP(next) != WHILEM) && next < last)
1641                     is_par = 0;         /* Disable optimization */
1642             }
1643             if (data)
1644                 *(data->last_closep) = ARG(scan);
1645         }
1646         else if (OP(scan) == EVAL) {
1647                 if (data)
1648                     data->flags |= SF_HAS_EVAL;
1649         }
1650         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1651                 if (flags & SCF_DO_SUBSTR) {
1652                     scan_commit(pRExC_state,data);
1653                     data->longest = &(data->longest_float);
1654                 }
1655                 is_inf = is_inf_internal = 1;
1656                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1657                     cl_anything(pRExC_state, data->start_class);
1658                 flags &= ~SCF_DO_STCLASS;
1659         }
1660         /* Else: zero-length, ignore. */
1661         scan = regnext(scan);
1662     }
1663
1664   finish:
1665     *scanp = scan;
1666     *deltap = is_inf_internal ? I32_MAX : delta;
1667     if (flags & SCF_DO_SUBSTR && is_inf)
1668         data->pos_delta = I32_MAX - data->pos_min;
1669     if (is_par > U8_MAX)
1670         is_par = 0;
1671     if (is_par && pars==1 && data) {
1672         data->flags |= SF_IN_PAR;
1673         data->flags &= ~SF_HAS_PAR;
1674     }
1675     else if (pars && data) {
1676         data->flags |= SF_HAS_PAR;
1677         data->flags &= ~SF_IN_PAR;
1678     }
1679     if (flags & SCF_DO_STCLASS_OR)
1680         cl_and(data->start_class, &and_with);
1681     return min;
1682 }
1683
1684 STATIC I32
1685 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1686 {
1687     if (RExC_rx->data) {
1688         Renewc(RExC_rx->data,
1689                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1690                char, struct reg_data);
1691         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1692         RExC_rx->data->count += n;
1693     }
1694     else {
1695         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1696              char, struct reg_data);
1697         New(1208, RExC_rx->data->what, n, U8);
1698         RExC_rx->data->count = n;
1699     }
1700     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1701     return RExC_rx->data->count - n;
1702 }
1703
1704 void
1705 Perl_reginitcolors(pTHX)
1706 {
1707     int i = 0;
1708     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1709         
1710     if (s) {
1711         PL_colors[0] = s = savepv(s);
1712         while (++i < 6) {
1713             s = strchr(s, '\t');
1714             if (s) {
1715                 *s = '\0';
1716                 PL_colors[i] = ++s;
1717             }
1718             else
1719                 PL_colors[i] = s = "";
1720         }
1721     } else {
1722         while (i < 6)
1723             PL_colors[i++] = "";
1724     }
1725     PL_colorset = 1;
1726 }
1727
1728
1729 /*
1730  - pregcomp - compile a regular expression into internal code
1731  *
1732  * We can't allocate space until we know how big the compiled form will be,
1733  * but we can't compile it (and thus know how big it is) until we've got a
1734  * place to put the code.  So we cheat:  we compile it twice, once with code
1735  * generation turned off and size counting turned on, and once "for real".
1736  * This also means that we don't allocate space until we are sure that the
1737  * thing really will compile successfully, and we never have to move the
1738  * code and thus invalidate pointers into it.  (Note that it has to be in
1739  * one piece because free() must be able to free it all.) [NB: not true in perl]
1740  *
1741  * Beware that the optimization-preparation code in here knows about some
1742  * of the structure of the compiled regexp.  [I'll say.]
1743  */
1744 regexp *
1745 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1746 {
1747     register regexp *r;
1748     regnode *scan;
1749     regnode *first;
1750     I32 flags;
1751     I32 minlen = 0;
1752     I32 sawplus = 0;
1753     I32 sawopen = 0;
1754     scan_data_t data;
1755     RExC_state_t RExC_state;
1756     RExC_state_t *pRExC_state = &RExC_state;
1757
1758     if (exp == NULL)
1759         FAIL("NULL regexp argument");
1760
1761     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1762
1763     RExC_precomp = exp;
1764     DEBUG_r({
1765          if (!PL_colorset) reginitcolors();
1766          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1767                        PL_colors[4],PL_colors[5],PL_colors[0],
1768                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1769     });
1770     RExC_flags = pm->op_pmflags;
1771     RExC_sawback = 0;
1772
1773     RExC_seen = 0;
1774     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1775     RExC_seen_evals = 0;
1776     RExC_extralen = 0;
1777
1778     /* First pass: determine size, legality. */
1779     RExC_parse = exp;
1780     RExC_start = exp;
1781     RExC_end = xend;
1782     RExC_naughty = 0;
1783     RExC_npar = 1;
1784     RExC_size = 0L;
1785     RExC_emit = &PL_regdummy;
1786     RExC_whilem_seen = 0;
1787 #if 0 /* REGC() is (currently) a NOP at the first pass.
1788        * Clever compilers notice this and complain. --jhi */
1789     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1790 #endif
1791     if (reg(pRExC_state, 0, &flags) == NULL) {
1792         RExC_precomp = Nullch;
1793         return(NULL);
1794     }
1795     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1796
1797     /* Small enough for pointer-storage convention?
1798        If extralen==0, this means that we will not need long jumps. */
1799     if (RExC_size >= 0x10000L && RExC_extralen)
1800         RExC_size += RExC_extralen;
1801     else
1802         RExC_extralen = 0;
1803     if (RExC_whilem_seen > 15)
1804         RExC_whilem_seen = 15;
1805
1806     /* Allocate space and initialize. */
1807     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1808          char, regexp);
1809     if (r == NULL)
1810         FAIL("Regexp out of space");
1811
1812 #ifdef DEBUGGING
1813     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1814     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1815 #endif
1816     r->refcnt = 1;
1817     r->prelen = xend - exp;
1818     r->precomp = savepvn(RExC_precomp, r->prelen);
1819     r->subbeg = NULL;
1820 #ifdef PERL_COPY_ON_WRITE
1821     r->saved_copy = Nullsv;
1822 #endif
1823     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1824     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1825
1826     r->substrs = 0;                     /* Useful during FAIL. */
1827     r->startp = 0;                      /* Useful during FAIL. */
1828     r->endp = 0;                        /* Useful during FAIL. */
1829
1830     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1831     if (r->offsets) {
1832       r->offsets[0] = RExC_size; 
1833     }
1834     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1835                           "%s %"UVuf" bytes for offset annotations.\n", 
1836                           r->offsets ? "Got" : "Couldn't get", 
1837                           (UV)((2*RExC_size+1) * sizeof(U32))));
1838
1839     RExC_rx = r;
1840
1841     /* Second pass: emit code. */
1842     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1843     RExC_parse = exp;
1844     RExC_end = xend;
1845     RExC_naughty = 0;
1846     RExC_npar = 1;
1847     RExC_emit_start = r->program;
1848     RExC_emit = r->program;
1849     /* Store the count of eval-groups for security checks: */
1850     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1851     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1852     r->data = 0;
1853     if (reg(pRExC_state, 0, &flags) == NULL)
1854         return(NULL);
1855
1856     /* Dig out information for optimizations. */
1857     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1858     pm->op_pmflags = RExC_flags;
1859     if (UTF)
1860         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1861     r->regstclass = NULL;
1862     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1863         r->reganch |= ROPT_NAUGHTY;
1864     scan = r->program + 1;              /* First BRANCH. */
1865
1866     /* XXXX To minimize changes to RE engine we always allocate
1867        3-units-long substrs field. */
1868     Newz(1004, r->substrs, 1, struct reg_substr_data);
1869
1870     StructCopy(&zero_scan_data, &data, scan_data_t);
1871     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1872     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1873         I32 fake;
1874         STRLEN longest_float_length, longest_fixed_length;
1875         struct regnode_charclass_class ch_class;
1876         int stclass_flag;
1877         I32 last_close = 0;
1878
1879         first = scan;
1880         /* Skip introductions and multiplicators >= 1. */
1881         while ((OP(first) == OPEN && (sawopen = 1)) ||
1882                /* An OR of *one* alternative - should not happen now. */
1883             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1884             (OP(first) == PLUS) ||
1885             (OP(first) == MINMOD) ||
1886                /* An {n,m} with n>0 */
1887             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1888                 if (OP(first) == PLUS)
1889                     sawplus = 1;
1890                 else
1891                     first += regarglen[(U8)OP(first)];
1892                 first = NEXTOPER(first);
1893         }
1894
1895         /* Starting-point info. */
1896       again:
1897         if (PL_regkind[(U8)OP(first)] == EXACT) {
1898             if (OP(first) == EXACT)
1899                 ;       /* Empty, get anchored substr later. */
1900             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1901                 r->regstclass = first;
1902         }
1903         else if (strchr((char*)PL_simple,OP(first)))
1904             r->regstclass = first;
1905         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1906                  PL_regkind[(U8)OP(first)] == NBOUND)
1907             r->regstclass = first;
1908         else if (PL_regkind[(U8)OP(first)] == BOL) {
1909             r->reganch |= (OP(first) == MBOL
1910                            ? ROPT_ANCH_MBOL
1911                            : (OP(first) == SBOL
1912                               ? ROPT_ANCH_SBOL
1913                               : ROPT_ANCH_BOL));
1914             first = NEXTOPER(first);
1915             goto again;
1916         }
1917         else if (OP(first) == GPOS) {
1918             r->reganch |= ROPT_ANCH_GPOS;
1919             first = NEXTOPER(first);
1920             goto again;
1921         }
1922         else if (!sawopen && (OP(first) == STAR &&
1923             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1924             !(r->reganch & ROPT_ANCH) )
1925         {
1926             /* turn .* into ^.* with an implied $*=1 */
1927             int type = OP(NEXTOPER(first));
1928
1929             if (type == REG_ANY)
1930                 type = ROPT_ANCH_MBOL;
1931             else
1932                 type = ROPT_ANCH_SBOL;
1933
1934             r->reganch |= type | ROPT_IMPLICIT;
1935             first = NEXTOPER(first);
1936             goto again;
1937         }
1938         if (sawplus && (!sawopen || !RExC_sawback)
1939             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1940             /* x+ must match at the 1st pos of run of x's */
1941             r->reganch |= ROPT_SKIP;
1942
1943         /* Scan is after the zeroth branch, first is atomic matcher. */
1944         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1945                               (IV)(first - scan + 1)));
1946         /*
1947         * If there's something expensive in the r.e., find the
1948         * longest literal string that must appear and make it the
1949         * regmust.  Resolve ties in favor of later strings, since
1950         * the regstart check works with the beginning of the r.e.
1951         * and avoiding duplication strengthens checking.  Not a
1952         * strong reason, but sufficient in the absence of others.
1953         * [Now we resolve ties in favor of the earlier string if
1954         * it happens that c_offset_min has been invalidated, since the
1955         * earlier string may buy us something the later one won't.]
1956         */
1957         minlen = 0;
1958
1959         data.longest_fixed = newSVpvn("",0);
1960         data.longest_float = newSVpvn("",0);
1961         data.last_found = newSVpvn("",0);
1962         data.longest = &(data.longest_fixed);
1963         first = scan;
1964         if (!r->regstclass) {
1965             cl_init(pRExC_state, &ch_class);
1966             data.start_class = &ch_class;
1967             stclass_flag = SCF_DO_STCLASS_AND;
1968         } else                          /* XXXX Check for BOUND? */
1969             stclass_flag = 0;
1970         data.last_closep = &last_close;
1971
1972         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1973                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1974         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1975              && data.last_start_min == 0 && data.last_end > 0
1976              && !RExC_seen_zerolen
1977              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1978             r->reganch |= ROPT_CHECK_ALL;
1979         scan_commit(pRExC_state, &data);
1980         SvREFCNT_dec(data.last_found);
1981
1982         longest_float_length = CHR_SVLEN(data.longest_float);
1983         if (longest_float_length
1984             || (data.flags & SF_FL_BEFORE_EOL
1985                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1986                     || (RExC_flags & PMf_MULTILINE)))) {
1987             int t;
1988
1989             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1990                 && data.offset_fixed == data.offset_float_min
1991                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1992                     goto remove_float;          /* As in (a)+. */
1993
1994             if (SvUTF8(data.longest_float)) {
1995                 r->float_utf8 = data.longest_float;
1996                 r->float_substr = Nullsv;
1997             } else {
1998                 r->float_substr = data.longest_float;
1999                 r->float_utf8 = Nullsv;
2000             }
2001             r->float_min_offset = data.offset_float_min;
2002             r->float_max_offset = data.offset_float_max;
2003             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2004                        && (!(data.flags & SF_FL_BEFORE_MEOL)
2005                            || (RExC_flags & PMf_MULTILINE)));
2006             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2007         }
2008         else {
2009           remove_float:
2010             r->float_substr = r->float_utf8 = Nullsv;
2011             SvREFCNT_dec(data.longest_float);
2012             longest_float_length = 0;
2013         }
2014
2015         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2016         if (longest_fixed_length
2017             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2018                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2019                     || (RExC_flags & PMf_MULTILINE)))) {
2020             int t;
2021
2022             if (SvUTF8(data.longest_fixed)) {
2023                 r->anchored_utf8 = data.longest_fixed;
2024                 r->anchored_substr = Nullsv;
2025             } else {
2026                 r->anchored_substr = data.longest_fixed;
2027                 r->anchored_utf8 = Nullsv;
2028             }
2029             r->anchored_offset = data.offset_fixed;
2030             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2031                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2032                      || (RExC_flags & PMf_MULTILINE)));
2033             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2034         }
2035         else {
2036             r->anchored_substr = r->anchored_utf8 = Nullsv;
2037             SvREFCNT_dec(data.longest_fixed);
2038             longest_fixed_length = 0;
2039         }
2040         if (r->regstclass
2041             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2042             r->regstclass = NULL;
2043         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2044             && stclass_flag
2045             && !(data.start_class->flags & ANYOF_EOS)
2046             && !cl_is_anything(data.start_class))
2047         {
2048             I32 n = add_data(pRExC_state, 1, "f");
2049
2050             New(1006, RExC_rx->data->data[n], 1,
2051                 struct regnode_charclass_class);
2052             StructCopy(data.start_class,
2053                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2054                        struct regnode_charclass_class);
2055             r->regstclass = (regnode*)RExC_rx->data->data[n];
2056             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2057             PL_regdata = r->data; /* for regprop() */
2058             DEBUG_r({ SV *sv = sv_newmortal();
2059                       regprop(sv, (regnode*)data.start_class);
2060                       PerlIO_printf(Perl_debug_log,
2061                                     "synthetic stclass `%s'.\n",
2062                                     SvPVX(sv));});
2063         }
2064
2065         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2066         if (longest_fixed_length > longest_float_length) {
2067             r->check_substr = r->anchored_substr;
2068             r->check_utf8 = r->anchored_utf8;
2069             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2070             if (r->reganch & ROPT_ANCH_SINGLE)
2071                 r->reganch |= ROPT_NOSCAN;
2072         }
2073         else {
2074             r->check_substr = r->float_substr;
2075             r->check_utf8 = r->float_utf8;
2076             r->check_offset_min = data.offset_float_min;
2077             r->check_offset_max = data.offset_float_max;
2078         }
2079         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2080            This should be changed ASAP!  */
2081         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2082             r->reganch |= RE_USE_INTUIT;
2083             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2084                 r->reganch |= RE_INTUIT_TAIL;
2085         }
2086     }
2087     else {
2088         /* Several toplevels. Best we can is to set minlen. */
2089         I32 fake;
2090         struct regnode_charclass_class ch_class;
2091         I32 last_close = 0;
2092         
2093         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2094         scan = r->program + 1;
2095         cl_init(pRExC_state, &ch_class);
2096         data.start_class = &ch_class;
2097         data.last_closep = &last_close;
2098         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2099         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2100                 = r->float_substr = r->float_utf8 = Nullsv;
2101         if (!(data.start_class->flags & ANYOF_EOS)
2102             && !cl_is_anything(data.start_class))
2103         {
2104             I32 n = add_data(pRExC_state, 1, "f");
2105
2106             New(1006, RExC_rx->data->data[n], 1,
2107                 struct regnode_charclass_class);
2108             StructCopy(data.start_class,
2109                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2110                        struct regnode_charclass_class);
2111             r->regstclass = (regnode*)RExC_rx->data->data[n];
2112             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2113             DEBUG_r({ SV* sv = sv_newmortal();
2114                       regprop(sv, (regnode*)data.start_class);
2115                       PerlIO_printf(Perl_debug_log,
2116                                     "synthetic stclass `%s'.\n",
2117                                     SvPVX(sv));});
2118         }
2119     }
2120
2121     r->minlen = minlen;
2122     if (RExC_seen & REG_SEEN_GPOS)
2123         r->reganch |= ROPT_GPOS_SEEN;
2124     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2125         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2126     if (RExC_seen & REG_SEEN_EVAL)
2127         r->reganch |= ROPT_EVAL_SEEN;
2128     if (RExC_seen & REG_SEEN_CANY)
2129         r->reganch |= ROPT_CANY_SEEN;
2130     Newz(1002, r->startp, RExC_npar, I32);
2131     Newz(1002, r->endp, RExC_npar, I32);
2132     PL_regdata = r->data; /* for regprop() */
2133     DEBUG_r(regdump(r));
2134     return(r);
2135 }
2136
2137 /*
2138  - reg - regular expression, i.e. main body or parenthesized thing
2139  *
2140  * Caller must absorb opening parenthesis.
2141  *
2142  * Combining parenthesis handling with the base level of regular expression
2143  * is a trifle forced, but the need to tie the tails of the branches to what
2144  * follows makes it hard to avoid.
2145  */
2146 STATIC regnode *
2147 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2148     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2149 {
2150     register regnode *ret;              /* Will be the head of the group. */
2151     register regnode *br;
2152     register regnode *lastbr;
2153     register regnode *ender = 0;
2154     register I32 parno = 0;
2155     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2156
2157     /* for (?g), (?gc), and (?o) warnings; warning
2158        about (?c) will warn about (?g) -- japhy    */
2159
2160     I32 wastedflags = 0x00,
2161         wasted_o    = 0x01,
2162         wasted_g    = 0x02,
2163         wasted_gc   = 0x02 | 0x04,
2164         wasted_c    = 0x04;
2165
2166     char * parse_start = RExC_parse; /* MJD */
2167     char *oregcomp_parse = RExC_parse;
2168     char c;
2169
2170     *flagp = 0;                         /* Tentatively. */
2171
2172
2173     /* Make an OPEN node, if parenthesized. */
2174     if (paren) {
2175         if (*RExC_parse == '?') { /* (?...) */
2176             U32 posflags = 0, negflags = 0;
2177             U32 *flagsp = &posflags;
2178             int logical = 0;
2179             char *seqstart = RExC_parse;
2180
2181             RExC_parse++;
2182             paren = *RExC_parse++;
2183             ret = NULL;                 /* For look-ahead/behind. */
2184             switch (paren) {
2185             case '<':           /* (?<...) */
2186                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2187                 if (*RExC_parse == '!')
2188                     paren = ',';
2189                 if (*RExC_parse != '=' && *RExC_parse != '!')
2190                     goto unknown;
2191                 RExC_parse++;
2192             case '=':           /* (?=...) */
2193             case '!':           /* (?!...) */
2194                 RExC_seen_zerolen++;
2195             case ':':           /* (?:...) */
2196             case '>':           /* (?>...) */
2197                 break;
2198             case '$':           /* (?$...) */
2199             case '@':           /* (?@...) */
2200                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2201                 break;
2202             case '#':           /* (?#...) */
2203                 while (*RExC_parse && *RExC_parse != ')')
2204                     RExC_parse++;
2205                 if (*RExC_parse != ')')
2206                     FAIL("Sequence (?#... not terminated");
2207                 nextchar(pRExC_state);
2208                 *flagp = TRYAGAIN;
2209                 return NULL;
2210             case 'p':           /* (?p...) */
2211                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2212                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2213                 /* FALL THROUGH*/
2214             case '?':           /* (??...) */
2215                 logical = 1;
2216                 if (*RExC_parse != '{')
2217                     goto unknown;
2218                 paren = *RExC_parse++;
2219                 /* FALL THROUGH */
2220             case '{':           /* (?{...}) */
2221             {
2222                 I32 count = 1, n = 0;
2223                 char c;
2224                 char *s = RExC_parse;
2225                 SV *sv;
2226                 OP_4tree *sop, *rop;
2227
2228                 RExC_seen_zerolen++;
2229                 RExC_seen |= REG_SEEN_EVAL;
2230                 while (count && (c = *RExC_parse)) {
2231                     if (c == '\\' && RExC_parse[1])
2232                         RExC_parse++;
2233                     else if (c == '{')
2234                         count++;
2235                     else if (c == '}')
2236                         count--;
2237                     RExC_parse++;
2238                 }
2239                 if (*RExC_parse != ')')
2240                 {
2241                     RExC_parse = s;             
2242                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2243                 }
2244                 if (!SIZE_ONLY) {
2245                     PAD *pad;
2246                 
2247                     if (RExC_parse - 1 - s)
2248                         sv = newSVpvn(s, RExC_parse - 1 - s);
2249                     else
2250                         sv = newSVpvn("", 0);
2251
2252                     ENTER;
2253                     Perl_save_re_context(aTHX);
2254                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2255                     sop->op_private |= OPpREFCOUNTED;
2256                     /* re_dup will OpREFCNT_inc */
2257                     OpREFCNT_set(sop, 1);
2258                     LEAVE;
2259
2260                     n = add_data(pRExC_state, 3, "nop");
2261                     RExC_rx->data->data[n] = (void*)rop;
2262                     RExC_rx->data->data[n+1] = (void*)sop;
2263                     RExC_rx->data->data[n+2] = (void*)pad;
2264                     SvREFCNT_dec(sv);
2265                 }
2266                 else {                                          /* First pass */
2267                     if (PL_reginterp_cnt < ++RExC_seen_evals
2268                         && IN_PERL_RUNTIME)
2269                         /* No compiled RE interpolated, has runtime
2270                            components ===> unsafe.  */
2271                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2272                     if (PL_tainting && PL_tainted)
2273                         FAIL("Eval-group in insecure regular expression");
2274                     if (IN_PERL_COMPILETIME)
2275                         PL_cv_has_eval = 1;
2276                 }
2277
2278                 nextchar(pRExC_state);
2279                 if (logical) {
2280                     ret = reg_node(pRExC_state, LOGICAL);
2281                     if (!SIZE_ONLY)
2282                         ret->flags = 2;
2283                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2284                     /* deal with the length of this later - MJD */
2285                     return ret;
2286                 }
2287                 ret = reganode(pRExC_state, EVAL, n);
2288                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2289                 Set_Node_Offset(ret, parse_start);
2290                 return ret;
2291             }
2292             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2293             {
2294                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2295                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2296                         || RExC_parse[1] == '<'
2297                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2298                         I32 flag;
2299                         
2300                         ret = reg_node(pRExC_state, LOGICAL);
2301                         if (!SIZE_ONLY)
2302                             ret->flags = 1;
2303                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2304                         goto insert_if;
2305                     }
2306                 }
2307                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2308                     /* (?(1)...) */
2309                     parno = atoi(RExC_parse++);
2310
2311                     while (isDIGIT(*RExC_parse))
2312                         RExC_parse++;
2313                     ret = reganode(pRExC_state, GROUPP, parno);
2314                     
2315                     if ((c = *nextchar(pRExC_state)) != ')')
2316                         vFAIL("Switch condition not recognized");
2317                   insert_if:
2318                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2319                     br = regbranch(pRExC_state, &flags, 1);
2320                     if (br == NULL)
2321                         br = reganode(pRExC_state, LONGJMP, 0);
2322                     else
2323                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2324                     c = *nextchar(pRExC_state);
2325                     if (flags&HASWIDTH)
2326                         *flagp |= HASWIDTH;
2327                     if (c == '|') {
2328                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2329                         regbranch(pRExC_state, &flags, 1);
2330                         regtail(pRExC_state, ret, lastbr);
2331                         if (flags&HASWIDTH)
2332                             *flagp |= HASWIDTH;
2333                         c = *nextchar(pRExC_state);
2334                     }
2335                     else
2336                         lastbr = NULL;
2337                     if (c != ')')
2338                         vFAIL("Switch (?(condition)... contains too many branches");
2339                     ender = reg_node(pRExC_state, TAIL);
2340                     regtail(pRExC_state, br, ender);
2341                     if (lastbr) {
2342                         regtail(pRExC_state, lastbr, ender);
2343                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2344                     }
2345                     else
2346                         regtail(pRExC_state, ret, ender);
2347                     return ret;
2348                 }
2349                 else {
2350                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2351                 }
2352             }
2353             case 0:
2354                 RExC_parse--; /* for vFAIL to print correctly */
2355                 vFAIL("Sequence (? incomplete");
2356                 break;
2357             default:
2358                 --RExC_parse;
2359               parse_flags:      /* (?i) */
2360                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2361                     /* (?g), (?gc) and (?o) are useless here
2362                        and must be globally applied -- japhy */
2363
2364                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2365                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2366                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2367                             if (! (wastedflags & wflagbit) ) {
2368                                 wastedflags |= wflagbit;
2369                                 vWARN5(
2370                                     RExC_parse + 1,
2371                                     "Useless (%s%c) - %suse /%c modifier",
2372                                     flagsp == &negflags ? "?-" : "?",
2373                                     *RExC_parse,
2374                                     flagsp == &negflags ? "don't " : "",
2375                                     *RExC_parse
2376                                 );
2377                             }
2378                         }
2379                     }
2380                     else if (*RExC_parse == 'c') {
2381                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2382                             if (! (wastedflags & wasted_c) ) {
2383                                 wastedflags |= wasted_gc;
2384                                 vWARN3(
2385                                     RExC_parse + 1,
2386                                     "Useless (%sc) - %suse /gc modifier",
2387                                     flagsp == &negflags ? "?-" : "?",
2388                                     flagsp == &negflags ? "don't " : ""
2389                                 );
2390                             }
2391                         }
2392                     }
2393                     else { pmflag(flagsp, *RExC_parse); }
2394
2395                     ++RExC_parse;
2396                 }
2397                 if (*RExC_parse == '-') {
2398                     flagsp = &negflags;
2399                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2400                     ++RExC_parse;
2401                     goto parse_flags;
2402                 }
2403                 RExC_flags |= posflags;
2404                 RExC_flags &= ~negflags;
2405                 if (*RExC_parse == ':') {
2406                     RExC_parse++;
2407                     paren = ':';
2408                     break;
2409                 }               
2410               unknown:
2411                 if (*RExC_parse != ')') {
2412                     RExC_parse++;
2413                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2414                 }
2415                 nextchar(pRExC_state);
2416                 *flagp = TRYAGAIN;
2417                 return NULL;
2418             }
2419         }
2420         else {                  /* (...) */
2421             parno = RExC_npar;
2422             RExC_npar++;
2423             ret = reganode(pRExC_state, OPEN, parno);
2424             Set_Node_Length(ret, 1); /* MJD */
2425             Set_Node_Offset(ret, RExC_parse); /* MJD */
2426             open = 1;
2427         }
2428     }
2429     else                        /* ! paren */
2430         ret = NULL;
2431
2432     /* Pick up the branches, linking them together. */
2433     parse_start = RExC_parse;   /* MJD */
2434     br = regbranch(pRExC_state, &flags, 1);
2435     /*     branch_len = (paren != 0); */
2436     
2437     if (br == NULL)
2438         return(NULL);
2439     if (*RExC_parse == '|') {
2440         if (!SIZE_ONLY && RExC_extralen) {
2441             reginsert(pRExC_state, BRANCHJ, br);
2442         }
2443         else {                  /* MJD */
2444             reginsert(pRExC_state, BRANCH, br);
2445             Set_Node_Length(br, paren != 0);
2446             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2447         }
2448         have_branch = 1;
2449         if (SIZE_ONLY)
2450             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2451     }
2452     else if (paren == ':') {
2453         *flagp |= flags&SIMPLE;
2454     }
2455     if (open) {                         /* Starts with OPEN. */
2456         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2457     }
2458     else if (paren != '?')              /* Not Conditional */
2459         ret = br;
2460     *flagp |= flags & (SPSTART | HASWIDTH);
2461     lastbr = br;
2462     while (*RExC_parse == '|') {
2463         if (!SIZE_ONLY && RExC_extralen) {
2464             ender = reganode(pRExC_state, LONGJMP,0);
2465             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2466         }
2467         if (SIZE_ONLY)
2468             RExC_extralen += 2;         /* Account for LONGJMP. */
2469         nextchar(pRExC_state);
2470         br = regbranch(pRExC_state, &flags, 0);
2471         
2472         if (br == NULL)
2473             return(NULL);
2474         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2475         lastbr = br;
2476         if (flags&HASWIDTH)
2477             *flagp |= HASWIDTH;
2478         *flagp |= flags&SPSTART;
2479     }
2480
2481     if (have_branch || paren != ':') {
2482         /* Make a closing node, and hook it on the end. */
2483         switch (paren) {
2484         case ':':
2485             ender = reg_node(pRExC_state, TAIL);
2486             break;
2487         case 1:
2488             ender = reganode(pRExC_state, CLOSE, parno);
2489             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2490             Set_Node_Length(ender,1); /* MJD */
2491             break;
2492         case '<':
2493         case ',':
2494         case '=':
2495         case '!':
2496             *flagp &= ~HASWIDTH;
2497             /* FALL THROUGH */
2498         case '>':
2499             ender = reg_node(pRExC_state, SUCCEED);
2500             break;
2501         case 0:
2502             ender = reg_node(pRExC_state, END);
2503             break;
2504         }
2505         regtail(pRExC_state, lastbr, ender);
2506
2507         if (have_branch) {
2508             /* Hook the tails of the branches to the closing node. */
2509             for (br = ret; br != NULL; br = regnext(br)) {
2510                 regoptail(pRExC_state, br, ender);
2511             }
2512         }
2513     }
2514
2515     {
2516         char *p;
2517         static char parens[] = "=!<,>";
2518
2519         if (paren && (p = strchr(parens, paren))) {
2520             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2521             int flag = (p - parens) > 1;
2522
2523             if (paren == '>')
2524                 node = SUSPEND, flag = 0;
2525             reginsert(pRExC_state, node,ret);
2526             Set_Node_Cur_Length(ret);
2527             Set_Node_Offset(ret, parse_start + 1);
2528             ret->flags = flag;
2529             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2530         }
2531     }
2532
2533     /* Check for proper termination. */
2534     if (paren) {
2535         RExC_flags = oregflags;
2536         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2537             RExC_parse = oregcomp_parse;
2538             vFAIL("Unmatched (");
2539         }
2540     }
2541     else if (!paren && RExC_parse < RExC_end) {
2542         if (*RExC_parse == ')') {
2543             RExC_parse++;
2544             vFAIL("Unmatched )");
2545         }
2546         else
2547             FAIL("Junk on end of regexp");      /* "Can't happen". */
2548         /* NOTREACHED */
2549     }
2550
2551     return(ret);
2552 }
2553
2554 /*
2555  - regbranch - one alternative of an | operator
2556  *
2557  * Implements the concatenation operator.
2558  */
2559 STATIC regnode *
2560 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2561 {
2562     register regnode *ret;
2563     register regnode *chain = NULL;
2564     register regnode *latest;
2565     I32 flags = 0, c = 0;
2566
2567     if (first)
2568         ret = NULL;
2569     else {
2570         if (!SIZE_ONLY && RExC_extralen)
2571             ret = reganode(pRExC_state, BRANCHJ,0);
2572         else {
2573             ret = reg_node(pRExC_state, BRANCH);
2574             Set_Node_Length(ret, 1);
2575         }
2576     }
2577         
2578     if (!first && SIZE_ONLY)
2579         RExC_extralen += 1;                     /* BRANCHJ */
2580
2581     *flagp = WORST;                     /* Tentatively. */
2582
2583     RExC_parse--;
2584     nextchar(pRExC_state);
2585     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2586         flags &= ~TRYAGAIN;
2587         latest = regpiece(pRExC_state, &flags);
2588         if (latest == NULL) {
2589             if (flags & TRYAGAIN)
2590                 continue;
2591             return(NULL);
2592         }
2593         else if (ret == NULL)
2594             ret = latest;
2595         *flagp |= flags&HASWIDTH;
2596         if (chain == NULL)      /* First piece. */
2597             *flagp |= flags&SPSTART;
2598         else {
2599             RExC_naughty++;
2600             regtail(pRExC_state, chain, latest);
2601         }
2602         chain = latest;
2603         c++;
2604     }
2605     if (chain == NULL) {        /* Loop ran zero times. */
2606         chain = reg_node(pRExC_state, NOTHING);
2607         if (ret == NULL)
2608             ret = chain;
2609     }
2610     if (c == 1) {
2611         *flagp |= flags&SIMPLE;
2612     }
2613
2614     return(ret);
2615 }
2616
2617 /*
2618  - regpiece - something followed by possible [*+?]
2619  *
2620  * Note that the branching code sequences used for ? and the general cases
2621  * of * and + are somewhat optimized:  they use the same NOTHING node as
2622  * both the endmarker for their branch list and the body of the last branch.
2623  * It might seem that this node could be dispensed with entirely, but the
2624  * endmarker role is not redundant.
2625  */
2626 STATIC regnode *
2627 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2628 {
2629     register regnode *ret;
2630     register char op;
2631     register char *next;
2632     I32 flags;
2633     char *origparse = RExC_parse;
2634     char *maxpos;
2635     I32 min;
2636     I32 max = REG_INFTY;
2637     char *parse_start;
2638
2639     ret = regatom(pRExC_state, &flags);
2640     if (ret == NULL) {
2641         if (flags & TRYAGAIN)
2642             *flagp |= TRYAGAIN;
2643         return(NULL);
2644     }
2645
2646     op = *RExC_parse;
2647
2648     if (op == '{' && regcurly(RExC_parse)) {
2649         parse_start = RExC_parse; /* MJD */
2650         next = RExC_parse + 1;
2651         maxpos = Nullch;
2652         while (isDIGIT(*next) || *next == ',') {
2653             if (*next == ',') {
2654                 if (maxpos)
2655                     break;
2656                 else
2657                     maxpos = next;
2658             }
2659             next++;
2660         }
2661         if (*next == '}') {             /* got one */
2662             if (!maxpos)
2663                 maxpos = next;
2664             RExC_parse++;
2665             min = atoi(RExC_parse);
2666             if (*maxpos == ',')
2667                 maxpos++;
2668             else
2669                 maxpos = RExC_parse;
2670             max = atoi(maxpos);
2671             if (!max && *maxpos != '0')
2672                 max = REG_INFTY;                /* meaning "infinity" */
2673             else if (max >= REG_INFTY)
2674                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2675             RExC_parse = next;
2676             nextchar(pRExC_state);
2677
2678         do_curly:
2679             if ((flags&SIMPLE)) {
2680                 RExC_naughty += 2 + RExC_naughty / 2;
2681                 reginsert(pRExC_state, CURLY, ret);
2682                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2683                 Set_Node_Cur_Length(ret);
2684             }
2685             else {
2686                 regnode *w = reg_node(pRExC_state, WHILEM);
2687
2688                 w->flags = 0;
2689                 regtail(pRExC_state, ret, w);
2690                 if (!SIZE_ONLY && RExC_extralen) {
2691                     reginsert(pRExC_state, LONGJMP,ret);
2692                     reginsert(pRExC_state, NOTHING,ret);
2693                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2694                 }
2695                 reginsert(pRExC_state, CURLYX,ret);
2696                                 /* MJD hk */
2697                 Set_Node_Offset(ret, parse_start+1);
2698                 Set_Node_Length(ret, 
2699                                 op == '{' ? (RExC_parse - parse_start) : 1);
2700                 
2701                 if (!SIZE_ONLY && RExC_extralen)
2702                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2703                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2704                 if (SIZE_ONLY)
2705                     RExC_whilem_seen++, RExC_extralen += 3;
2706                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2707             }
2708             ret->flags = 0;
2709
2710             if (min > 0)
2711                 *flagp = WORST;
2712             if (max > 0)
2713                 *flagp |= HASWIDTH;
2714             if (max && max < min)
2715                 vFAIL("Can't do {n,m} with n > m");
2716             if (!SIZE_ONLY) {
2717                 ARG1_SET(ret, (U16)min);
2718                 ARG2_SET(ret, (U16)max);
2719             }
2720
2721             goto nest_check;
2722         }
2723     }
2724
2725     if (!ISMULT1(op)) {
2726         *flagp = flags;
2727         return(ret);
2728     }
2729
2730 #if 0                           /* Now runtime fix should be reliable. */
2731
2732     /* if this is reinstated, don't forget to put this back into perldiag:
2733
2734             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2735
2736            (F) The part of the regexp subject to either the * or + quantifier
2737            could match an empty string. The {#} shows in the regular
2738            expression about where the problem was discovered.
2739
2740     */
2741
2742     if (!(flags&HASWIDTH) && op != '?')
2743       vFAIL("Regexp *+ operand could be empty");
2744 #endif
2745
2746     parse_start = RExC_parse;
2747     nextchar(pRExC_state);
2748
2749     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2750
2751     if (op == '*' && (flags&SIMPLE)) {
2752         reginsert(pRExC_state, STAR, ret);
2753         ret->flags = 0;
2754         RExC_naughty += 4;
2755     }
2756     else if (op == '*') {
2757         min = 0;
2758         goto do_curly;
2759     }
2760     else if (op == '+' && (flags&SIMPLE)) {
2761         reginsert(pRExC_state, PLUS, ret);
2762         ret->flags = 0;
2763         RExC_naughty += 3;
2764     }
2765     else if (op == '+') {
2766         min = 1;
2767         goto do_curly;
2768     }
2769     else if (op == '?') {
2770         min = 0; max = 1;
2771         goto do_curly;
2772     }
2773   nest_check:
2774     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2775         vWARN3(RExC_parse,
2776                "%.*s matches null string many times",
2777                RExC_parse - origparse,
2778                origparse);
2779     }
2780
2781     if (*RExC_parse == '?') {
2782         nextchar(pRExC_state);
2783         reginsert(pRExC_state, MINMOD, ret);
2784         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2785     }
2786     if (ISMULT2(RExC_parse)) {
2787         RExC_parse++;
2788         vFAIL("Nested quantifiers");
2789     }
2790
2791     return(ret);
2792 }
2793
2794 /*
2795  - regatom - the lowest level
2796  *
2797  * Optimization:  gobbles an entire sequence of ordinary characters so that
2798  * it can turn them into a single node, which is smaller to store and
2799  * faster to run.  Backslashed characters are exceptions, each becoming a
2800  * separate node; the code is simpler that way and it's not worth fixing.
2801  *
2802  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2803 STATIC regnode *
2804 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2805 {
2806     register regnode *ret = 0;
2807     I32 flags;
2808     char *parse_start = RExC_parse;
2809
2810     *flagp = WORST;             /* Tentatively. */
2811
2812 tryagain:
2813     switch (*RExC_parse) {
2814     case '^':
2815         RExC_seen_zerolen++;
2816         nextchar(pRExC_state);
2817         if (RExC_flags & PMf_MULTILINE)
2818             ret = reg_node(pRExC_state, MBOL);
2819         else if (RExC_flags & PMf_SINGLELINE)
2820             ret = reg_node(pRExC_state, SBOL);
2821         else
2822             ret = reg_node(pRExC_state, BOL);
2823         Set_Node_Length(ret, 1); /* MJD */
2824         break;
2825     case '$':
2826         nextchar(pRExC_state);
2827         if (*RExC_parse)
2828             RExC_seen_zerolen++;
2829         if (RExC_flags & PMf_MULTILINE)
2830             ret = reg_node(pRExC_state, MEOL);
2831         else if (RExC_flags & PMf_SINGLELINE)
2832             ret = reg_node(pRExC_state, SEOL);
2833         else
2834             ret = reg_node(pRExC_state, EOL);
2835         Set_Node_Length(ret, 1); /* MJD */
2836         break;
2837     case '.':
2838         nextchar(pRExC_state);
2839         if (RExC_flags & PMf_SINGLELINE)
2840             ret = reg_node(pRExC_state, SANY);
2841         else
2842             ret = reg_node(pRExC_state, REG_ANY);
2843         *flagp |= HASWIDTH|SIMPLE;
2844         RExC_naughty++;
2845         Set_Node_Length(ret, 1); /* MJD */
2846         break;
2847     case '[':
2848     {
2849         char *oregcomp_parse = ++RExC_parse;
2850         ret = regclass(pRExC_state);
2851         if (*RExC_parse != ']') {
2852             RExC_parse = oregcomp_parse;
2853             vFAIL("Unmatched [");
2854         }
2855         nextchar(pRExC_state);
2856         *flagp |= HASWIDTH|SIMPLE;
2857         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2858         break;
2859     }
2860     case '(':
2861         nextchar(pRExC_state);
2862         ret = reg(pRExC_state, 1, &flags);
2863         if (ret == NULL) {
2864                 if (flags & TRYAGAIN) {
2865                     if (RExC_parse == RExC_end) {
2866                          /* Make parent create an empty node if needed. */
2867                         *flagp |= TRYAGAIN;
2868                         return(NULL);
2869                     }
2870                     goto tryagain;
2871                 }
2872                 return(NULL);
2873         }
2874         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2875         break;
2876     case '|':
2877     case ')':
2878         if (flags & TRYAGAIN) {
2879             *flagp |= TRYAGAIN;
2880             return NULL;
2881         }
2882         vFAIL("Internal urp");
2883                                 /* Supposed to be caught earlier. */
2884         break;
2885     case '{':
2886         if (!regcurly(RExC_parse)) {
2887             RExC_parse++;
2888             goto defchar;
2889         }
2890         /* FALL THROUGH */
2891     case '?':
2892     case '+':
2893     case '*':
2894         RExC_parse++;
2895         vFAIL("Quantifier follows nothing");
2896         break;
2897     case '\\':
2898         switch (*++RExC_parse) {
2899         case 'A':
2900             RExC_seen_zerolen++;
2901             ret = reg_node(pRExC_state, SBOL);
2902             *flagp |= SIMPLE;
2903             nextchar(pRExC_state);
2904             Set_Node_Length(ret, 2); /* MJD */
2905             break;
2906         case 'G':
2907             ret = reg_node(pRExC_state, GPOS);
2908             RExC_seen |= REG_SEEN_GPOS;
2909             *flagp |= SIMPLE;
2910             nextchar(pRExC_state);
2911             Set_Node_Length(ret, 2); /* MJD */
2912             break;
2913         case 'Z':
2914             ret = reg_node(pRExC_state, SEOL);
2915             *flagp |= SIMPLE;
2916             RExC_seen_zerolen++;                /* Do not optimize RE away */
2917             nextchar(pRExC_state);
2918             break;
2919         case 'z':
2920             ret = reg_node(pRExC_state, EOS);
2921             *flagp |= SIMPLE;
2922             RExC_seen_zerolen++;                /* Do not optimize RE away */
2923             nextchar(pRExC_state);
2924             Set_Node_Length(ret, 2); /* MJD */
2925             break;
2926         case 'C':
2927             ret = reg_node(pRExC_state, CANY);
2928             RExC_seen |= REG_SEEN_CANY;
2929             *flagp |= HASWIDTH|SIMPLE;
2930             nextchar(pRExC_state);
2931             Set_Node_Length(ret, 2); /* MJD */
2932             break;
2933         case 'X':
2934             ret = reg_node(pRExC_state, CLUMP);
2935             *flagp |= HASWIDTH;
2936             nextchar(pRExC_state);
2937             Set_Node_Length(ret, 2); /* MJD */
2938             break;
2939         case 'w':
2940             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2941             *flagp |= HASWIDTH|SIMPLE;
2942             nextchar(pRExC_state);
2943             Set_Node_Length(ret, 2); /* MJD */
2944             break;
2945         case 'W':
2946             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2947             *flagp |= HASWIDTH|SIMPLE;
2948             nextchar(pRExC_state);
2949             Set_Node_Length(ret, 2); /* MJD */
2950             break;
2951         case 'b':
2952             RExC_seen_zerolen++;
2953             RExC_seen |= REG_SEEN_LOOKBEHIND;
2954             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2955             *flagp |= SIMPLE;
2956             nextchar(pRExC_state);
2957             Set_Node_Length(ret, 2); /* MJD */
2958             break;
2959         case 'B':
2960             RExC_seen_zerolen++;
2961             RExC_seen |= REG_SEEN_LOOKBEHIND;
2962             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2963             *flagp |= SIMPLE;
2964             nextchar(pRExC_state);
2965             Set_Node_Length(ret, 2); /* MJD */
2966             break;
2967         case 's':
2968             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2969             *flagp |= HASWIDTH|SIMPLE;
2970             nextchar(pRExC_state);
2971             Set_Node_Length(ret, 2); /* MJD */
2972             break;
2973         case 'S':
2974             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2975             *flagp |= HASWIDTH|SIMPLE;
2976             nextchar(pRExC_state);
2977             Set_Node_Length(ret, 2); /* MJD */
2978             break;
2979         case 'd':
2980             ret = reg_node(pRExC_state, DIGIT);
2981             *flagp |= HASWIDTH|SIMPLE;
2982             nextchar(pRExC_state);
2983             Set_Node_Length(ret, 2); /* MJD */
2984             break;
2985         case 'D':
2986             ret = reg_node(pRExC_state, NDIGIT);
2987             *flagp |= HASWIDTH|SIMPLE;
2988             nextchar(pRExC_state);
2989             Set_Node_Length(ret, 2); /* MJD */
2990             break;
2991         case 'p':
2992         case 'P':
2993             {   
2994                 char* oldregxend = RExC_end;
2995                 char* parse_start = RExC_parse - 2;
2996
2997                 if (RExC_parse[1] == '{') {
2998                   /* a lovely hack--pretend we saw [\pX] instead */
2999                     RExC_end = strchr(RExC_parse, '}');
3000                     if (!RExC_end) {
3001                         U8 c = (U8)*RExC_parse;
3002                         RExC_parse += 2;
3003                         RExC_end = oldregxend;
3004                         vFAIL2("Missing right brace on \\%c{}", c);
3005                     }
3006                     RExC_end++;
3007                 }
3008                 else {
3009                     RExC_end = RExC_parse + 2;
3010                     if (RExC_end > oldregxend)
3011                         RExC_end = oldregxend;
3012                 }
3013                 RExC_parse--;
3014
3015                 ret = regclass(pRExC_state);
3016
3017                 RExC_end = oldregxend;
3018                 RExC_parse--;
3019
3020                 Set_Node_Offset(ret, parse_start + 2);
3021                 Set_Node_Cur_Length(ret);
3022                 nextchar(pRExC_state);
3023                 *flagp |= HASWIDTH|SIMPLE;
3024             }
3025             break;
3026         case 'n':
3027         case 'r':
3028         case 't':
3029         case 'f':
3030         case 'e':
3031         case 'a':
3032         case 'x':
3033         case 'c':
3034         case '0':
3035             goto defchar;
3036         case '1': case '2': case '3': case '4':
3037         case '5': case '6': case '7': case '8': case '9':
3038             {
3039                 I32 num = atoi(RExC_parse);
3040
3041                 if (num > 9 && num >= RExC_npar)
3042                     goto defchar;
3043                 else {
3044                     char * parse_start = RExC_parse - 1; /* MJD */
3045                     while (isDIGIT(*RExC_parse))
3046                         RExC_parse++;
3047
3048                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3049                         vFAIL("Reference to nonexistent group");
3050                     RExC_sawback = 1;
3051                     ret = reganode(pRExC_state,
3052                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3053                                    num);
3054                     *flagp |= HASWIDTH;
3055                     
3056                     /* override incorrect value set in reganode MJD */
3057                     Set_Node_Offset(ret, parse_start+1); 
3058                     Set_Node_Cur_Length(ret); /* MJD */
3059                     RExC_parse--;
3060                     nextchar(pRExC_state);
3061                 }
3062             }
3063             break;
3064         case '\0':
3065             if (RExC_parse >= RExC_end)
3066                 FAIL("Trailing \\");
3067             /* FALL THROUGH */
3068         default:
3069             /* Do not generate `unrecognized' warnings here, we fall
3070                back into the quick-grab loop below */
3071             parse_start--;
3072             goto defchar;
3073         }
3074         break;
3075
3076     case '#':
3077         if (RExC_flags & PMf_EXTENDED) {
3078             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3079             if (RExC_parse < RExC_end)
3080                 goto tryagain;
3081         }
3082         /* FALL THROUGH */
3083
3084     default: {
3085             register STRLEN len;
3086             register UV ender;
3087             register char *p;
3088             char *oldp, *s;
3089             STRLEN numlen;
3090             STRLEN foldlen;
3091             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3092
3093             parse_start = RExC_parse - 1;
3094
3095             RExC_parse++;
3096
3097         defchar:
3098             ender = 0;
3099             ret = reg_node(pRExC_state,
3100                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3101             s = STRING(ret);
3102             for (len = 0, p = RExC_parse - 1;
3103               len < 127 && p < RExC_end;
3104               len++)
3105             {
3106                 oldp = p;
3107
3108                 if (RExC_flags & PMf_EXTENDED)
3109                     p = regwhite(p, RExC_end);
3110                 switch (*p) {
3111                 case '^':
3112                 case '$':
3113                 case '.':
3114                 case '[':
3115                 case '(':
3116                 case ')':
3117                 case '|':
3118                     goto loopdone;
3119                 case '\\':
3120                     switch (*++p) {
3121                     case 'A':
3122                     case 'C':
3123                     case 'X':
3124                     case 'G':
3125                     case 'Z':
3126                     case 'z':
3127                     case 'w':
3128                     case 'W':
3129                     case 'b':
3130                     case 'B':
3131                     case 's':
3132                     case 'S':
3133                     case 'd':
3134                     case 'D':
3135                     case 'p':
3136                     case 'P':
3137                         --p;
3138                         goto loopdone;
3139                     case 'n':
3140                         ender = '\n';
3141                         p++;
3142                         break;
3143                     case 'r':
3144                         ender = '\r';
3145                         p++;
3146                         break;
3147                     case 't':
3148                         ender = '\t';
3149                         p++;
3150                         break;
3151                     case 'f':
3152                         ender = '\f';
3153                         p++;
3154                         break;
3155                     case 'e':
3156                           ender = ASCII_TO_NATIVE('\033');
3157                         p++;
3158                         break;
3159                     case 'a':
3160                           ender = ASCII_TO_NATIVE('\007');
3161                         p++;
3162                         break;
3163                     case 'x':
3164                         if (*++p == '{') {
3165                             char* e = strchr(p, '}');
3166         
3167                             if (!e) {
3168                                 RExC_parse = p + 1;
3169                                 vFAIL("Missing right brace on \\x{}");
3170                             }
3171                             else {
3172                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3173                                     | PERL_SCAN_DISALLOW_PREFIX;
3174                                 numlen = e - p - 1;
3175                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3176                                 if (ender > 0xff)
3177                                     RExC_utf8 = 1;
3178                                 p = e + 1;
3179                             }
3180                         }
3181                         else {
3182                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3183                             numlen = 2;
3184                             ender = grok_hex(p, &numlen, &flags, NULL);
3185                             p += numlen;
3186                         }
3187                         break;
3188                     case 'c':
3189                         p++;
3190                         ender = UCHARAT(p++);
3191                         ender = toCTRL(ender);
3192                         break;
3193                     case '0': case '1': case '2': case '3':case '4':
3194                     case '5': case '6': case '7': case '8':case '9':
3195                         if (*p == '0' ||
3196                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3197                             I32 flags = 0;
3198                             numlen = 3;
3199                             ender = grok_oct(p, &numlen, &flags, NULL);
3200                             p += numlen;
3201                         }
3202                         else {
3203                             --p;
3204                             goto loopdone;
3205                         }
3206                         break;
3207                     case '\0':
3208                         if (p >= RExC_end)
3209                             FAIL("Trailing \\");
3210                         /* FALL THROUGH */
3211                     default:
3212                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3213                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3214                         goto normal_default;
3215                     }
3216                     break;
3217                 default:
3218                   normal_default:
3219                     if (UTF8_IS_START(*p) && UTF) {
3220                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3221                                                &numlen, 0);
3222                         p += numlen;
3223                     }
3224                     else
3225                         ender = *p++;
3226                     break;
3227                 }
3228                 if (RExC_flags & PMf_EXTENDED)
3229                     p = regwhite(p, RExC_end);
3230                 if (UTF && FOLD) {
3231                     /* Prime the casefolded buffer. */
3232                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3233                 }
3234                 if (ISMULT2(p)) { /* Back off on ?+*. */
3235                     if (len)
3236                         p = oldp;
3237                     else if (UTF) {
3238                          STRLEN unilen;
3239
3240                          if (FOLD) {
3241                               /* Emit all the Unicode characters. */
3242                               for (foldbuf = tmpbuf;
3243                                    foldlen;
3244                                    foldlen -= numlen) {
3245                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3246                                    if (numlen > 0) {
3247                                         reguni(pRExC_state, ender, s, &unilen);
3248                                         s       += unilen;
3249                                         len     += unilen;
3250                                         /* In EBCDIC the numlen
3251                                          * and unilen can differ. */
3252                                         foldbuf += numlen;
3253                                         if (numlen >= foldlen)
3254                                              break;
3255                                    }
3256                                    else
3257                                         break; /* "Can't happen." */
3258                               }
3259                          }
3260                          else {
3261                               reguni(pRExC_state, ender, s, &unilen);
3262                               if (unilen > 0) {
3263                                    s   += unilen;
3264                                    len += unilen;
3265                               }
3266                          }
3267                     }
3268                     else {
3269                         len++;
3270                         REGC((char)ender, s++);
3271                     }
3272                     break;
3273                 }
3274                 if (UTF) {
3275                      STRLEN unilen;
3276
3277                      if (FOLD) {
3278                           /* Emit all the Unicode characters. */
3279                           for (foldbuf = tmpbuf;
3280                                foldlen;
3281                                foldlen -= numlen) {
3282                                ender = utf8_to_uvchr(foldbuf, &numlen);
3283                                if (numlen > 0) {
3284                                     reguni(pRExC_state, ender, s, &unilen);
3285                                     len     += unilen;
3286                                     s       += unilen;
3287                                     /* In EBCDIC the numlen
3288                                      * and unilen can differ. */
3289                                     foldbuf += numlen;
3290                                     if (numlen >= foldlen)
3291                                          break;
3292                                }
3293                                else
3294                                     break;
3295                           }
3296                      }
3297                      else {
3298                           reguni(pRExC_state, ender, s, &unilen);
3299                           if (unilen > 0) {
3300                                s   += unilen;
3301                                len += unilen;
3302                           }
3303                      }
3304                      len--;
3305                 }
3306                 else
3307                     REGC((char)ender, s++);
3308             }
3309         loopdone:
3310             RExC_parse = p - 1;
3311             Set_Node_Cur_Length(ret); /* MJD */
3312             nextchar(pRExC_state);
3313             {
3314                 /* len is STRLEN which is unsigned, need to copy to signed */
3315                 IV iv = len;
3316                 if (iv < 0)
3317                     vFAIL("Internal disaster");
3318             }
3319             if (len > 0)
3320                 *flagp |= HASWIDTH;
3321             if (len == 1 && UNI_IS_INVARIANT(ender))
3322                 *flagp |= SIMPLE;
3323             if (!SIZE_ONLY)
3324                 STR_LEN(ret) = len;
3325             if (SIZE_ONLY)
3326                 RExC_size += STR_SZ(len);
3327             else
3328                 RExC_emit += STR_SZ(len);
3329         }
3330         break;
3331     }
3332
3333     /* If the encoding pragma is in effect recode the text of
3334      * any EXACT-kind nodes. */
3335     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3336         STRLEN oldlen = STR_LEN(ret);
3337         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3338
3339         if (RExC_utf8)
3340             SvUTF8_on(sv);
3341         if (sv_utf8_downgrade(sv, TRUE)) {
3342             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3343             STRLEN newlen = SvCUR(sv);
3344
3345             if (SvUTF8(sv))
3346                 RExC_utf8 = 1;
3347             if (!SIZE_ONLY) {
3348                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3349                                       (int)oldlen, STRING(ret),
3350                                       (int)newlen, s));
3351                 Copy(s, STRING(ret), newlen, char);
3352                 STR_LEN(ret) += newlen - oldlen;
3353                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3354             } else
3355                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3356         }
3357     }
3358
3359     return(ret);
3360 }
3361
3362 STATIC char *
3363 S_regwhite(pTHX_ char *p, char *e)
3364 {
3365     while (p < e) {
3366         if (isSPACE(*p))
3367             ++p;
3368         else if (*p == '#') {
3369             do {
3370                 p++;
3371             } while (p < e && *p != '\n');
3372         }
3373         else
3374             break;
3375     }
3376     return p;
3377 }
3378
3379 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3380    Character classes ([:foo:]) can also be negated ([:^foo:]).
3381    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3382    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3383    but trigger failures because they are currently unimplemented. */
3384
3385 #define POSIXCC_DONE(c)   ((c) == ':')
3386 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3387 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3388
3389 STATIC I32
3390 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3391 {
3392     char *posixcc = 0;
3393     I32 namedclass = OOB_NAMEDCLASS;
3394
3395     if (value == '[' && RExC_parse + 1 < RExC_end &&
3396         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3397         POSIXCC(UCHARAT(RExC_parse))) {
3398         char  c = UCHARAT(RExC_parse);
3399         char* s = RExC_parse++;
3400         
3401         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3402             RExC_parse++;
3403         if (RExC_parse == RExC_end)
3404             /* Grandfather lone [:, [=, [. */
3405             RExC_parse = s;
3406         else {
3407             char* t = RExC_parse++; /* skip over the c */
3408
3409             assert(*t == c);
3410
3411             if (UCHARAT(RExC_parse) == ']') {
3412                 RExC_parse++; /* skip over the ending ] */
3413                 posixcc = s + 1;
3414                 if (*s == ':') {
3415                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3416                     I32 skip = t - posixcc;
3417
3418                     /* Initially switch on the length of the name.  */
3419                     switch (skip) {
3420                     case 4:
3421                         if (memEQ(posixcc, "word", 4)) {
3422                             /* this is not POSIX, this is the Perl \w */;
3423                             namedclass
3424                                 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3425                         }
3426                         break;
3427                     case 5:
3428                         /* Names all of length 5.  */
3429                         /* alnum alpha ascii blank cntrl digit graph lower
3430                            print punct space upper  */
3431                         /* Offset 4 gives the best switch position.  */
3432                         switch (posixcc[4]) {
3433                         case 'a':
3434                             if (memEQ(posixcc, "alph", 4)) {
3435                                 /*                  a     */
3436                                 namedclass
3437                                     = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3438                             }
3439                             break;
3440                         case 'e':
3441                             if (memEQ(posixcc, "spac", 4)) {
3442                                 /*                  e     */
3443                                 namedclass
3444                                     = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3445                             }
3446                             break;
3447                         case 'h':
3448                             if (memEQ(posixcc, "grap", 4)) {
3449                                 /*                  h     */
3450                                 namedclass
3451                                     = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3452                             }
3453                             break;
3454                         case 'i':
3455                             if (memEQ(posixcc, "asci", 4)) {
3456                                 /*                  i     */
3457                                 namedclass
3458                                     = complement ? ANYOF_NASCII : ANYOF_ASCII;
3459                             }
3460                             break;
3461                         case 'k':
3462                             if (memEQ(posixcc, "blan", 4)) {
3463                                 /*                  k     */
3464                                 namedclass
3465                                     = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3466                             }
3467                             break;
3468                         case 'l':
3469                             if (memEQ(posixcc, "cntr", 4)) {
3470                                 /*                  l     */
3471                                 namedclass
3472                                     = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3473                             }
3474                             break;
3475                         case 'm':
3476                             if (memEQ(posixcc, "alnu", 4)) {
3477                                 /*                  m     */
3478                                 namedclass
3479                                     = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3480                             }
3481                             break;
3482                         case 'r':
3483                             if (memEQ(posixcc, "lowe", 4)) {
3484                                 /*                  r     */
3485                                 namedclass
3486                                     = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3487                             }
3488                             if (memEQ(posixcc, "uppe", 4)) {
3489                                 /*                      r     */
3490                                 namedclass
3491                                     = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3492                             }
3493                             break;
3494                         case 't':
3495                             if (memEQ(posixcc, "digi", 4)) {
3496                                 /*                  t     */
3497                                 namedclass
3498                                     = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3499                             }
3500                             if (memEQ(posixcc, "prin", 4)) {
3501                                 /*                      t     */
3502                                 namedclass
3503                                     = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3504                             }
3505                             if (memEQ(posixcc, "punc", 4)) {
3506                                 /*                  t     */
3507                                 namedclass
3508                                     = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3509                             }
3510                             break;
3511                         }
3512                         break;
3513                     case 6:
3514                         if (memEQ(posixcc, "xdigit", 6)) {
3515                             namedclass
3516                                 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3517                         }
3518                         break;
3519                     }
3520
3521                     if (namedclass == OOB_NAMEDCLASS)
3522                     {
3523                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3524                                       t - s - 1, s + 1);
3525                     }
3526                     assert (posixcc[skip] == ':');
3527                     assert (posixcc[skip+1] == ']');
3528                 } else if (!SIZE_ONLY) {
3529                     /* [[=foo=]] and [[.foo.]] are still future. */
3530
3531                     /* adjust RExC_parse so the warning shows after
3532                        the class closes */
3533                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3534                         RExC_parse++;
3535                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3536                 }
3537             } else {
3538                 /* Maternal grandfather:
3539                  * "[:" ending in ":" but not in ":]" */
3540                 RExC_parse = s;
3541             }
3542         }
3543     }
3544
3545     return namedclass;
3546 }
3547
3548 STATIC void
3549 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3550 {
3551     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3552         char *s = RExC_parse;
3553         char  c = *s++;
3554
3555         while(*s && isALNUM(*s))
3556             s++;
3557         if (*s && c == *s && s[1] == ']') {
3558             if (ckWARN(WARN_REGEXP))
3559                 vWARN3(s+2,
3560                         "POSIX syntax [%c %c] belongs inside character classes",
3561                         c, c);
3562
3563             /* [[=foo=]] and [[.foo.]] are still future. */
3564             if (POSIXCC_NOTYET(c)) {
3565                 /* adjust RExC_parse so the error shows after
3566                    the class closes */
3567                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3568                     ;
3569                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3570             }
3571         }
3572     }
3573 }
3574
3575 STATIC regnode *
3576 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3577 {
3578     register UV value;
3579     register UV nextvalue;
3580     register IV prevvalue = OOB_UNICODE;
3581     register IV range = 0;
3582     register regnode *ret;
3583     STRLEN numlen;
3584     IV namedclass;
3585     char *rangebegin = 0;
3586     bool need_class = 0;
3587     SV *listsv = Nullsv;
3588     register char *e;
3589     UV n;
3590     bool optimize_invert   = TRUE;
3591     AV* unicode_alternate  = 0;
3592 #ifdef EBCDIC
3593     UV literal_endpoint = 0;
3594 #endif
3595
3596     ret = reganode(pRExC_state, ANYOF, 0);
3597
3598     if (!SIZE_ONLY)
3599         ANYOF_FLAGS(ret) = 0;
3600
3601     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3602         RExC_naughty++;
3603         RExC_parse++;
3604         if (!SIZE_ONLY)
3605             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3606     }
3607
3608     if (SIZE_ONLY)
3609         RExC_size += ANYOF_SKIP;
3610     else {
3611         RExC_emit += ANYOF_SKIP;
3612         if (FOLD)
3613             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3614         if (LOC)
3615             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3616         ANYOF_BITMAP_ZERO(ret);
3617         listsv = newSVpvn("# comment\n", 10);
3618     }
3619
3620     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3621
3622     if (!SIZE_ONLY && POSIXCC(nextvalue))
3623         checkposixcc(pRExC_state);
3624
3625     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3626     if (UCHARAT(RExC_parse) == ']')
3627         goto charclassloop;
3628
3629     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3630
3631     charclassloop:
3632
3633         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3634
3635         if (!range)
3636             rangebegin = RExC_parse;
3637         if (UTF) {
3638             value = utf8n_to_uvchr((U8*)RExC_parse,
3639                                    RExC_end - RExC_parse,
3640                                    &numlen, 0);
3641             RExC_parse += numlen;
3642         }
3643         else
3644             value = UCHARAT(RExC_parse++);
3645         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3646         if (value == '[' && POSIXCC(nextvalue))
3647             namedclass = regpposixcc(pRExC_state, value);
3648         else if (value == '\\') {
3649             if (UTF) {
3650                 value = utf8n_to_uvchr((U8*)RExC_parse,
3651                                    RExC_end - RExC_parse,
3652                                    &numlen, 0);
3653                 RExC_parse += numlen;
3654             }
3655             else
3656                 value = UCHARAT(RExC_parse++);
3657             /* Some compilers cannot handle switching on 64-bit integer
3658              * values, therefore value cannot be an UV.  Yes, this will
3659              * be a problem later if we want switch on Unicode.
3660              * A similar issue a little bit later when switching on
3661              * namedclass. --jhi */
3662             switch ((I32)value) {
3663             case 'w':   namedclass = ANYOF_ALNUM;       break;
3664             case 'W':   namedclass = ANYOF_NALNUM;      break;
3665             case 's':   namedclass = ANYOF_SPACE;       break;
3666             case 'S':   namedclass = ANYOF_NSPACE;      break;
3667             case 'd':   namedclass = ANYOF_DIGIT;       break;
3668             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3669             case 'p':
3670             case 'P':
3671                 if (RExC_parse >= RExC_end)
3672                     vFAIL2("Empty \\%c{}", (U8)value);
3673                 if (*RExC_parse == '{') {
3674                     U8 c = (U8)value;
3675                     e = strchr(RExC_parse++, '}');
3676                     if (!e)
3677                         vFAIL2("Missing right brace on \\%c{}", c);
3678                     while (isSPACE(UCHARAT(RExC_parse)))
3679                         RExC_parse++;
3680                     if (e == RExC_parse)
3681                         vFAIL2("Empty \\%c{}", c);
3682                     n = e - RExC_parse;
3683                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3684                         n--;
3685                 }
3686                 else {
3687                     e = RExC_parse;
3688                     n = 1;
3689                 }
3690                 if (!SIZE_ONLY) {
3691                     if (UCHARAT(RExC_parse) == '^') {
3692                          RExC_parse++;
3693                          n--;
3694                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3695                          while (isSPACE(UCHARAT(RExC_parse))) {
3696                               RExC_parse++;
3697                               n--;
3698                          }
3699                     }
3700                     if (value == 'p')
3701                          Perl_sv_catpvf(aTHX_ listsv,
3702                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3703                     else
3704                          Perl_sv_catpvf(aTHX_ listsv,
3705                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3706                 }
3707                 RExC_parse = e + 1;
3708                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3709                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
3710                 break;
3711             case 'n':   value = '\n';                   break;
3712             case 'r':   value = '\r';                   break;
3713             case 't':   value = '\t';                   break;
3714             case 'f':   value = '\f';                   break;
3715             case 'b':   value = '\b';                   break;
3716             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3717             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3718             case 'x':
3719                 if (*RExC_parse == '{') {
3720                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3721                         | PERL_SCAN_DISALLOW_PREFIX;
3722                     e = strchr(RExC_parse++, '}');
3723                     if (!e)
3724                         vFAIL("Missing right brace on \\x{}");
3725
3726                     numlen = e - RExC_parse;
3727                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3728                     RExC_parse = e + 1;
3729                 }
3730                 else {
3731                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3732                     numlen = 2;
3733                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3734                     RExC_parse += numlen;
3735                 }
3736                 break;
3737             case 'c':
3738                 value = UCHARAT(RExC_parse++);
3739                 value = toCTRL(value);
3740                 break;
3741             case '0': case '1': case '2': case '3': case '4':
3742             case '5': case '6': case '7': case '8': case '9':
3743             {
3744                 I32 flags = 0;
3745                 numlen = 3;
3746                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3747                 RExC_parse += numlen;
3748                 break;
3749             }
3750             default:
3751                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3752                     vWARN2(RExC_parse,
3753                            "Unrecognized escape \\%c in character class passed through",
3754                            (int)value);
3755                 break;
3756             }
3757         } /* end of \blah */
3758 #ifdef EBCDIC
3759         else
3760             literal_endpoint++;
3761 #endif
3762
3763         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3764
3765             if (!SIZE_ONLY && !need_class)
3766                 ANYOF_CLASS_ZERO(ret);
3767
3768             need_class = 1;
3769
3770             /* a bad range like a-\d, a-[:digit:] ? */
3771             if (range) {
3772                 if (!SIZE_ONLY) {
3773                     if (ckWARN(WARN_REGEXP))
3774                         vWARN4(RExC_parse,
3775                                "False [] range \"%*.*s\"",
3776                                RExC_parse - rangebegin,
3777                                RExC_parse - rangebegin,
3778                                rangebegin);
3779                     if (prevvalue < 256) {
3780                         ANYOF_BITMAP_SET(ret, prevvalue);
3781                         ANYOF_BITMAP_SET(ret, '-');
3782                     }
3783                     else {
3784                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3785                         Perl_sv_catpvf(aTHX_ listsv,
3786                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3787                     }
3788                 }
3789
3790                 range = 0; /* this was not a true range */
3791             }
3792
3793             if (!SIZE_ONLY) {
3794                 const char *what = NULL;
3795                 char yesno = 0;
3796
3797                 if (namedclass > OOB_NAMEDCLASS)
3798                     optimize_invert = FALSE;
3799                 /* Possible truncation here but in some 64-bit environments
3800                  * the compiler gets heartburn about switch on 64-bit values.
3801                  * A similar issue a little earlier when switching on value.
3802                  * --jhi */
3803                 switch ((I32)namedclass) {
3804                 case ANYOF_ALNUM:
3805                     if (LOC)
3806                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3807                     else {
3808                         for (value = 0; value < 256; value++)
3809                             if (isALNUM(value))
3810                                 ANYOF_BITMAP_SET(ret, value);
3811                     }
3812                     yesno = '+';
3813                     what = "Word";      
3814                     break;
3815                 case ANYOF_NALNUM:
3816                     if (LOC)
3817                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3818                     else {
3819                         for (value = 0; value < 256; value++)
3820                             if (!isALNUM(value))
3821                                 ANYOF_BITMAP_SET(ret, value);
3822                     }
3823                     yesno = '!';
3824                     what = "Word";
3825                     break;
3826                 case ANYOF_ALNUMC:
3827                     if (LOC)
3828                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3829                     else {
3830                         for (value = 0; value < 256; value++)
3831                             if (isALNUMC(value))
3832                                 ANYOF_BITMAP_SET(ret, value);
3833                     }
3834                     yesno = '+';
3835                     what = "Alnum";
3836                     break;
3837                 case ANYOF_NALNUMC:
3838                     if (LOC)
3839                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3840                     else {
3841                         for (value = 0; value < 256; value++)
3842                             if (!isALNUMC(value))
3843                                 ANYOF_BITMAP_SET(ret, value);
3844                     }
3845                     yesno = '!';
3846                     what = "Alnum";
3847                     break;
3848                 case ANYOF_ALPHA:
3849                     if (LOC)
3850                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3851                     else {
3852                         for (value = 0; value < 256; value++)
3853                             if (isALPHA(value))
3854                                 ANYOF_BITMAP_SET(ret, value);
3855                     }
3856                     yesno = '+';
3857                     what = "Alpha";
3858                     break;
3859                 case ANYOF_NALPHA:
3860                     if (LOC)
3861                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3862                     else {
3863                         for (value = 0; value < 256; value++)
3864                             if (!isALPHA(value))
3865                                 ANYOF_BITMAP_SET(ret, value);
3866                     }
3867                     yesno = '!';
3868                     what = "Alpha";
3869                     break;
3870                 case ANYOF_ASCII:
3871                     if (LOC)
3872                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3873                     else {
3874 #ifndef EBCDIC
3875                         for (value = 0; value < 128; value++)
3876                             ANYOF_BITMAP_SET(ret, value);
3877 #else  /* EBCDIC */
3878                         for (value = 0; value < 256; value++) {
3879                             if (isASCII(value))
3880                                 ANYOF_BITMAP_SET(ret, value);
3881                         }
3882 #endif /* EBCDIC */
3883                     }
3884                     yesno = '+';
3885                     what = "ASCII";
3886                     break;
3887                 case ANYOF_NASCII:
3888                     if (LOC)
3889                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3890                     else {
3891 #ifndef EBCDIC
3892                         for (value = 128; value < 256; value++)
3893                             ANYOF_BITMAP_SET(ret, value);
3894 #else  /* EBCDIC */
3895                         for (value = 0; value < 256; value++) {
3896                             if (!isASCII(value))
3897                                 ANYOF_BITMAP_SET(ret, value);
3898                         }
3899 #endif /* EBCDIC */
3900                     }
3901                     yesno = '!';
3902                     what = "ASCII";
3903                     break;
3904                 case ANYOF_BLANK:
3905                     if (LOC)
3906                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3907                     else {
3908                         for (value = 0; value < 256; value++)
3909                             if (isBLANK(value))
3910                                 ANYOF_BITMAP_SET(ret, value);
3911                     }
3912                     yesno = '+';
3913                     what = "Blank";
3914                     break;
3915                 case ANYOF_NBLANK:
3916                     if (LOC)
3917                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3918                     else {
3919                         for (value = 0; value < 256; value++)
3920                             if (!isBLANK(value))
3921                                 ANYOF_BITMAP_SET(ret, value);
3922                     }
3923                     yesno = '!';
3924                     what = "Blank";
3925                     break;
3926                 case ANYOF_CNTRL:
3927                     if (LOC)
3928                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3929                     else {
3930                         for (value = 0; value < 256; value++)
3931                             if (isCNTRL(value))
3932                                 ANYOF_BITMAP_SET(ret, value);
3933                     }
3934                     yesno = '+';
3935                     what = "Cntrl";
3936                     break;
3937                 case ANYOF_NCNTRL:
3938                     if (LOC)
3939                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3940                     else {
3941                         for (value = 0; value < 256; value++)
3942                             if (!isCNTRL(value))
3943                                 ANYOF_BITMAP_SET(ret, value);
3944                     }
3945                     yesno = '!';
3946                     what = "Cntrl";
3947                     break;
3948                 case ANYOF_DIGIT:
3949                     if (LOC)
3950                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3951                     else {
3952                         /* consecutive digits assumed */
3953                         for (value = '0'; value <= '9'; value++)
3954                             ANYOF_BITMAP_SET(ret, value);
3955                     }
3956                     yesno = '+';
3957                     what = "Digit";
3958                     break;
3959                 case ANYOF_NDIGIT:
3960                     if (LOC)
3961                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3962                     else {
3963                         /* consecutive digits assumed */
3964                         for (value = 0; value < '0'; value++)
3965                             ANYOF_BITMAP_SET(ret, value);
3966                         for (value = '9' + 1; value < 256; value++)
3967                             ANYOF_BITMAP_SET(ret, value);
3968                     }
3969                     yesno = '!';
3970                     what = "Digit";
3971                     break;
3972                 case ANYOF_GRAPH:
3973                     if (LOC)
3974                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3975                     else {
3976                         for (value = 0; value < 256; value++)
3977                             if (isGRAPH(value))
3978                                 ANYOF_BITMAP_SET(ret, value);
3979                     }
3980                     yesno = '+';
3981                     what = "Graph";
3982                     break;
3983                 case ANYOF_NGRAPH:
3984                     if (LOC)
3985                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3986                     else {
3987                         for (value = 0; value < 256; value++)
3988                             if (!isGRAPH(value))
3989                                 ANYOF_BITMAP_SET(ret, value);
3990                     }
3991                     yesno = '!';
3992                     what = "Graph";
3993                     break;
3994                 case ANYOF_LOWER:
3995                     if (LOC)
3996                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3997                     else {
3998                         for (value = 0; value < 256; value++)
3999                             if (isLOWER(value))
4000                                 ANYOF_BITMAP_SET(ret, value);
4001                     }
4002                     yesno = '+';
4003                     what = "Lower";
4004                     break;
4005                 case ANYOF_NLOWER:
4006                     if (LOC)
4007                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
4008                     else {
4009                         for (value = 0; value < 256; value++)
4010                             if (!isLOWER(value))
4011                                 ANYOF_BITMAP_SET(ret, value);
4012                     }
4013                     yesno = '!';
4014                     what = "Lower";
4015                     break;
4016                 case ANYOF_PRINT:
4017                     if (LOC)
4018                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
4019                     else {
4020                         for (value = 0; value < 256; value++)
4021                             if (isPRINT(value))
4022                                 ANYOF_BITMAP_SET(ret, value);
4023                     }
4024                     yesno = '+';
4025                     what = "Print";
4026                     break;
4027                 case ANYOF_NPRINT:
4028                     if (LOC)
4029                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4030                     else {
4031                         for (value = 0; value < 256; value++)
4032                             if (!isPRINT(value))
4033                                 ANYOF_BITMAP_SET(ret, value);
4034                     }
4035                     yesno = '!';
4036                     what = "Print";
4037                     break;
4038                 case ANYOF_PSXSPC:
4039                     if (LOC)
4040                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4041                     else {
4042                         for (value = 0; value < 256; value++)
4043                             if (isPSXSPC(value))
4044                                 ANYOF_BITMAP_SET(ret, value);
4045                     }
4046                     yesno = '+';
4047                     what = "Space";
4048                     break;
4049                 case ANYOF_NPSXSPC:
4050                     if (LOC)
4051                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4052                     else {
4053                         for (value = 0; value < 256; value++)
4054                             if (!isPSXSPC(value))
4055                                 ANYOF_BITMAP_SET(ret, value);
4056                     }
4057                     yesno = '!';
4058                     what = "Space";
4059                     break;
4060                 case ANYOF_PUNCT:
4061                     if (LOC)
4062                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4063                     else {
4064                         for (value = 0; value < 256; value++)
4065                             if (isPUNCT(value))
4066                                 ANYOF_BITMAP_SET(ret, value);
4067                     }
4068                     yesno = '+';
4069                     what = "Punct";
4070                     break;
4071                 case ANYOF_NPUNCT:
4072                     if (LOC)
4073                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4074                     else {
4075                         for (value = 0; value < 256; value++)
4076                             if (!isPUNCT(value))
4077                                 ANYOF_BITMAP_SET(ret, value);
4078                     }
4079                     yesno = '!';
4080                     what = "Punct";
4081                     break;
4082                 case ANYOF_SPACE:
4083                     if (LOC)
4084                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4085                     else {
4086                         for (value = 0; value < 256; value++)
4087                             if (isSPACE(value))
4088                                 ANYOF_BITMAP_SET(ret, value);
4089                     }
4090                     yesno = '+';
4091                     what = "SpacePerl";
4092                     break;
4093                 case ANYOF_NSPACE:
4094                     if (LOC)
4095                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4096                     else {
4097                         for (value = 0; value < 256; value++)
4098                             if (!isSPACE(value))
4099                                 ANYOF_BITMAP_SET(ret, value);
4100                     }
4101                     yesno = '!';
4102                     what = "SpacePerl";
4103                     break;
4104                 case ANYOF_UPPER:
4105                     if (LOC)
4106                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4107                     else {
4108                         for (value = 0; value < 256; value++)
4109                             if (isUPPER(value))
4110                                 ANYOF_BITMAP_SET(ret, value);
4111                     }
4112                     yesno = '+';
4113                     what = "Upper";
4114                     break;
4115                 case ANYOF_NUPPER:
4116                     if (LOC)
4117                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4118                     else {
4119                         for (value = 0; value < 256; value++)
4120                             if (!isUPPER(value))
4121                                 ANYOF_BITMAP_SET(ret, value);
4122                     }
4123                     yesno = '!';
4124                     what = "Upper";
4125                     break;
4126                 case ANYOF_XDIGIT:
4127                     if (LOC)
4128                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4129                     else {
4130                         for (value = 0; value < 256; value++)
4131                             if (isXDIGIT(value))
4132                                 ANYOF_BITMAP_SET(ret, value);
4133                     }
4134                     yesno = '+';
4135                     what = "XDigit";
4136                     break;
4137                 case ANYOF_NXDIGIT:
4138                     if (LOC)
4139                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4140                     else {
4141                         for (value = 0; value < 256; value++)
4142                             if (!isXDIGIT(value))
4143                                 ANYOF_BITMAP_SET(ret, value);
4144                     }
4145                     yesno = '!';
4146                     what = "XDigit";
4147                     break;
4148                 case ANYOF_MAX:
4149                     /* this is to handle \p and \P */
4150                     break;
4151                 default:
4152                     vFAIL("Invalid [::] class");
4153                     break;
4154                 }
4155                 if (what) {
4156                     /* Strings such as "+utf8::isWord\n" */
4157                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4158                 }
4159                 if (LOC)
4160                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4161                 continue;
4162             }
4163         } /* end of namedclass \blah */
4164
4165         if (range) {
4166             if (prevvalue > (IV)value) /* b-a */ {
4167                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4168                               RExC_parse - rangebegin,
4169                               RExC_parse - rangebegin,
4170                               rangebegin);
4171                 range = 0; /* not a valid range */
4172             }
4173         }
4174         else {
4175             prevvalue = value; /* save the beginning of the range */
4176             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4177                 RExC_parse[1] != ']') {
4178                 RExC_parse++;
4179
4180                 /* a bad range like \w-, [:word:]- ? */
4181                 if (namedclass > OOB_NAMEDCLASS) {
4182                     if (ckWARN(WARN_REGEXP))
4183                         vWARN4(RExC_parse,
4184                                "False [] range \"%*.*s\"",
4185                                RExC_parse - rangebegin,
4186                                RExC_parse - rangebegin,
4187                                rangebegin);
4188                     if (!SIZE_ONLY)
4189                         ANYOF_BITMAP_SET(ret, '-');
4190                 } else
4191                     range = 1;  /* yeah, it's a range! */
4192                 continue;       /* but do it the next time */
4193             }
4194         }
4195
4196         /* now is the next time */
4197         if (!SIZE_ONLY) {
4198             IV i;
4199
4200             if (prevvalue < 256) {
4201                 IV ceilvalue = value < 256 ? value : 255;
4202
4203 #ifdef EBCDIC
4204                 /* In EBCDIC [\x89-\x91] should include
4205                  * the \x8e but [i-j] should not. */
4206                 if (literal_endpoint == 2 &&
4207                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4208                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4209                 {
4210                     if (isLOWER(prevvalue)) {
4211                         for (i = prevvalue; i <= ceilvalue; i++)
4212                             if (isLOWER(i))
4213                                 ANYOF_BITMAP_SET(ret, i);
4214                     } else {
4215                         for (i = prevvalue; i <= ceilvalue; i++)
4216                             if (isUPPER(i))
4217                                 ANYOF_BITMAP_SET(ret, i);
4218                     }
4219                 }
4220                 else
4221 #endif
4222                       for (i = prevvalue; i <= ceilvalue; i++)
4223                           ANYOF_BITMAP_SET(ret, i);
4224           }
4225           if (value > 255 || UTF) {
4226                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4227                 UV natvalue      = NATIVE_TO_UNI(value);
4228
4229                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4230                 if (prevnatvalue < natvalue) { /* what about > ? */
4231                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4232                                    prevnatvalue, natvalue);
4233                 }
4234                 else if (prevnatvalue == natvalue) {
4235                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4236                     if (FOLD) {
4237                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4238                          STRLEN foldlen;
4239                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4240
4241                          /* If folding and foldable and a single
4242                           * character, insert also the folded version
4243                           * to the charclass. */
4244                          if (f != value) {
4245                               if (foldlen == (STRLEN)UNISKIP(f))
4246                                   Perl_sv_catpvf(aTHX_ listsv,
4247                                                  "%04"UVxf"\n", f);
4248                               else {
4249                                   /* Any multicharacter foldings
4250                                    * require the following transform:
4251                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4252                                    * where E folds into "pq" and F folds
4253                                    * into "rst", all other characters
4254                                    * fold to single characters.  We save
4255                                    * away these multicharacter foldings,
4256                                    * to be later saved as part of the
4257                                    * additional "s" data. */
4258                                   SV *sv;
4259
4260                                   if (!unicode_alternate)
4261                                       unicode_alternate = newAV();
4262                                   sv = newSVpvn((char*)foldbuf, foldlen);
4263                                   SvUTF8_on(sv);
4264                                   av_push(unicode_alternate, sv);
4265                               }
4266                          }
4267
4268                          /* If folding and the value is one of the Greek
4269                           * sigmas insert a few more sigmas to make the
4270                           * folding rules of the sigmas to work right.
4271                           * Note that not all the possible combinations
4272                           * are handled here: some of them are handled
4273                           * by the standard folding rules, and some of
4274                           * them (literal or EXACTF cases) are handled
4275                           * during runtime in regexec.c:S_find_byclass(). */
4276                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4277                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4278                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4279                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4280                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4281                          }
4282                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4283                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4284                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4285                     }
4286                 }
4287             }
4288 #ifdef EBCDIC
4289             literal_endpoint = 0;
4290 #endif
4291         }
4292
4293         range = 0; /* this range (if it was one) is done now */
4294     }
4295
4296     if (need_class) {
4297         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4298         if (SIZE_ONLY)
4299             RExC_size += ANYOF_CLASS_ADD_SKIP;
4300         else
4301             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4302     }
4303
4304     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4305     if (!SIZE_ONLY &&
4306          /* If the only flag is folding (plus possibly inversion). */
4307         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4308        ) {
4309         for (value = 0; value < 256; ++value) {
4310             if (ANYOF_BITMAP_TEST(ret, value)) {
4311                 UV fold = PL_fold[value];
4312
4313                 if (fold != value)
4314                     ANYOF_BITMAP_SET(ret, fold);
4315             }
4316         }
4317         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4318     }
4319
4320     /* optimize inverted simple patterns (e.g. [^a-z]) */
4321     if (!SIZE_ONLY && optimize_invert &&
4322         /* If the only flag is inversion. */
4323         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4324         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4325             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4326         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4327     }
4328
4329     if (!SIZE_ONLY) {
4330         AV *av = newAV();
4331         SV *rv;
4332
4333         /* The 0th element stores the character class description
4334          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4335          * to initialize the appropriate swash (which gets stored in
4336          * the 1st element), and also useful for dumping the regnode.
4337          * The 2nd element stores the multicharacter foldings,
4338          * used later (regexec.c:S_reginclass()). */
4339         av_store(av, 0, listsv);
4340         av_store(av, 1, NULL);
4341         av_store(av, 2, (SV*)unicode_alternate);
4342         rv = newRV_noinc((SV*)av);
4343         n = add_data(pRExC_state, 1, "s");
4344         RExC_rx->data->data[n] = (void*)rv;
4345         ARG_SET(ret, n);
4346     }
4347
4348     return ret;
4349 }
4350
4351 STATIC char*
4352 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4353 {
4354     char* retval = RExC_parse++;
4355
4356     for (;;) {
4357         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4358                 RExC_parse[2] == '#') {
4359             while (*RExC_parse != ')') {
4360                 if (RExC_parse == RExC_end)
4361                     FAIL("Sequence (?#... not terminated");
4362                 RExC_parse++;
4363             }
4364             RExC_parse++;
4365             continue;
4366         }
4367         if (RExC_flags & PMf_EXTENDED) {
4368             if (isSPACE(*RExC_parse)) {
4369                 RExC_parse++;
4370                 continue;
4371             }
4372             else if (*RExC_parse == '#') {
4373                 while (RExC_parse < RExC_end)
4374                     if (*RExC_parse++ == '\n') break;
4375                 continue;
4376             }
4377         }
4378         return retval;
4379     }
4380 }
4381
4382 /*
4383 - reg_node - emit a node
4384 */
4385 STATIC regnode *                        /* Location. */
4386 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4387 {
4388     register regnode *ret;
4389     register regnode *ptr;
4390
4391     ret = RExC_emit;
4392     if (SIZE_ONLY) {
4393         SIZE_ALIGN(RExC_size);
4394         RExC_size += 1;
4395         return(ret);
4396     }
4397
4398     NODE_ALIGN_FILL(ret);
4399     ptr = ret;
4400     FILL_ADVANCE_NODE(ptr, op);
4401     if (RExC_offsets) {         /* MJD */
4402         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4403               "reg_node", __LINE__, 
4404               reg_name[op],
4405               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4406               ? "Overwriting end of array!\n" : "OK",
4407               RExC_emit - RExC_emit_start,
4408               RExC_parse - RExC_start,
4409               RExC_offsets[0])); 
4410         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4411     }
4412             
4413     RExC_emit = ptr;
4414
4415     return(ret);
4416 }
4417
4418 /*
4419 - reganode - emit a node with an argument
4420 */
4421 STATIC regnode *                        /* Location. */
4422 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4423 {
4424     register regnode *ret;
4425     register regnode *ptr;
4426
4427     ret = RExC_emit;
4428     if (SIZE_ONLY) {
4429         SIZE_ALIGN(RExC_size);
4430         RExC_size += 2;
4431         return(ret);
4432     }
4433
4434     NODE_ALIGN_FILL(ret);
4435     ptr = ret;
4436     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4437     if (RExC_offsets) {         /* MJD */
4438         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4439               "reganode",
4440               __LINE__,
4441               reg_name[op],
4442               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4443               "Overwriting end of array!\n" : "OK",
4444               RExC_emit - RExC_emit_start,
4445               RExC_parse - RExC_start,
4446               RExC_offsets[0])); 
4447         Set_Cur_Node_Offset;
4448     }
4449             
4450     RExC_emit = ptr;
4451
4452     return(ret);
4453 }
4454
4455 /*
4456 - reguni - emit (if appropriate) a Unicode character
4457 */
4458 STATIC void
4459 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4460 {
4461     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4462 }
4463
4464 /*
4465 - reginsert - insert an operator in front of already-emitted operand
4466 *
4467 * Means relocating the operand.
4468 */
4469 STATIC void
4470 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4471 {
4472     register regnode *src;
4473     register regnode *dst;
4474     register regnode *place;
4475     register int offset = regarglen[(U8)op];
4476
4477 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4478
4479     if (SIZE_ONLY) {
4480         RExC_size += NODE_STEP_REGNODE + offset;
4481         return;
4482     }
4483
4484     src = RExC_emit;
4485     RExC_emit += NODE_STEP_REGNODE + offset;
4486     dst = RExC_emit;
4487     while (src > opnd) {
4488         StructCopy(--src, --dst, regnode);
4489         if (RExC_offsets) {     /* MJD 20010112 */
4490             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4491                   "reg_insert",
4492                   __LINE__,
4493                   reg_name[op],
4494                   dst - RExC_emit_start > RExC_offsets[0] 
4495                   ? "Overwriting end of array!\n" : "OK",
4496                   src - RExC_emit_start,
4497                   dst - RExC_emit_start,
4498                   RExC_offsets[0])); 
4499             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4500             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4501         }
4502     }
4503     
4504
4505     place = opnd;               /* Op node, where operand used to be. */
4506     if (RExC_offsets) {         /* MJD */
4507         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4508               "reginsert",
4509               __LINE__,
4510               reg_name[op],
4511               place - RExC_emit_start > RExC_offsets[0] 
4512               ? "Overwriting end of array!\n" : "OK",
4513               place - RExC_emit_start,
4514               RExC_parse - RExC_start,
4515               RExC_offsets[0])); 
4516         Set_Node_Offset(place, RExC_parse);
4517         Set_Node_Length(place, 1);
4518     }
4519     src = NEXTOPER(place);
4520     FILL_ADVANCE_NODE(place, op);
4521     Zero(src, offset, regnode);
4522 }
4523
4524 /*
4525 - regtail - set the next-pointer at the end of a node chain of p to val.
4526 */
4527 STATIC void
4528 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4529 {
4530     register regnode *scan;
4531     register regnode *temp;
4532
4533     if (SIZE_ONLY)
4534         return;
4535
4536     /* Find last node. */
4537     scan = p;
4538     for (;;) {
4539         temp = regnext(scan);
4540         if (temp == NULL)
4541             break;
4542         scan = temp;
4543     }
4544
4545     if (reg_off_by_arg[OP(scan)]) {
4546         ARG_SET(scan, val - scan);
4547     }
4548     else {
4549         NEXT_OFF(scan) = val - scan;
4550     }
4551 }
4552
4553 /*
4554 - regoptail - regtail on operand of first argument; nop if operandless
4555 */
4556 STATIC void
4557 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4558 {
4559     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4560     if (p == NULL || SIZE_ONLY)
4561         return;
4562     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4563         regtail(pRExC_state, NEXTOPER(p), val);
4564     }
4565     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4566         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4567     }
4568     else
4569         return;
4570 }
4571
4572 /*
4573  - regcurly - a little FSA that accepts {\d+,?\d*}
4574  */
4575 STATIC I32
4576 S_regcurly(pTHX_ register char *s)
4577 {
4578     if (*s++ != '{')
4579         return FALSE;
4580     if (!isDIGIT(*s))
4581         return FALSE;
4582     while (isDIGIT(*s))
4583         s++;
4584     if (*s == ',')
4585         s++;
4586     while (isDIGIT(*s))
4587         s++;
4588     if (*s != '}')
4589         return FALSE;
4590     return TRUE;
4591 }
4592
4593
4594 #ifdef DEBUGGING
4595
4596 STATIC regnode *
4597 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4598 {
4599     register U8 op = EXACT;     /* Arbitrary non-END op. */
4600     register regnode *next;
4601
4602     while (op != END && (!last || node < last)) {
4603         /* While that wasn't END last time... */
4604
4605         NODE_ALIGN(node);
4606         op = OP(node);
4607         if (op == CLOSE)
4608             l--;        
4609         next = regnext(node);
4610         /* Where, what. */
4611         if (OP(node) == OPTIMIZED)
4612             goto after_print;
4613         regprop(sv, node);
4614         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4615                       (int)(2*l + 1), "", SvPVX(sv));
4616         if (next == NULL)               /* Next ptr. */
4617             PerlIO_printf(Perl_debug_log, "(0)");
4618         else
4619             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4620         (void)PerlIO_putc(Perl_debug_log, '\n');
4621       after_print:
4622         if (PL_regkind[(U8)op] == BRANCHJ) {
4623             register regnode *nnode = (OP(next) == LONGJMP
4624                                        ? regnext(next)
4625                                        : next);
4626             if (last && nnode > last)
4627                 nnode = last;
4628             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4629         }
4630         else if (PL_regkind[(U8)op] == BRANCH) {
4631             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4632         }
4633         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4634             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4635                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4636         }
4637         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4638             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4639                              next, sv, l + 1);
4640         }
4641         else if ( op == PLUS || op == STAR) {
4642             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4643         }
4644         else if (op == ANYOF) {
4645             /* arglen 1 + class block */
4646             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4647                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4648             node = NEXTOPER(node);
4649         }
4650         else if (PL_regkind[(U8)op] == EXACT) {
4651             /* Literal string, where present. */
4652             node += NODE_SZ_STR(node) - 1;
4653             node = NEXTOPER(node);
4654         }
4655         else {
4656             node = NEXTOPER(node);
4657             node += regarglen[(U8)op];
4658         }
4659         if (op == CURLYX || op == OPEN)
4660             l++;
4661         else if (op == WHILEM)
4662             l--;
4663     }
4664     return node;
4665 }
4666
4667 #endif  /* DEBUGGING */
4668
4669 /*
4670  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4671  */
4672 void
4673 Perl_regdump(pTHX_ regexp *r)
4674 {
4675 #ifdef DEBUGGING
4676     SV *sv = sv_newmortal();
4677
4678     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4679
4680     /* Header fields of interest. */
4681     if (r->anchored_substr)
4682         PerlIO_printf(Perl_debug_log,
4683                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4684                       PL_colors[0],
4685                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4686                       SvPVX(r->anchored_substr),
4687                       PL_colors[1],
4688                       SvTAIL(r->anchored_substr) ? "$" : "",
4689                       (IV)r->anchored_offset);
4690     else if (r->anchored_utf8)
4691         PerlIO_printf(Perl_debug_log,
4692                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4693                       PL_colors[0],
4694                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4695                       SvPVX(r->anchored_utf8),
4696                       PL_colors[1],
4697                       SvTAIL(r->anchored_utf8) ? "$" : "",
4698                       (IV)r->anchored_offset);
4699     if (r->float_substr)
4700         PerlIO_printf(Perl_debug_log,
4701                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4702                       PL_colors[0],
4703                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4704                       SvPVX(r->float_substr),
4705                       PL_colors[1],
4706                       SvTAIL(r->float_substr) ? "$" : "",
4707                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4708     else if (r->float_utf8)
4709         PerlIO_printf(Perl_debug_log,
4710                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4711                       PL_colors[0],
4712                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4713                       SvPVX(r->float_utf8),
4714                       PL_colors[1],
4715                       SvTAIL(r->float_utf8) ? "$" : "",
4716                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4717     if (r->check_substr || r->check_utf8)
4718         PerlIO_printf(Perl_debug_log,
4719                       r->check_substr == r->float_substr
4720                       && r->check_utf8 == r->float_utf8
4721                       ? "(checking floating" : "(checking anchored");
4722     if (r->reganch & ROPT_NOSCAN)
4723         PerlIO_printf(Perl_debug_log, " noscan");
4724     if (r->reganch & ROPT_CHECK_ALL)
4725         PerlIO_printf(Perl_debug_log, " isall");
4726     if (r->check_substr || r->check_utf8)
4727         PerlIO_printf(Perl_debug_log, ") ");
4728
4729     if (r->regstclass) {
4730         regprop(sv, r->regstclass);
4731         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4732     }
4733     if (r->reganch & ROPT_ANCH) {
4734         PerlIO_printf(Perl_debug_log, "anchored");
4735         if (r->reganch & ROPT_ANCH_BOL)
4736             PerlIO_printf(Perl_debug_log, "(BOL)");
4737         if (r->reganch & ROPT_ANCH_MBOL)
4738             PerlIO_printf(Perl_debug_log, "(MBOL)");
4739         if (r->reganch & ROPT_ANCH_SBOL)
4740             PerlIO_printf(Perl_debug_log, "(SBOL)");
4741         if (r->reganch & ROPT_ANCH_GPOS)
4742             PerlIO_printf(Perl_debug_log, "(GPOS)");
4743         PerlIO_putc(Perl_debug_log, ' ');
4744     }
4745     if (r->reganch & ROPT_GPOS_SEEN)
4746         PerlIO_printf(Perl_debug_log, "GPOS ");
4747     if (r->reganch & ROPT_SKIP)
4748         PerlIO_printf(Perl_debug_log, "plus ");
4749     if (r->reganch & ROPT_IMPLICIT)
4750         PerlIO_printf(Perl_debug_log, "implicit ");
4751     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4752     if (r->reganch & ROPT_EVAL_SEEN)
4753         PerlIO_printf(Perl_debug_log, "with eval ");
4754     PerlIO_printf(Perl_debug_log, "\n");
4755     if (r->offsets) {
4756       U32 i;
4757       U32 len = r->offsets[0];
4758       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4759       for (i = 1; i <= len; i++)
4760         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4761                       (UV)r->offsets[i*2-1], 
4762                       (UV)r->offsets[i*2]);
4763       PerlIO_printf(Perl_debug_log, "\n");
4764     }
4765 #endif  /* DEBUGGING */
4766 }
4767
4768 #ifdef DEBUGGING
4769
4770 STATIC void
4771 S_put_byte(pTHX_ SV *sv, int c)
4772 {
4773     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4774         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4775     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4776         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4777     else
4778         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4779 }
4780
4781 #endif  /* DEBUGGING */
4782
4783 /*
4784 - regprop - printable representation of opcode
4785 */
4786 void
4787 Perl_regprop(pTHX_ SV *sv, regnode *o)
4788 {
4789 #ifdef DEBUGGING
4790     register int k;
4791
4792     sv_setpvn(sv, "", 0);
4793     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4794         /* It would be nice to FAIL() here, but this may be called from
4795            regexec.c, and it would be hard to supply pRExC_state. */
4796         Perl_croak(aTHX_ "Corrupted regexp opcode");
4797     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4798
4799     k = PL_regkind[(U8)OP(o)];
4800
4801     if (k == EXACT) {
4802         SV *dsv = sv_2mortal(newSVpvn("", 0));
4803         /* Using is_utf8_string() is a crude hack but it may
4804          * be the best for now since we have no flag "this EXACTish
4805          * node was UTF-8" --jhi */
4806         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4807         char *s    = do_utf8 ?
4808           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4809                          UNI_DISPLAY_REGEX) :
4810           STRING(o);
4811         int len = do_utf8 ?
4812           strlen(s) :
4813           STR_LEN(o);
4814         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4815                        PL_colors[0],
4816                        len, s,
4817                        PL_colors[1]);
4818     }
4819     else if (k == CURLY) {
4820         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4821             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4822         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4823     }
4824     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4825         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4826     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4827         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4828     else if (k == LOGICAL)
4829         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4830     else if (k == ANYOF) {
4831         int i, rangestart = -1;
4832         U8 flags = ANYOF_FLAGS(o);
4833         const char * const anyofs[] = { /* Should be synchronized with
4834                                          * ANYOF_ #xdefines in regcomp.h */
4835             "\\w",
4836             "\\W",
4837             "\\s",
4838             "\\S",
4839             "\\d",
4840             "\\D",
4841             "[:alnum:]",
4842             "[:^alnum:]",
4843             "[:alpha:]",
4844             "[:^alpha:]",
4845             "[:ascii:]",
4846             "[:^ascii:]",
4847             "[:ctrl:]",
4848             "[:^ctrl:]",
4849             "[:graph:]",
4850             "[:^graph:]",
4851             "[:lower:]",
4852             "[:^lower:]",
4853             "[:print:]",
4854             "[:^print:]",
4855             "[:punct:]",
4856             "[:^punct:]",
4857             "[:upper:]",
4858             "[:^upper:]",
4859             "[:xdigit:]",
4860             "[:^xdigit:]",
4861             "[:space:]",
4862             "[:^space:]",
4863             "[:blank:]",
4864             "[:^blank:]"
4865         };
4866
4867         if (flags & ANYOF_LOCALE)
4868             sv_catpv(sv, "{loc}");
4869         if (flags & ANYOF_FOLD)
4870             sv_catpv(sv, "{i}");
4871         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4872         if (flags & ANYOF_INVERT)
4873             sv_catpv(sv, "^");
4874         for (i = 0; i <= 256; i++) {
4875             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4876                 if (rangestart == -1)
4877                     rangestart = i;
4878             } else if (rangestart != -1) {
4879                 if (i <= rangestart + 3)
4880                     for (; rangestart < i; rangestart++)
4881                         put_byte(sv, rangestart);
4882                 else {
4883                     put_byte(sv, rangestart);
4884                     sv_catpv(sv, "-");
4885                     put_byte(sv, i - 1);
4886                 }
4887                 rangestart = -1;
4888             }
4889         }
4890
4891         if (o->flags & ANYOF_CLASS)
4892             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4893                 if (ANYOF_CLASS_TEST(o,i))
4894                     sv_catpv(sv, anyofs[i]);
4895
4896         if (flags & ANYOF_UNICODE)
4897             sv_catpv(sv, "{unicode}");
4898         else if (flags & ANYOF_UNICODE_ALL)
4899             sv_catpv(sv, "{unicode_all}");
4900
4901         {
4902             SV *lv;
4903             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4904         
4905             if (lv) {
4906                 if (sw) {
4907                     U8 s[UTF8_MAXBYTES_CASE+1];
4908                 
4909                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4910                         U8 *e = uvchr_to_utf8(s, i);
4911                         
4912                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4913                             if (rangestart == -1)
4914                                 rangestart = i;
4915                         } else if (rangestart != -1) {
4916                             U8 *p;
4917                         
4918                             if (i <= rangestart + 3)
4919                                 for (; rangestart < i; rangestart++) {
4920                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4921                                         put_byte(sv, *p);
4922                                 }
4923                             else {
4924                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4925                                     put_byte(sv, *p);
4926                                 sv_catpv(sv, "-");
4927                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4928                                         put_byte(sv, *p);
4929                                 }
4930                                 rangestart = -1;
4931                             }
4932                         }
4933                         
4934                     sv_catpv(sv, "..."); /* et cetera */
4935                 }
4936
4937                 {
4938                     char *s = savesvpv(lv);
4939                     char *origs = s;
4940                 
4941                     while(*s && *s != '\n') s++;
4942                 
4943                     if (*s == '\n') {
4944                         char *t = ++s;
4945                         
4946                         while (*s) {
4947                             if (*s == '\n')
4948                                 *s = ' ';
4949                             s++;
4950                         }
4951                         if (s[-1] == ' ')
4952                             s[-1] = 0;
4953                         
4954                         sv_catpv(sv, t);
4955                     }
4956                 
4957                     Safefree(origs);
4958                 }
4959             }
4960         }
4961
4962         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4963     }
4964     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4965         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4966 #endif  /* DEBUGGING */
4967 }
4968
4969 SV *
4970 Perl_re_intuit_string(pTHX_ regexp *prog)
4971 {                               /* Assume that RE_INTUIT is set */
4972     DEBUG_r(
4973         {   STRLEN n_a;
4974             char *s = SvPV(prog->check_substr
4975                       ? prog->check_substr : prog->check_utf8, n_a);
4976
4977             if (!PL_colorset) reginitcolors();
4978             PerlIO_printf(Perl_debug_log,
4979                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4980                       PL_colors[4],
4981                       prog->check_substr ? "" : "utf8 ",
4982                       PL_colors[5],PL_colors[0],
4983                       s,
4984                       PL_colors[1],
4985                       (strlen(s) > 60 ? "..." : ""));
4986         } );
4987
4988     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4989 }
4990
4991 void
4992 Perl_pregfree(pTHX_ struct regexp *r)
4993 {
4994 #ifdef DEBUGGING
4995     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4996 #endif
4997
4998     if (!r || (--r->refcnt > 0))
4999         return;
5000     DEBUG_r({
5001          int len;
5002          char *s;
5003
5004          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
5005                 r->prelen, 60, UNI_DISPLAY_REGEX)
5006             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
5007          len = SvCUR(dsv);
5008          if (!PL_colorset)
5009               reginitcolors();
5010          PerlIO_printf(Perl_debug_log,
5011                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
5012                        PL_colors[4],PL_colors[5],PL_colors[0],
5013                        len, len, s,
5014                        PL_colors[1],
5015                        len > 60 ? "..." : "");
5016     });
5017
5018     if (r->precomp)
5019         Safefree(r->precomp);
5020     if (r->offsets)             /* 20010421 MJD */
5021         Safefree(r->offsets);
5022     RX_MATCH_COPY_FREE(r);
5023 #ifdef PERL_COPY_ON_WRITE
5024     if (r->saved_copy)
5025         SvREFCNT_dec(r->saved_copy);
5026 #endif
5027     if (r->substrs) {
5028         if (r->anchored_substr)
5029             SvREFCNT_dec(r->anchored_substr);
5030         if (r->anchored_utf8)
5031             SvREFCNT_dec(r->anchored_utf8);
5032         if (r->float_substr)
5033             SvREFCNT_dec(r->float_substr);
5034         if (r->float_utf8)
5035             SvREFCNT_dec(r->float_utf8);
5036         Safefree(r->substrs);
5037     }
5038     if (r->data) {
5039         int n = r->data->count;
5040         PAD* new_comppad = NULL;
5041         PAD* old_comppad;
5042         PADOFFSET refcnt;
5043
5044         while (--n >= 0) {
5045           /* If you add a ->what type here, update the comment in regcomp.h */
5046             switch (r->data->what[n]) {
5047             case 's':
5048                 SvREFCNT_dec((SV*)r->data->data[n]);
5049                 break;
5050             case 'f':
5051                 Safefree(r->data->data[n]);
5052                 break;
5053             case 'p':
5054                 new_comppad = (AV*)r->data->data[n];
5055                 break;
5056             case 'o':
5057                 if (new_comppad == NULL)
5058                     Perl_croak(aTHX_ "panic: pregfree comppad");
5059                 PAD_SAVE_LOCAL(old_comppad,
5060                     /* Watch out for global destruction's random ordering. */
5061                     (SvTYPE(new_comppad) == SVt_PVAV) ?
5062                                 new_comppad : Null(PAD *)
5063                 );
5064                 OP_REFCNT_LOCK;
5065                 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
5066                 OP_REFCNT_UNLOCK;
5067                 if (!refcnt)
5068                     op_free((OP_4tree*)r->data->data[n]);
5069
5070                 PAD_RESTORE_LOCAL(old_comppad);
5071                 SvREFCNT_dec((SV*)new_comppad);
5072                 new_comppad = NULL;
5073                 break;
5074             case 'n':
5075                 break;
5076             default:
5077                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
5078             }
5079         }
5080         Safefree(r->data->what);
5081         Safefree(r->data);
5082     }
5083     Safefree(r->startp);
5084     Safefree(r->endp);
5085     Safefree(r);
5086 }
5087
5088 /*
5089  - regnext - dig the "next" pointer out of a node
5090  *
5091  * [Note, when REGALIGN is defined there are two places in regmatch()
5092  * that bypass this code for speed.]
5093  */
5094 regnode *
5095 Perl_regnext(pTHX_ register regnode *p)
5096 {
5097     register I32 offset;
5098
5099     if (p == &PL_regdummy)
5100         return(NULL);
5101
5102     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5103     if (offset == 0)
5104         return(NULL);
5105
5106     return(p+offset);
5107 }
5108
5109 STATIC void     
5110 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5111 {
5112     va_list args;
5113     STRLEN l1 = strlen(pat1);
5114     STRLEN l2 = strlen(pat2);
5115     char buf[512];
5116     SV *msv;
5117     char *message;
5118
5119     if (l1 > 510)
5120         l1 = 510;
5121     if (l1 + l2 > 510)
5122         l2 = 510 - l1;
5123     Copy(pat1, buf, l1 , char);
5124     Copy(pat2, buf + l1, l2 , char);
5125     buf[l1 + l2] = '\n';
5126     buf[l1 + l2 + 1] = '\0';
5127 #ifdef I_STDARG
5128     /* ANSI variant takes additional second argument */
5129     va_start(args, pat2);
5130 #else
5131     va_start(args);
5132 #endif
5133     msv = vmess(buf, &args);
5134     va_end(args);
5135     message = SvPV(msv,l1);
5136     if (l1 > 512)
5137         l1 = 512;
5138     Copy(message, buf, l1 , char);
5139     buf[l1-1] = '\0';                   /* Overwrite \n */
5140     Perl_croak(aTHX_ "%s", buf);
5141 }
5142
5143 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5144
5145 void
5146 Perl_save_re_context(pTHX)
5147 {
5148     SAVEI32(PL_reg_flags);              /* from regexec.c */
5149     SAVEPPTR(PL_bostr);
5150     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5151     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5152     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5153     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5154     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5155     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5156     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5157     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5158     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5159     PL_reg_start_tmp = 0;
5160     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5161     PL_reg_start_tmpl = 0;
5162     SAVEVPTR(PL_regdata);
5163     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5164     SAVEI32(PL_regnarrate);             /* from regexec.c */
5165     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5166     SAVEINT(PL_regindent);              /* from regexec.c */
5167     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5168     SAVEVPTR(PL_curcop);
5169     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5170     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5171     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5172     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5173     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5174     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5175     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5176     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5177     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5178     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5179     PL_reg_oldsaved = Nullch;
5180     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5181     PL_reg_oldsavedlen = 0;
5182 #ifdef PERL_COPY_ON_WRITE
5183     SAVESPTR(PL_nrs);
5184     PL_nrs = Nullsv;
5185 #endif
5186     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5187     PL_reg_maxiter = 0;
5188     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5189     PL_reg_leftiter = 0;
5190     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5191     PL_reg_poscache = Nullch;
5192     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5193     PL_reg_poscache_size = 0;
5194     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5195     SAVEI32(PL_regnpar);                /* () count. */
5196     SAVEI32(PL_regsize);                /* from regexec.c */
5197
5198     {
5199         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5200         U32 i;
5201         GV *mgv;
5202         REGEXP *rx;
5203         char digits[TYPE_CHARS(long)];
5204
5205         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5206             for (i = 1; i <= rx->nparens; i++) {
5207                 sprintf(digits, "%lu", (long)i);
5208                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5209                     save_scalar(mgv);
5210             }
5211         }
5212     }
5213
5214 #ifdef DEBUGGING
5215     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5216 #endif
5217 }
5218
5219 static void
5220 clear_re(pTHX_ void *r)
5221 {
5222     ReREFCNT_dec((regexp *)r);
5223 }
5224
5225 /*
5226  * Local variables:
5227  * c-indentation-style: bsd
5228  * c-basic-offset: 4
5229  * indent-tabs-mode: t
5230  * End:
5231  *
5232  * vim: shiftwidth=4:
5233 */