Fix for "UTF-8 bug with s///" from Hugo.
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U16         flags16;                /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags16    (pRExC_state->flags16)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
397     } STMT_END                                                               \
398
399
400 #define vWARN2(loc, m, a1)                                                   \
401     STMT_START {                                                             \
402         IV offset = loc - RExC_precomp;          \
403         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
404                  a1,                                                         \
405                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
406     } STMT_END
407
408 #define vWARN3(loc, m, a1, a2)                                               \
409     STMT_START {                                                             \
410       IV offset = loc - RExC_precomp;        \
411         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                    \
412                  a1, a2,                                                     \
413                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
414     } STMT_END
415
416 #define vWARN4(loc, m, a1, a2, a3)                                           \
417     STMT_START {                                                             \
418       IV offset = loc - RExC_precomp;            \
419         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
420                  a1, a2, a3,                                                 \
421                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
422     } STMT_END
423
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
426   STMT_START {                                                   \
427       IV offset = loc - RExC_precomp;   \
428         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
429                  a1, a2, a3, a4,                                 \
430                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
431     } STMT_END
432
433
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
436
437 /* Macros for recording node offsets.   20001227 mjd@plover.com 
438  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
439  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
440  * Element 0 holds the number n.
441  */
442
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447 #  define Set_Node_Offset_To_R(node,byte)                           \
448    STMT_START {                                        \
449      if (! SIZE_ONLY) {                                  \
450        if((node) < 0) {                   \
451          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452        } else {                                                        \
453          RExC_offsets[2*(node)-1] = (byte);                               \
454        }                                                               \
455      }                                                                 \
456    } STMT_END
457
458 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461 #  define Set_Node_Length_To_R(node,len)                            \
462    STMT_START {                                        \
463      if (! SIZE_ONLY) {                                  \
464        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465        if((node) < 0) {                   \
466          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467        } else {                                                        \
468          RExC_offsets[2*(node)] = (len);                               \
469        }                                                               \
470      }                                                                 \
471    } STMT_END
472
473 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
475 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
476
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
481 static void clear_re(pTHX_ void *r);
482
483 /* Mark that we cannot extend a found fixed substring at this point.
484    Updata the longest found anchored substring and the longest found
485    floating substrings if needed. */
486
487 STATIC void
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
489 {
490     STRLEN l = CHR_SVLEN(data->last_found);
491     STRLEN old_l = CHR_SVLEN(*data->longest);
492
493     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494         sv_setsv(*data->longest, data->last_found);
495         if (*data->longest == data->longest_fixed) {
496             data->offset_fixed = l ? data->last_start_min : data->pos_min;
497             if (data->flags & SF_BEFORE_EOL)
498                 data->flags
499                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500             else
501                 data->flags &= ~SF_FIX_BEFORE_EOL;
502         }
503         else {
504             data->offset_float_min = l ? data->last_start_min : data->pos_min;
505             data->offset_float_max = (l
506                                       ? data->last_start_max
507                                       : data->pos_min + data->pos_delta);
508             if (data->flags & SF_BEFORE_EOL)
509                 data->flags
510                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
511             else
512                 data->flags &= ~SF_FL_BEFORE_EOL;
513         }
514     }
515     SvCUR_set(data->last_found, 0);
516     data->last_end = -1;
517     data->flags &= ~SF_BEFORE_EOL;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC void
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
523 {
524     ANYOF_CLASS_ZERO(cl);
525     ANYOF_BITMAP_SETALL(cl);
526     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
527     if (LOC)
528         cl->flags |= ANYOF_LOCALE;
529 }
530
531 /* Can match anything (initialization) */
532 STATIC int
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534 {
535     int value;
536
537     for (value = 0; value <= ANYOF_MAX; value += 2)
538         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539             return 1;
540     if (!(cl->flags & ANYOF_UNICODE_ALL))
541         return 0;
542     if (!ANYOF_BITMAP_TESTALLSET(cl))
543         return 0;
544     return 1;
545 }
546
547 /* Can match anything (initialization) */
548 STATIC void
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
550 {
551     Zero(cl, 1, struct regnode_charclass_class);
552     cl->type = ANYOF;
553     cl_anything(pRExC_state, cl);
554 }
555
556 STATIC void
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
558 {
559     Zero(cl, 1, struct regnode_charclass_class);
560     cl->type = ANYOF;
561     cl_anything(pRExC_state, cl);
562     if (LOC)
563         cl->flags |= ANYOF_LOCALE;
564 }
565
566 /* 'And' a given class with another one.  Can create false positives */
567 /* We assume that cl is not inverted */
568 STATIC void
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570          struct regnode_charclass_class *and_with)
571 {
572     if (!(and_with->flags & ANYOF_CLASS)
573         && !(cl->flags & ANYOF_CLASS)
574         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575         && !(and_with->flags & ANYOF_FOLD)
576         && !(cl->flags & ANYOF_FOLD)) {
577         int i;
578
579         if (and_with->flags & ANYOF_INVERT)
580             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581                 cl->bitmap[i] &= ~and_with->bitmap[i];
582         else
583             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584                 cl->bitmap[i] &= and_with->bitmap[i];
585     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586     if (!(and_with->flags & ANYOF_EOS))
587         cl->flags &= ~ANYOF_EOS;
588
589     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590         cl->flags &= ~ANYOF_UNICODE_ALL;
591         cl->flags |= ANYOF_UNICODE;
592         ARG_SET(cl, ARG(and_with));
593     }
594     if (!(and_with->flags & ANYOF_UNICODE_ALL))
595         cl->flags &= ~ANYOF_UNICODE_ALL;
596     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597         cl->flags &= ~ANYOF_UNICODE;
598 }
599
600 /* 'OR' a given class with another one.  Can create false positives */
601 /* We assume that cl is not inverted */
602 STATIC void
603 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
604 {
605     if (or_with->flags & ANYOF_INVERT) {
606         /* We do not use
607          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608          *   <= (B1 | !B2) | (CL1 | !CL2)
609          * which is wasteful if CL2 is small, but we ignore CL2:
610          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611          * XXXX Can we handle case-fold?  Unclear:
612          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
614          */
615         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616              && !(or_with->flags & ANYOF_FOLD)
617              && !(cl->flags & ANYOF_FOLD) ) {
618             int i;
619
620             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621                 cl->bitmap[i] |= ~or_with->bitmap[i];
622         } /* XXXX: logic is complicated otherwise */
623         else {
624             cl_anything(pRExC_state, cl);
625         }
626     } else {
627         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
629              && (!(or_with->flags & ANYOF_FOLD)
630                  || (cl->flags & ANYOF_FOLD)) ) {
631             int i;
632
633             /* OR char bitmap and class bitmap separately */
634             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635                 cl->bitmap[i] |= or_with->bitmap[i];
636             if (or_with->flags & ANYOF_CLASS) {
637                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638                     cl->classflags[i] |= or_with->classflags[i];
639                 cl->flags |= ANYOF_CLASS;
640             }
641         }
642         else { /* XXXX: logic is complicated, leave it along for a moment. */
643             cl_anything(pRExC_state, cl);
644         }
645     }
646     if (or_with->flags & ANYOF_EOS)
647         cl->flags |= ANYOF_EOS;
648
649     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650         ARG(cl) != ARG(or_with)) {
651         cl->flags |= ANYOF_UNICODE_ALL;
652         cl->flags &= ~ANYOF_UNICODE;
653     }
654     if (or_with->flags & ANYOF_UNICODE_ALL) {
655         cl->flags |= ANYOF_UNICODE_ALL;
656         cl->flags &= ~ANYOF_UNICODE;
657     }
658 }
659
660 /*
661  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662  * These need to be revisited when a newer toolchain becomes available.
663  */
664 #if defined(__sparc64__) && defined(__GNUC__)
665 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666 #       undef  SPARC64_GCC_WORKAROUND
667 #       define SPARC64_GCC_WORKAROUND 1
668 #   endif
669 #endif
670
671 /* REx optimizer.  Converts nodes into quickier variants "in place".
672    Finds fixed substrings.  */
673
674 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675    to the position after last scanned or to NULL. */
676
677 STATIC I32
678 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
679                         /* scanp: Start here (read-write). */
680                         /* deltap: Write maxlen-minlen here. */
681                         /* last: Stop before this one. */
682 {
683     I32 min = 0, pars = 0, code;
684     regnode *scan = *scanp, *next;
685     I32 delta = 0;
686     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
687     int is_inf_internal = 0;            /* The studied chunk is infinite */
688     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689     scan_data_t data_fake;
690     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
691
692     while (scan && OP(scan) != END && scan < last) {
693         /* Peephole optimizer: */
694
695         if (PL_regkind[(U8)OP(scan)] == EXACT) {
696             /* Merge several consecutive EXACTish nodes into one. */
697             regnode *n = regnext(scan);
698             U32 stringok = 1;
699 #ifdef DEBUGGING
700             regnode *stop = scan;
701 #endif
702
703             next = scan + NODE_SZ_STR(scan);
704             /* Skip NOTHING, merge EXACT*. */
705             while (n &&
706                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
707                      (stringok && (OP(n) == OP(scan))))
708                    && NEXT_OFF(n)
709                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710                 if (OP(n) == TAIL || n > next)
711                     stringok = 0;
712                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
713                     NEXT_OFF(scan) += NEXT_OFF(n);
714                     next = n + NODE_STEP_REGNODE;
715 #ifdef DEBUGGING
716                     if (stringok)
717                         stop = n;
718 #endif
719                     n = regnext(n);
720                 }
721                 else if (stringok) {
722                     int oldl = STR_LEN(scan);
723                     regnode *nnext = regnext(n);
724
725                     if (oldl + STR_LEN(n) > U8_MAX)
726                         break;
727                     NEXT_OFF(scan) += NEXT_OFF(n);
728                     STR_LEN(scan) += STR_LEN(n);
729                     next = n + NODE_SZ_STR(n);
730                     /* Now we can overwrite *n : */
731                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
732 #ifdef DEBUGGING
733                     stop = next - 1;
734 #endif
735                     n = nnext;
736                 }
737             }
738
739             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
740 /*
741   Two problematic code points in Unicode casefolding of EXACT nodes:
742
743    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
745
746    which casefold to
747
748    Unicode                      UTF-8
749
750    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
751    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
752
753    This means that in case-insensitive matching (or "loose matching",
754    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755    length of the above casefolded versions) can match a target string
756    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757    This would rather mess up the minimum length computation.
758
759    What we'll do is to look for the tail four bytes, and then peek
760    at the preceding two bytes to see whether we need to decrease
761    the minimum length by four (six minus two).
762
763    Thanks to the design of UTF-8, there cannot be false matches:
764    A sequence of valid UTF-8 bytes cannot be a subsequence of
765    another valid sequence of UTF-8 bytes.
766
767 */
768                  char *s0 = STRING(scan), *s, *t;
769                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770                  char *t0 = "\xcc\x88\xcc\x81";
771                  char *t1 = t0 + 3;
772                  
773                  for (s = s0 + 2;
774                       s < s2 && (t = ninstr(s, s1, t0, t1));
775                       s = t + 4) {
776                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
778                            min -= 4;
779                  }
780             }
781
782 #ifdef DEBUGGING
783             /* Allow dumping */
784             n = scan + NODE_SZ_STR(scan);
785             while (n <= stop) {
786                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
787                     OP(n) = OPTIMIZED;
788                     NEXT_OFF(n) = 0;
789                 }
790                 n++;
791             }
792 #endif
793         }
794         /* Follow the next-chain of the current node and optimize
795            away all the NOTHINGs from it.  */
796         if (OP(scan) != CURLYX) {
797             int max = (reg_off_by_arg[OP(scan)]
798                        ? I32_MAX
799                        /* I32 may be smaller than U16 on CRAYs! */
800                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
801             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
802             int noff;
803             regnode *n = scan;
804         
805             /* Skip NOTHING and LONGJMP. */
806             while ((n = regnext(n))
807                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
808                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
809                    && off + noff < max)
810                 off += noff;
811             if (reg_off_by_arg[OP(scan)])
812                 ARG(scan) = off;
813             else
814                 NEXT_OFF(scan) = off;
815         }
816         /* The principal pseudo-switch.  Cannot be a switch, since we
817            look into several different things.  */
818         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
819                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820             next = regnext(scan);
821             code = OP(scan);
822         
823             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
824                 I32 max1 = 0, min1 = I32_MAX, num = 0;
825                 struct regnode_charclass_class accum;
826                 
827                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
828                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
829                 if (flags & SCF_DO_STCLASS)
830                     cl_init_zero(pRExC_state, &accum);
831                 while (OP(scan) == code) {
832                     I32 deltanext, minnext, f = 0, fake;
833                     struct regnode_charclass_class this_class;
834
835                     num++;
836                     data_fake.flags = 0;
837                     if (data) {         
838                         data_fake.whilem_c = data->whilem_c;
839                         data_fake.last_closep = data->last_closep;
840                     }
841                     else
842                         data_fake.last_closep = &fake;
843                     next = regnext(scan);
844                     scan = NEXTOPER(scan);
845                     if (code != BRANCH)
846                         scan = NEXTOPER(scan);
847                     if (flags & SCF_DO_STCLASS) {
848                         cl_init(pRExC_state, &this_class);
849                         data_fake.start_class = &this_class;
850                         f = SCF_DO_STCLASS_AND;
851                     }           
852                     if (flags & SCF_WHILEM_VISITED_POS)
853                         f |= SCF_WHILEM_VISITED_POS;
854                     /* we suppose the run is continuous, last=next...*/
855                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
856                                           next, &data_fake, f);
857                     if (min1 > minnext)
858                         min1 = minnext;
859                     if (max1 < minnext + deltanext)
860                         max1 = minnext + deltanext;
861                     if (deltanext == I32_MAX)
862                         is_inf = is_inf_internal = 1;
863                     scan = next;
864                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
865                         pars++;
866                     if (data && (data_fake.flags & SF_HAS_EVAL))
867                         data->flags |= SF_HAS_EVAL;
868                     if (data)
869                         data->whilem_c = data_fake.whilem_c;
870                     if (flags & SCF_DO_STCLASS)
871                         cl_or(pRExC_state, &accum, &this_class);
872                     if (code == SUSPEND)
873                         break;
874                 }
875                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
876                     min1 = 0;
877                 if (flags & SCF_DO_SUBSTR) {
878                     data->pos_min += min1;
879                     data->pos_delta += max1 - min1;
880                     if (max1 != min1 || is_inf)
881                         data->longest = &(data->longest_float);
882                 }
883                 min += min1;
884                 delta += max1 - min1;
885                 if (flags & SCF_DO_STCLASS_OR) {
886                     cl_or(pRExC_state, data->start_class, &accum);
887                     if (min1) {
888                         cl_and(data->start_class, &and_with);
889                         flags &= ~SCF_DO_STCLASS;
890                     }
891                 }
892                 else if (flags & SCF_DO_STCLASS_AND) {
893                     if (min1) {
894                         cl_and(data->start_class, &accum);
895                         flags &= ~SCF_DO_STCLASS;
896                     }
897                     else {
898                         /* Switch to OR mode: cache the old value of
899                          * data->start_class */
900                         StructCopy(data->start_class, &and_with,
901                                    struct regnode_charclass_class);
902                         flags &= ~SCF_DO_STCLASS_AND;
903                         StructCopy(&accum, data->start_class,
904                                    struct regnode_charclass_class);
905                         flags |= SCF_DO_STCLASS_OR;
906                         data->start_class->flags |= ANYOF_EOS;
907                     }
908                 }
909             }
910             else if (code == BRANCHJ)   /* single branch is optimized. */
911                 scan = NEXTOPER(NEXTOPER(scan));
912             else                        /* single branch is optimized. */
913                 scan = NEXTOPER(scan);
914             continue;
915         }
916         else if (OP(scan) == EXACT) {
917             I32 l = STR_LEN(scan);
918             UV uc = *((U8*)STRING(scan));
919             if (UTF) {
920                 U8 *s = (U8*)STRING(scan);
921                 l = utf8_length(s, s + l);
922                 uc = utf8_to_uvchr(s, NULL);
923             }
924             min += l;
925             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
926                 /* The code below prefers earlier match for fixed
927                    offset, later match for variable offset.  */
928                 if (data->last_end == -1) { /* Update the start info. */
929                     data->last_start_min = data->pos_min;
930                     data->last_start_max = is_inf
931                         ? I32_MAX : data->pos_min + data->pos_delta;
932                 }
933                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
934                 if (UTF)
935                     SvUTF8_on(data->last_found);
936                 data->last_end = data->pos_min + l;
937                 data->pos_min += l; /* As in the first entry. */
938                 data->flags &= ~SF_BEFORE_EOL;
939             }
940             if (flags & SCF_DO_STCLASS_AND) {
941                 /* Check whether it is compatible with what we know already! */
942                 int compat = 1;
943
944                 if (uc >= 0x100 ||
945                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
946                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
947                     && (!(data->start_class->flags & ANYOF_FOLD)
948                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
949                     )
950                     compat = 0;
951                 ANYOF_CLASS_ZERO(data->start_class);
952                 ANYOF_BITMAP_ZERO(data->start_class);
953                 if (compat)
954                     ANYOF_BITMAP_SET(data->start_class, uc);
955                 data->start_class->flags &= ~ANYOF_EOS;
956                 if (uc < 0x100)
957                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
958             }
959             else if (flags & SCF_DO_STCLASS_OR) {
960                 /* false positive possible if the class is case-folded */
961                 if (uc < 0x100)
962                     ANYOF_BITMAP_SET(data->start_class, uc);
963                 else
964                     data->start_class->flags |= ANYOF_UNICODE_ALL;
965                 data->start_class->flags &= ~ANYOF_EOS;
966                 cl_and(data->start_class, &and_with);
967             }
968             flags &= ~SCF_DO_STCLASS;
969         }
970         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
971             I32 l = STR_LEN(scan);
972             UV uc = *((U8*)STRING(scan));
973
974             /* Search for fixed substrings supports EXACT only. */
975             if (flags & SCF_DO_SUBSTR)
976                 scan_commit(pRExC_state, data);
977             if (UTF) {
978                 U8 *s = (U8 *)STRING(scan);
979                 l = utf8_length(s, s + l);
980                 uc = utf8_to_uvchr(s, NULL);
981             }
982             min += l;
983             if (data && (flags & SCF_DO_SUBSTR))
984                 data->pos_min += l;
985             if (flags & SCF_DO_STCLASS_AND) {
986                 /* Check whether it is compatible with what we know already! */
987                 int compat = 1;
988
989                 if (uc >= 0x100 ||
990                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
991                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
992                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
993                     compat = 0;
994                 ANYOF_CLASS_ZERO(data->start_class);
995                 ANYOF_BITMAP_ZERO(data->start_class);
996                 if (compat) {
997                     ANYOF_BITMAP_SET(data->start_class, uc);
998                     data->start_class->flags &= ~ANYOF_EOS;
999                     data->start_class->flags |= ANYOF_FOLD;
1000                     if (OP(scan) == EXACTFL)
1001                         data->start_class->flags |= ANYOF_LOCALE;
1002                 }
1003             }
1004             else if (flags & SCF_DO_STCLASS_OR) {
1005                 if (data->start_class->flags & ANYOF_FOLD) {
1006                     /* false positive possible if the class is case-folded.
1007                        Assume that the locale settings are the same... */
1008                     if (uc < 0x100)
1009                         ANYOF_BITMAP_SET(data->start_class, uc);
1010                     data->start_class->flags &= ~ANYOF_EOS;
1011                 }
1012                 cl_and(data->start_class, &and_with);
1013             }
1014             flags &= ~SCF_DO_STCLASS;
1015         }
1016         else if (strchr((char*)PL_varies,OP(scan))) {
1017             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1018             I32 f = flags, pos_before = 0;
1019             regnode *oscan = scan;
1020             struct regnode_charclass_class this_class;
1021             struct regnode_charclass_class *oclass = NULL;
1022             I32 next_is_eval = 0;
1023
1024             switch (PL_regkind[(U8)OP(scan)]) {
1025             case WHILEM:                /* End of (?:...)* . */
1026                 scan = NEXTOPER(scan);
1027                 goto finish;
1028             case PLUS:
1029                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1030                     next = NEXTOPER(scan);
1031                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1032                         mincount = 1;
1033                         maxcount = REG_INFTY;
1034                         next = regnext(scan);
1035                         scan = NEXTOPER(scan);
1036                         goto do_curly;
1037                     }
1038                 }
1039                 if (flags & SCF_DO_SUBSTR)
1040                     data->pos_min++;
1041                 min++;
1042                 /* Fall through. */
1043             case STAR:
1044                 if (flags & SCF_DO_STCLASS) {
1045                     mincount = 0;
1046                     maxcount = REG_INFTY;
1047                     next = regnext(scan);
1048                     scan = NEXTOPER(scan);
1049                     goto do_curly;
1050                 }
1051                 is_inf = is_inf_internal = 1;
1052                 scan = regnext(scan);
1053                 if (flags & SCF_DO_SUBSTR) {
1054                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1055                     data->longest = &(data->longest_float);
1056                 }
1057                 goto optimize_curly_tail;
1058             case CURLY:
1059                 mincount = ARG1(scan);
1060                 maxcount = ARG2(scan);
1061                 next = regnext(scan);
1062                 if (OP(scan) == CURLYX) {
1063                     I32 lp = (data ? *(data->last_closep) : 0);
1064
1065                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1066                 }
1067                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1068                 next_is_eval = (OP(scan) == EVAL);
1069               do_curly:
1070                 if (flags & SCF_DO_SUBSTR) {
1071                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1072                     pos_before = data->pos_min;
1073                 }
1074                 if (data) {
1075                     fl = data->flags;
1076                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1077                     if (is_inf)
1078                         data->flags |= SF_IS_INF;
1079                 }
1080                 if (flags & SCF_DO_STCLASS) {
1081                     cl_init(pRExC_state, &this_class);
1082                     oclass = data->start_class;
1083                     data->start_class = &this_class;
1084                     f |= SCF_DO_STCLASS_AND;
1085                     f &= ~SCF_DO_STCLASS_OR;
1086                 }
1087                 /* These are the cases when once a subexpression
1088                    fails at a particular position, it cannot succeed
1089                    even after backtracking at the enclosing scope.
1090                 
1091                    XXXX what if minimal match and we are at the
1092                         initial run of {n,m}? */
1093                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1094                     f &= ~SCF_WHILEM_VISITED_POS;
1095
1096                 /* This will finish on WHILEM, setting scan, or on NULL: */
1097                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1098                                       mincount == 0
1099                                         ? (f & ~SCF_DO_SUBSTR) : f);
1100
1101                 if (flags & SCF_DO_STCLASS)
1102                     data->start_class = oclass;
1103                 if (mincount == 0 || minnext == 0) {
1104                     if (flags & SCF_DO_STCLASS_OR) {
1105                         cl_or(pRExC_state, data->start_class, &this_class);
1106                     }
1107                     else if (flags & SCF_DO_STCLASS_AND) {
1108                         /* Switch to OR mode: cache the old value of
1109                          * data->start_class */
1110                         StructCopy(data->start_class, &and_with,
1111                                    struct regnode_charclass_class);
1112                         flags &= ~SCF_DO_STCLASS_AND;
1113                         StructCopy(&this_class, data->start_class,
1114                                    struct regnode_charclass_class);
1115                         flags |= SCF_DO_STCLASS_OR;
1116                         data->start_class->flags |= ANYOF_EOS;
1117                     }
1118                 } else {                /* Non-zero len */
1119                     if (flags & SCF_DO_STCLASS_OR) {
1120                         cl_or(pRExC_state, data->start_class, &this_class);
1121                         cl_and(data->start_class, &and_with);
1122                     }
1123                     else if (flags & SCF_DO_STCLASS_AND)
1124                         cl_and(data->start_class, &this_class);
1125                     flags &= ~SCF_DO_STCLASS;
1126                 }
1127                 if (!scan)              /* It was not CURLYX, but CURLY. */
1128                     scan = next;
1129                 if (ckWARN(WARN_REGEXP)
1130                        /* ? quantifier ok, except for (?{ ... }) */
1131                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1132                     && (minnext == 0) && (deltanext == 0)
1133                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1134                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1135                 {
1136                     vWARN(RExC_parse,
1137                           "Quantifier unexpected on zero-length expression");
1138                 }
1139
1140                 min += minnext * mincount;
1141                 is_inf_internal |= ((maxcount == REG_INFTY
1142                                      && (minnext + deltanext) > 0)
1143                                     || deltanext == I32_MAX);
1144                 is_inf |= is_inf_internal;
1145                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1146
1147                 /* Try powerful optimization CURLYX => CURLYN. */
1148                 if (  OP(oscan) == CURLYX && data
1149                       && data->flags & SF_IN_PAR
1150                       && !(data->flags & SF_HAS_EVAL)
1151                       && !deltanext && minnext == 1 ) {
1152                     /* Try to optimize to CURLYN.  */
1153                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1154                     regnode *nxt1 = nxt;
1155 #ifdef DEBUGGING
1156                     regnode *nxt2;
1157 #endif
1158
1159                     /* Skip open. */
1160                     nxt = regnext(nxt);
1161                     if (!strchr((char*)PL_simple,OP(nxt))
1162                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1163                              && STR_LEN(nxt) == 1))
1164                         goto nogo;
1165 #ifdef DEBUGGING
1166                     nxt2 = nxt;
1167 #endif
1168                     nxt = regnext(nxt);
1169                     if (OP(nxt) != CLOSE)
1170                         goto nogo;
1171                     /* Now we know that nxt2 is the only contents: */
1172                     oscan->flags = ARG(nxt);
1173                     OP(oscan) = CURLYN;
1174                     OP(nxt1) = NOTHING; /* was OPEN. */
1175 #ifdef DEBUGGING
1176                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1177                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1178                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1179                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1180                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1181                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1182 #endif
1183                 }
1184               nogo:
1185
1186                 /* Try optimization CURLYX => CURLYM. */
1187                 if (  OP(oscan) == CURLYX && data
1188                       && !(data->flags & SF_HAS_PAR)
1189                       && !(data->flags & SF_HAS_EVAL)
1190                       && !deltanext  ) {
1191                     /* XXXX How to optimize if data == 0? */
1192                     /* Optimize to a simpler form.  */
1193                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1194                     regnode *nxt2;
1195
1196                     OP(oscan) = CURLYM;
1197                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1198                             && (OP(nxt2) != WHILEM))
1199                         nxt = nxt2;
1200                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1201                     /* Need to optimize away parenths. */
1202                     if (data->flags & SF_IN_PAR) {
1203                         /* Set the parenth number.  */
1204                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1205
1206                         if (OP(nxt) != CLOSE)
1207                             FAIL("Panic opt close");
1208                         oscan->flags = ARG(nxt);
1209                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1210                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1211 #ifdef DEBUGGING
1212                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1213                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1214                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1215                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1216 #endif
1217 #if 0
1218                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1219                             regnode *nnxt = regnext(nxt1);
1220                         
1221                             if (nnxt == nxt) {
1222                                 if (reg_off_by_arg[OP(nxt1)])
1223                                     ARG_SET(nxt1, nxt2 - nxt1);
1224                                 else if (nxt2 - nxt1 < U16_MAX)
1225                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1226                                 else
1227                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1228                             }
1229                             nxt1 = nnxt;
1230                         }
1231 #endif
1232                         /* Optimize again: */
1233                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1234                                     NULL, 0);
1235                     }
1236                     else
1237                         oscan->flags = 0;
1238                 }
1239                 else if ((OP(oscan) == CURLYX)
1240                          && (flags & SCF_WHILEM_VISITED_POS)
1241                          /* See the comment on a similar expression above.
1242                             However, this time it not a subexpression
1243                             we care about, but the expression itself. */
1244                          && (maxcount == REG_INFTY)
1245                          && data && ++data->whilem_c < 16) {
1246                     /* This stays as CURLYX, we can put the count/of pair. */
1247                     /* Find WHILEM (as in regexec.c) */
1248                     regnode *nxt = oscan + NEXT_OFF(oscan);
1249
1250                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1251                         nxt += ARG(nxt);
1252                     PREVOPER(nxt)->flags = data->whilem_c
1253                         | (RExC_whilem_seen << 4); /* On WHILEM */
1254                 }
1255                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1256                     pars++;
1257                 if (flags & SCF_DO_SUBSTR) {
1258                     SV *last_str = Nullsv;
1259                     int counted = mincount != 0;
1260
1261                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1262 #if defined(SPARC64_GCC_WORKAROUND)
1263                         I32 b = 0;
1264                         STRLEN l = 0;
1265                         char *s = NULL;
1266                         I32 old = 0;
1267
1268                         if (pos_before >= data->last_start_min)
1269                             b = pos_before;
1270                         else
1271                             b = data->last_start_min;
1272
1273                         l = 0;
1274                         s = SvPV(data->last_found, l);
1275                         old = b - data->last_start_min;
1276
1277 #else
1278                         I32 b = pos_before >= data->last_start_min
1279                             ? pos_before : data->last_start_min;
1280                         STRLEN l;
1281                         char *s = SvPV(data->last_found, l);
1282                         I32 old = b - data->last_start_min;
1283 #endif
1284
1285                         if (UTF)
1286                             old = utf8_hop((U8*)s, old) - (U8*)s;
1287                         
1288                         l -= old;
1289                         /* Get the added string: */
1290                         last_str = newSVpvn(s  + old, l);
1291                         if (deltanext == 0 && pos_before == b) {
1292                             /* What was added is a constant string */
1293                             if (mincount > 1) {
1294                                 SvGROW(last_str, (mincount * l) + 1);
1295                                 repeatcpy(SvPVX(last_str) + l,
1296                                           SvPVX(last_str), l, mincount - 1);
1297                                 SvCUR(last_str) *= mincount;
1298                                 /* Add additional parts. */
1299                                 SvCUR_set(data->last_found,
1300                                           SvCUR(data->last_found) - l);
1301                                 sv_catsv(data->last_found, last_str);
1302                                 data->last_end += l * (mincount - 1);
1303                             }
1304                         } else {
1305                             /* start offset must point into the last copy */
1306                             data->last_start_min += minnext * (mincount - 1);
1307                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1308                                 * (minnext + data->pos_delta);
1309                         }
1310                     }
1311                     /* It is counted once already... */
1312                     data->pos_min += minnext * (mincount - counted);
1313                     data->pos_delta += - counted * deltanext +
1314                         (minnext + deltanext) * maxcount - minnext * mincount;
1315                     if (mincount != maxcount) {
1316                          /* Cannot extend fixed substrings found inside
1317                             the group.  */
1318                         scan_commit(pRExC_state,data);
1319                         if (mincount && last_str) {
1320                             sv_setsv(data->last_found, last_str);
1321                             data->last_end = data->pos_min;
1322                             data->last_start_min =
1323                                 data->pos_min - CHR_SVLEN(last_str);
1324                             data->last_start_max = is_inf
1325                                 ? I32_MAX
1326                                 : data->pos_min + data->pos_delta
1327                                 - CHR_SVLEN(last_str);
1328                         }
1329                         data->longest = &(data->longest_float);
1330                     }
1331                     SvREFCNT_dec(last_str);
1332                 }
1333                 if (data && (fl & SF_HAS_EVAL))
1334                     data->flags |= SF_HAS_EVAL;
1335               optimize_curly_tail:
1336                 if (OP(oscan) != CURLYX) {
1337                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1338                            && NEXT_OFF(next))
1339                         NEXT_OFF(oscan) += NEXT_OFF(next);
1340                 }
1341                 continue;
1342             default:                    /* REF and CLUMP only? */
1343                 if (flags & SCF_DO_SUBSTR) {
1344                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1345                     data->longest = &(data->longest_float);
1346                 }
1347                 is_inf = is_inf_internal = 1;
1348                 if (flags & SCF_DO_STCLASS_OR)
1349                     cl_anything(pRExC_state, data->start_class);
1350                 flags &= ~SCF_DO_STCLASS;
1351                 break;
1352             }
1353         }
1354         else if (strchr((char*)PL_simple,OP(scan))) {
1355             int value = 0;
1356
1357             if (flags & SCF_DO_SUBSTR) {
1358                 scan_commit(pRExC_state,data);
1359                 data->pos_min++;
1360             }
1361             min++;
1362             if (flags & SCF_DO_STCLASS) {
1363                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1364
1365                 /* Some of the logic below assumes that switching
1366                    locale on will only add false positives. */
1367                 switch (PL_regkind[(U8)OP(scan)]) {
1368                 case SANY:
1369                 default:
1370                   do_default:
1371                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1372                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1373                         cl_anything(pRExC_state, data->start_class);
1374                     break;
1375                 case REG_ANY:
1376                     if (OP(scan) == SANY)
1377                         goto do_default;
1378                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1379                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1380                                  || (data->start_class->flags & ANYOF_CLASS));
1381                         cl_anything(pRExC_state, data->start_class);
1382                     }
1383                     if (flags & SCF_DO_STCLASS_AND || !value)
1384                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1385                     break;
1386                 case ANYOF:
1387                     if (flags & SCF_DO_STCLASS_AND)
1388                         cl_and(data->start_class,
1389                                (struct regnode_charclass_class*)scan);
1390                     else
1391                         cl_or(pRExC_state, data->start_class,
1392                               (struct regnode_charclass_class*)scan);
1393                     break;
1394                 case ALNUM:
1395                     if (flags & SCF_DO_STCLASS_AND) {
1396                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1397                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1398                             for (value = 0; value < 256; value++)
1399                                 if (!isALNUM(value))
1400                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1401                         }
1402                     }
1403                     else {
1404                         if (data->start_class->flags & ANYOF_LOCALE)
1405                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1406                         else {
1407                             for (value = 0; value < 256; value++)
1408                                 if (isALNUM(value))
1409                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1410                         }
1411                     }
1412                     break;
1413                 case ALNUML:
1414                     if (flags & SCF_DO_STCLASS_AND) {
1415                         if (data->start_class->flags & ANYOF_LOCALE)
1416                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1417                     }
1418                     else {
1419                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1420                         data->start_class->flags |= ANYOF_LOCALE;
1421                     }
1422                     break;
1423                 case NALNUM:
1424                     if (flags & SCF_DO_STCLASS_AND) {
1425                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1426                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1427                             for (value = 0; value < 256; value++)
1428                                 if (isALNUM(value))
1429                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1430                         }
1431                     }
1432                     else {
1433                         if (data->start_class->flags & ANYOF_LOCALE)
1434                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1435                         else {
1436                             for (value = 0; value < 256; value++)
1437                                 if (!isALNUM(value))
1438                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1439                         }
1440                     }
1441                     break;
1442                 case NALNUML:
1443                     if (flags & SCF_DO_STCLASS_AND) {
1444                         if (data->start_class->flags & ANYOF_LOCALE)
1445                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1446                     }
1447                     else {
1448                         data->start_class->flags |= ANYOF_LOCALE;
1449                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1450                     }
1451                     break;
1452                 case SPACE:
1453                     if (flags & SCF_DO_STCLASS_AND) {
1454                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1455                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1456                             for (value = 0; value < 256; value++)
1457                                 if (!isSPACE(value))
1458                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1459                         }
1460                     }
1461                     else {
1462                         if (data->start_class->flags & ANYOF_LOCALE)
1463                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1464                         else {
1465                             for (value = 0; value < 256; value++)
1466                                 if (isSPACE(value))
1467                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1468                         }
1469                     }
1470                     break;
1471                 case SPACEL:
1472                     if (flags & SCF_DO_STCLASS_AND) {
1473                         if (data->start_class->flags & ANYOF_LOCALE)
1474                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1475                     }
1476                     else {
1477                         data->start_class->flags |= ANYOF_LOCALE;
1478                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1479                     }
1480                     break;
1481                 case NSPACE:
1482                     if (flags & SCF_DO_STCLASS_AND) {
1483                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1484                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1485                             for (value = 0; value < 256; value++)
1486                                 if (isSPACE(value))
1487                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1488                         }
1489                     }
1490                     else {
1491                         if (data->start_class->flags & ANYOF_LOCALE)
1492                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1493                         else {
1494                             for (value = 0; value < 256; value++)
1495                                 if (!isSPACE(value))
1496                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1497                         }
1498                     }
1499                     break;
1500                 case NSPACEL:
1501                     if (flags & SCF_DO_STCLASS_AND) {
1502                         if (data->start_class->flags & ANYOF_LOCALE) {
1503                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504                             for (value = 0; value < 256; value++)
1505                                 if (!isSPACE(value))
1506                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1507                         }
1508                     }
1509                     else {
1510                         data->start_class->flags |= ANYOF_LOCALE;
1511                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1512                     }
1513                     break;
1514                 case DIGIT:
1515                     if (flags & SCF_DO_STCLASS_AND) {
1516                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1517                         for (value = 0; value < 256; value++)
1518                             if (!isDIGIT(value))
1519                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520                     }
1521                     else {
1522                         if (data->start_class->flags & ANYOF_LOCALE)
1523                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1524                         else {
1525                             for (value = 0; value < 256; value++)
1526                                 if (isDIGIT(value))
1527                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1528                         }
1529                     }
1530                     break;
1531                 case NDIGIT:
1532                     if (flags & SCF_DO_STCLASS_AND) {
1533                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1534                         for (value = 0; value < 256; value++)
1535                             if (isDIGIT(value))
1536                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1537                     }
1538                     else {
1539                         if (data->start_class->flags & ANYOF_LOCALE)
1540                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1541                         else {
1542                             for (value = 0; value < 256; value++)
1543                                 if (!isDIGIT(value))
1544                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1545                         }
1546                     }
1547                     break;
1548                 }
1549                 if (flags & SCF_DO_STCLASS_OR)
1550                     cl_and(data->start_class, &and_with);
1551                 flags &= ~SCF_DO_STCLASS;
1552             }
1553         }
1554         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1555             data->flags |= (OP(scan) == MEOL
1556                             ? SF_BEFORE_MEOL
1557                             : SF_BEFORE_SEOL);
1558         }
1559         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1560                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1561                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1562                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1563             /* Lookahead/lookbehind */
1564             I32 deltanext, minnext, fake = 0;
1565             regnode *nscan;
1566             struct regnode_charclass_class intrnl;
1567             int f = 0;
1568
1569             data_fake.flags = 0;
1570             if (data) {         
1571                 data_fake.whilem_c = data->whilem_c;
1572                 data_fake.last_closep = data->last_closep;
1573             }
1574             else
1575                 data_fake.last_closep = &fake;
1576             if ( flags & SCF_DO_STCLASS && !scan->flags
1577                  && OP(scan) == IFMATCH ) { /* Lookahead */
1578                 cl_init(pRExC_state, &intrnl);
1579                 data_fake.start_class = &intrnl;
1580                 f |= SCF_DO_STCLASS_AND;
1581             }
1582             if (flags & SCF_WHILEM_VISITED_POS)
1583                 f |= SCF_WHILEM_VISITED_POS;
1584             next = regnext(scan);
1585             nscan = NEXTOPER(NEXTOPER(scan));
1586             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1587             if (scan->flags) {
1588                 if (deltanext) {
1589                     vFAIL("Variable length lookbehind not implemented");
1590                 }
1591                 else if (minnext > U8_MAX) {
1592                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1593                 }
1594                 scan->flags = minnext;
1595             }
1596             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1597                 pars++;
1598             if (data && (data_fake.flags & SF_HAS_EVAL))
1599                 data->flags |= SF_HAS_EVAL;
1600             if (data)
1601                 data->whilem_c = data_fake.whilem_c;
1602             if (f & SCF_DO_STCLASS_AND) {
1603                 int was = (data->start_class->flags & ANYOF_EOS);
1604
1605                 cl_and(data->start_class, &intrnl);
1606                 if (was)
1607                     data->start_class->flags |= ANYOF_EOS;
1608             }
1609         }
1610         else if (OP(scan) == OPEN) {
1611             pars++;
1612         }
1613         else if (OP(scan) == CLOSE) {
1614             if (ARG(scan) == is_par) {
1615                 next = regnext(scan);
1616
1617                 if ( next && (OP(next) != WHILEM) && next < last)
1618                     is_par = 0;         /* Disable optimization */
1619             }
1620             if (data)
1621                 *(data->last_closep) = ARG(scan);
1622         }
1623         else if (OP(scan) == EVAL) {
1624                 if (data)
1625                     data->flags |= SF_HAS_EVAL;
1626         }
1627         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1628                 if (flags & SCF_DO_SUBSTR) {
1629                     scan_commit(pRExC_state,data);
1630                     data->longest = &(data->longest_float);
1631                 }
1632                 is_inf = is_inf_internal = 1;
1633                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1634                     cl_anything(pRExC_state, data->start_class);
1635                 flags &= ~SCF_DO_STCLASS;
1636         }
1637         /* Else: zero-length, ignore. */
1638         scan = regnext(scan);
1639     }
1640
1641   finish:
1642     *scanp = scan;
1643     *deltap = is_inf_internal ? I32_MAX : delta;
1644     if (flags & SCF_DO_SUBSTR && is_inf)
1645         data->pos_delta = I32_MAX - data->pos_min;
1646     if (is_par > U8_MAX)
1647         is_par = 0;
1648     if (is_par && pars==1 && data) {
1649         data->flags |= SF_IN_PAR;
1650         data->flags &= ~SF_HAS_PAR;
1651     }
1652     else if (pars && data) {
1653         data->flags |= SF_HAS_PAR;
1654         data->flags &= ~SF_IN_PAR;
1655     }
1656     if (flags & SCF_DO_STCLASS_OR)
1657         cl_and(data->start_class, &and_with);
1658     return min;
1659 }
1660
1661 STATIC I32
1662 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1663 {
1664     if (RExC_rx->data) {
1665         Renewc(RExC_rx->data,
1666                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1667                char, struct reg_data);
1668         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1669         RExC_rx->data->count += n;
1670     }
1671     else {
1672         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1673              char, struct reg_data);
1674         New(1208, RExC_rx->data->what, n, U8);
1675         RExC_rx->data->count = n;
1676     }
1677     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1678     return RExC_rx->data->count - n;
1679 }
1680
1681 void
1682 Perl_reginitcolors(pTHX)
1683 {
1684     int i = 0;
1685     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1686         
1687     if (s) {
1688         PL_colors[0] = s = savepv(s);
1689         while (++i < 6) {
1690             s = strchr(s, '\t');
1691             if (s) {
1692                 *s = '\0';
1693                 PL_colors[i] = ++s;
1694             }
1695             else
1696                 PL_colors[i] = s = "";
1697         }
1698     } else {
1699         while (i < 6)
1700             PL_colors[i++] = "";
1701     }
1702     PL_colorset = 1;
1703 }
1704
1705
1706 /*
1707  - pregcomp - compile a regular expression into internal code
1708  *
1709  * We can't allocate space until we know how big the compiled form will be,
1710  * but we can't compile it (and thus know how big it is) until we've got a
1711  * place to put the code.  So we cheat:  we compile it twice, once with code
1712  * generation turned off and size counting turned on, and once "for real".
1713  * This also means that we don't allocate space until we are sure that the
1714  * thing really will compile successfully, and we never have to move the
1715  * code and thus invalidate pointers into it.  (Note that it has to be in
1716  * one piece because free() must be able to free it all.) [NB: not true in perl]
1717  *
1718  * Beware that the optimization-preparation code in here knows about some
1719  * of the structure of the compiled regexp.  [I'll say.]
1720  */
1721 regexp *
1722 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1723 {
1724     register regexp *r;
1725     regnode *scan;
1726     regnode *first;
1727     I32 flags;
1728     I32 minlen = 0;
1729     I32 sawplus = 0;
1730     I32 sawopen = 0;
1731     scan_data_t data;
1732     RExC_state_t RExC_state;
1733     RExC_state_t *pRExC_state = &RExC_state;
1734
1735     if (exp == NULL)
1736         FAIL("NULL regexp argument");
1737
1738     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1739
1740     RExC_precomp = exp;
1741     DEBUG_r({
1742          if (!PL_colorset) reginitcolors();
1743          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1744                        PL_colors[4],PL_colors[5],PL_colors[0],
1745                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1746     });
1747     RExC_flags16 = pm->op_pmflags;
1748     RExC_sawback = 0;
1749
1750     RExC_seen = 0;
1751     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1752     RExC_seen_evals = 0;
1753     RExC_extralen = 0;
1754
1755     /* First pass: determine size, legality. */
1756     RExC_parse = exp;
1757     RExC_start = exp;
1758     RExC_end = xend;
1759     RExC_naughty = 0;
1760     RExC_npar = 1;
1761     RExC_size = 0L;
1762     RExC_emit = &PL_regdummy;
1763     RExC_whilem_seen = 0;
1764 #if 0 /* REGC() is (currently) a NOP at the first pass.
1765        * Clever compilers notice this and complain. --jhi */
1766     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1767 #endif
1768     if (reg(pRExC_state, 0, &flags) == NULL) {
1769         RExC_precomp = Nullch;
1770         return(NULL);
1771     }
1772     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1773
1774     /* Small enough for pointer-storage convention?
1775        If extralen==0, this means that we will not need long jumps. */
1776     if (RExC_size >= 0x10000L && RExC_extralen)
1777         RExC_size += RExC_extralen;
1778     else
1779         RExC_extralen = 0;
1780     if (RExC_whilem_seen > 15)
1781         RExC_whilem_seen = 15;
1782
1783     /* Allocate space and initialize. */
1784     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1785          char, regexp);
1786     if (r == NULL)
1787         FAIL("Regexp out of space");
1788
1789 #ifdef DEBUGGING
1790     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1791     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1792 #endif
1793     r->refcnt = 1;
1794     r->prelen = xend - exp;
1795     r->precomp = savepvn(RExC_precomp, r->prelen);
1796     r->subbeg = NULL;
1797     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1798     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1799
1800     r->substrs = 0;                     /* Useful during FAIL. */
1801     r->startp = 0;                      /* Useful during FAIL. */
1802     r->endp = 0;                        /* Useful during FAIL. */
1803
1804     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1805     if (r->offsets) {
1806       r->offsets[0] = RExC_size; 
1807     }
1808     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1809                           "%s %"UVuf" bytes for offset annotations.\n", 
1810                           r->offsets ? "Got" : "Couldn't get", 
1811                           (UV)((2*RExC_size+1) * sizeof(U32))));
1812
1813     RExC_rx = r;
1814
1815     /* Second pass: emit code. */
1816     RExC_flags16 = pm->op_pmflags;      /* don't let top level (?i) bleed */
1817     RExC_parse = exp;
1818     RExC_end = xend;
1819     RExC_naughty = 0;
1820     RExC_npar = 1;
1821     RExC_emit_start = r->program;
1822     RExC_emit = r->program;
1823     /* Store the count of eval-groups for security checks: */
1824     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1825     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1826     r->data = 0;
1827     if (reg(pRExC_state, 0, &flags) == NULL)
1828         return(NULL);
1829
1830     /* Dig out information for optimizations. */
1831     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1832     pm->op_pmflags = RExC_flags16;
1833     if (UTF)
1834         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1835     r->regstclass = NULL;
1836     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1837         r->reganch |= ROPT_NAUGHTY;
1838     scan = r->program + 1;              /* First BRANCH. */
1839
1840     /* XXXX To minimize changes to RE engine we always allocate
1841        3-units-long substrs field. */
1842     Newz(1004, r->substrs, 1, struct reg_substr_data);
1843
1844     StructCopy(&zero_scan_data, &data, scan_data_t);
1845     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1846     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1847         I32 fake;
1848         STRLEN longest_float_length, longest_fixed_length;
1849         struct regnode_charclass_class ch_class;
1850         int stclass_flag;
1851         I32 last_close = 0;
1852
1853         first = scan;
1854         /* Skip introductions and multiplicators >= 1. */
1855         while ((OP(first) == OPEN && (sawopen = 1)) ||
1856                /* An OR of *one* alternative - should not happen now. */
1857             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1858             (OP(first) == PLUS) ||
1859             (OP(first) == MINMOD) ||
1860                /* An {n,m} with n>0 */
1861             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1862                 if (OP(first) == PLUS)
1863                     sawplus = 1;
1864                 else
1865                     first += regarglen[(U8)OP(first)];
1866                 first = NEXTOPER(first);
1867         }
1868
1869         /* Starting-point info. */
1870       again:
1871         if (PL_regkind[(U8)OP(first)] == EXACT) {
1872             if (OP(first) == EXACT)
1873                 ;       /* Empty, get anchored substr later. */
1874             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1875                 r->regstclass = first;
1876         }
1877         else if (strchr((char*)PL_simple,OP(first)))
1878             r->regstclass = first;
1879         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1880                  PL_regkind[(U8)OP(first)] == NBOUND)
1881             r->regstclass = first;
1882         else if (PL_regkind[(U8)OP(first)] == BOL) {
1883             r->reganch |= (OP(first) == MBOL
1884                            ? ROPT_ANCH_MBOL
1885                            : (OP(first) == SBOL
1886                               ? ROPT_ANCH_SBOL
1887                               : ROPT_ANCH_BOL));
1888             first = NEXTOPER(first);
1889             goto again;
1890         }
1891         else if (OP(first) == GPOS) {
1892             r->reganch |= ROPT_ANCH_GPOS;
1893             first = NEXTOPER(first);
1894             goto again;
1895         }
1896         else if (!sawopen && (OP(first) == STAR &&
1897             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1898             !(r->reganch & ROPT_ANCH) )
1899         {
1900             /* turn .* into ^.* with an implied $*=1 */
1901             int type = OP(NEXTOPER(first));
1902
1903             if (type == REG_ANY)
1904                 type = ROPT_ANCH_MBOL;
1905             else
1906                 type = ROPT_ANCH_SBOL;
1907
1908             r->reganch |= type | ROPT_IMPLICIT;
1909             first = NEXTOPER(first);
1910             goto again;
1911         }
1912         if (sawplus && (!sawopen || !RExC_sawback)
1913             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1914             /* x+ must match at the 1st pos of run of x's */
1915             r->reganch |= ROPT_SKIP;
1916
1917         /* Scan is after the zeroth branch, first is atomic matcher. */
1918         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1919                               (IV)(first - scan + 1)));
1920         /*
1921         * If there's something expensive in the r.e., find the
1922         * longest literal string that must appear and make it the
1923         * regmust.  Resolve ties in favor of later strings, since
1924         * the regstart check works with the beginning of the r.e.
1925         * and avoiding duplication strengthens checking.  Not a
1926         * strong reason, but sufficient in the absence of others.
1927         * [Now we resolve ties in favor of the earlier string if
1928         * it happens that c_offset_min has been invalidated, since the
1929         * earlier string may buy us something the later one won't.]
1930         */
1931         minlen = 0;
1932
1933         data.longest_fixed = newSVpvn("",0);
1934         data.longest_float = newSVpvn("",0);
1935         data.last_found = newSVpvn("",0);
1936         data.longest = &(data.longest_fixed);
1937         first = scan;
1938         if (!r->regstclass) {
1939             cl_init(pRExC_state, &ch_class);
1940             data.start_class = &ch_class;
1941             stclass_flag = SCF_DO_STCLASS_AND;
1942         } else                          /* XXXX Check for BOUND? */
1943             stclass_flag = 0;
1944         data.last_closep = &last_close;
1945
1946         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1947                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1948         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1949              && data.last_start_min == 0 && data.last_end > 0
1950              && !RExC_seen_zerolen
1951              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1952             r->reganch |= ROPT_CHECK_ALL;
1953         scan_commit(pRExC_state, &data);
1954         SvREFCNT_dec(data.last_found);
1955
1956         longest_float_length = CHR_SVLEN(data.longest_float);
1957         if (longest_float_length
1958             || (data.flags & SF_FL_BEFORE_EOL
1959                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1960                     || (RExC_flags16 & PMf_MULTILINE)))) {
1961             int t;
1962
1963             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1964                 && data.offset_fixed == data.offset_float_min
1965                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1966                     goto remove_float;          /* As in (a)+. */
1967
1968             if (SvUTF8(data.longest_float)) {
1969                 r->float_utf8 = data.longest_float;
1970                 r->float_substr = Nullsv;
1971             } else {
1972                 r->float_substr = data.longest_float;
1973                 r->float_utf8 = Nullsv;
1974             }
1975             r->float_min_offset = data.offset_float_min;
1976             r->float_max_offset = data.offset_float_max;
1977             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1978                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1979                            || (RExC_flags16 & PMf_MULTILINE)));
1980             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1981         }
1982         else {
1983           remove_float:
1984             r->float_substr = r->float_utf8 = Nullsv;
1985             SvREFCNT_dec(data.longest_float);
1986             longest_float_length = 0;
1987         }
1988
1989         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1990         if (longest_fixed_length
1991             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1992                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993                     || (RExC_flags16 & PMf_MULTILINE)))) {
1994             int t;
1995
1996             if (SvUTF8(data.longest_fixed)) {
1997                 r->anchored_utf8 = data.longest_fixed;
1998                 r->anchored_substr = Nullsv;
1999             } else {
2000                 r->anchored_substr = data.longest_fixed;
2001                 r->anchored_utf8 = Nullsv;
2002             }
2003             r->anchored_offset = data.offset_fixed;
2004             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2005                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2006                      || (RExC_flags16 & PMf_MULTILINE)));
2007             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2008         }
2009         else {
2010             r->anchored_substr = r->anchored_utf8 = Nullsv;
2011             SvREFCNT_dec(data.longest_fixed);
2012             longest_fixed_length = 0;
2013         }
2014         if (r->regstclass
2015             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2016             r->regstclass = NULL;
2017         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2018             && stclass_flag
2019             && !(data.start_class->flags & ANYOF_EOS)
2020             && !cl_is_anything(data.start_class)) {
2021             I32 n = add_data(pRExC_state, 1, "f");
2022
2023             New(1006, RExC_rx->data->data[n], 1,
2024                 struct regnode_charclass_class);
2025             StructCopy(data.start_class,
2026                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2027                        struct regnode_charclass_class);
2028             r->regstclass = (regnode*)RExC_rx->data->data[n];
2029             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2030             PL_regdata = r->data; /* for regprop() */
2031             DEBUG_r({ SV *sv = sv_newmortal();
2032                       regprop(sv, (regnode*)data.start_class);
2033                       PerlIO_printf(Perl_debug_log,
2034                                     "synthetic stclass `%s'.\n",
2035                                     SvPVX(sv));});
2036         }
2037
2038         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2039         if (longest_fixed_length > longest_float_length) {
2040             r->check_substr = r->anchored_substr;
2041             r->check_utf8 = r->anchored_utf8;
2042             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2043             if (r->reganch & ROPT_ANCH_SINGLE)
2044                 r->reganch |= ROPT_NOSCAN;
2045         }
2046         else {
2047             r->check_substr = r->float_substr;
2048             r->check_utf8 = r->float_utf8;
2049             r->check_offset_min = data.offset_float_min;
2050             r->check_offset_max = data.offset_float_max;
2051         }
2052         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2053            This should be changed ASAP!  */
2054         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2055             r->reganch |= RE_USE_INTUIT;
2056             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2057                 r->reganch |= RE_INTUIT_TAIL;
2058         }
2059     }
2060     else {
2061         /* Several toplevels. Best we can is to set minlen. */
2062         I32 fake;
2063         struct regnode_charclass_class ch_class;
2064         I32 last_close = 0;
2065         
2066         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2067         scan = r->program + 1;
2068         cl_init(pRExC_state, &ch_class);
2069         data.start_class = &ch_class;
2070         data.last_closep = &last_close;
2071         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2072         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2073                 = r->float_substr = r->float_utf8 = Nullsv;
2074         if (!(data.start_class->flags & ANYOF_EOS)
2075             && !cl_is_anything(data.start_class)) {
2076             I32 n = add_data(pRExC_state, 1, "f");
2077
2078             New(1006, RExC_rx->data->data[n], 1,
2079                 struct regnode_charclass_class);
2080             StructCopy(data.start_class,
2081                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2082                        struct regnode_charclass_class);
2083             r->regstclass = (regnode*)RExC_rx->data->data[n];
2084             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2085             DEBUG_r({ SV* sv = sv_newmortal();
2086                       regprop(sv, (regnode*)data.start_class);
2087                       PerlIO_printf(Perl_debug_log,
2088                                     "synthetic stclass `%s'.\n",
2089                                     SvPVX(sv));});
2090         }
2091     }
2092
2093     r->minlen = minlen;
2094     if (RExC_seen & REG_SEEN_GPOS)
2095         r->reganch |= ROPT_GPOS_SEEN;
2096     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2097         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2098     if (RExC_seen & REG_SEEN_EVAL)
2099         r->reganch |= ROPT_EVAL_SEEN;
2100     if (RExC_seen & REG_SEEN_CANY)
2101         r->reganch |= ROPT_CANY_SEEN;
2102     Newz(1002, r->startp, RExC_npar, I32);
2103     Newz(1002, r->endp, RExC_npar, I32);
2104     PL_regdata = r->data; /* for regprop() */
2105     DEBUG_r(regdump(r));
2106     return(r);
2107 }
2108
2109 /*
2110  - reg - regular expression, i.e. main body or parenthesized thing
2111  *
2112  * Caller must absorb opening parenthesis.
2113  *
2114  * Combining parenthesis handling with the base level of regular expression
2115  * is a trifle forced, but the need to tie the tails of the branches to what
2116  * follows makes it hard to avoid.
2117  */
2118 STATIC regnode *
2119 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2120     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2121 {
2122     register regnode *ret;              /* Will be the head of the group. */
2123     register regnode *br;
2124     register regnode *lastbr;
2125     register regnode *ender = 0;
2126     register I32 parno = 0;
2127     I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2128
2129     /* for (?g), (?gc), and (?o) warnings; warning
2130        about (?c) will warn about (?g) -- japhy    */
2131
2132     I32 wastedflags = 0x00,
2133         wasted_o    = 0x01,
2134         wasted_g    = 0x02,
2135         wasted_gc   = 0x02 | 0x04,
2136         wasted_c    = 0x04;
2137
2138     char * parse_start = RExC_parse; /* MJD */
2139     char *oregcomp_parse = RExC_parse;
2140     char c;
2141
2142     *flagp = 0;                         /* Tentatively. */
2143
2144
2145     /* Make an OPEN node, if parenthesized. */
2146     if (paren) {
2147         if (*RExC_parse == '?') { /* (?...) */
2148             U16 posflags = 0, negflags = 0;
2149             U16 *flagsp = &posflags;
2150             int logical = 0;
2151             char *seqstart = RExC_parse;
2152
2153             RExC_parse++;
2154             paren = *RExC_parse++;
2155             ret = NULL;                 /* For look-ahead/behind. */
2156             switch (paren) {
2157             case '<':           /* (?<...) */
2158                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2159                 if (*RExC_parse == '!')
2160                     paren = ',';
2161                 if (*RExC_parse != '=' && *RExC_parse != '!')
2162                     goto unknown;
2163                 RExC_parse++;
2164             case '=':           /* (?=...) */
2165             case '!':           /* (?!...) */
2166                 RExC_seen_zerolen++;
2167             case ':':           /* (?:...) */
2168             case '>':           /* (?>...) */
2169                 break;
2170             case '$':           /* (?$...) */
2171             case '@':           /* (?@...) */
2172                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2173                 break;
2174             case '#':           /* (?#...) */
2175                 while (*RExC_parse && *RExC_parse != ')')
2176                     RExC_parse++;
2177                 if (*RExC_parse != ')')
2178                     FAIL("Sequence (?#... not terminated");
2179                 nextchar(pRExC_state);
2180                 *flagp = TRYAGAIN;
2181                 return NULL;
2182             case 'p':           /* (?p...) */
2183                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2184                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2185                 /* FALL THROUGH*/
2186             case '?':           /* (??...) */
2187                 logical = 1;
2188                 if (*RExC_parse != '{')
2189                     goto unknown;
2190                 paren = *RExC_parse++;
2191                 /* FALL THROUGH */
2192             case '{':           /* (?{...}) */
2193             {
2194                 I32 count = 1, n = 0;
2195                 char c;
2196                 char *s = RExC_parse;
2197                 SV *sv;
2198                 OP_4tree *sop, *rop;
2199
2200                 RExC_seen_zerolen++;
2201                 RExC_seen |= REG_SEEN_EVAL;
2202                 while (count && (c = *RExC_parse)) {
2203                     if (c == '\\' && RExC_parse[1])
2204                         RExC_parse++;
2205                     else if (c == '{')
2206                         count++;
2207                     else if (c == '}')
2208                         count--;
2209                     RExC_parse++;
2210                 }
2211                 if (*RExC_parse != ')')
2212                 {
2213                     RExC_parse = s;             
2214                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2215                 }
2216                 if (!SIZE_ONLY) {
2217                     AV *av;
2218                 
2219                     if (RExC_parse - 1 - s)
2220                         sv = newSVpvn(s, RExC_parse - 1 - s);
2221                     else
2222                         sv = newSVpvn("", 0);
2223
2224                     ENTER;
2225                     Perl_save_re_context(aTHX);
2226                     rop = sv_compile_2op(sv, &sop, "re", &av);
2227                     sop->op_private |= OPpREFCOUNTED;
2228                     /* re_dup will OpREFCNT_inc */
2229                     OpREFCNT_set(sop, 1);
2230                     LEAVE;
2231
2232                     n = add_data(pRExC_state, 3, "nop");
2233                     RExC_rx->data->data[n] = (void*)rop;
2234                     RExC_rx->data->data[n+1] = (void*)sop;
2235                     RExC_rx->data->data[n+2] = (void*)av;
2236                     SvREFCNT_dec(sv);
2237                 }
2238                 else {                                          /* First pass */
2239                     if (PL_reginterp_cnt < ++RExC_seen_evals
2240                         && PL_curcop != &PL_compiling)
2241                         /* No compiled RE interpolated, has runtime
2242                            components ===> unsafe.  */
2243                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2244                     if (PL_tainting && PL_tainted)
2245                         FAIL("Eval-group in insecure regular expression");
2246                 }
2247                 
2248                 nextchar(pRExC_state);
2249                 if (logical) {
2250                     ret = reg_node(pRExC_state, LOGICAL);
2251                     if (!SIZE_ONLY)
2252                         ret->flags = 2;
2253                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2254                     /* deal with the length of this later - MJD */
2255                     return ret;
2256                 }
2257                 return reganode(pRExC_state, EVAL, n);
2258             }
2259             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2260             {
2261                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2262                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2263                         || RExC_parse[1] == '<'
2264                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2265                         I32 flag;
2266                         
2267                         ret = reg_node(pRExC_state, LOGICAL);
2268                         if (!SIZE_ONLY)
2269                             ret->flags = 1;
2270                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2271                         goto insert_if;
2272                     }
2273                 }
2274                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2275                     /* (?(1)...) */
2276                     parno = atoi(RExC_parse++);
2277
2278                     while (isDIGIT(*RExC_parse))
2279                         RExC_parse++;
2280                     ret = reganode(pRExC_state, GROUPP, parno);
2281                     
2282                     if ((c = *nextchar(pRExC_state)) != ')')
2283                         vFAIL("Switch condition not recognized");
2284                   insert_if:
2285                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2286                     br = regbranch(pRExC_state, &flags, 1);
2287                     if (br == NULL)
2288                         br = reganode(pRExC_state, LONGJMP, 0);
2289                     else
2290                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2291                     c = *nextchar(pRExC_state);
2292                     if (flags&HASWIDTH)
2293                         *flagp |= HASWIDTH;
2294                     if (c == '|') {
2295                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2296                         regbranch(pRExC_state, &flags, 1);
2297                         regtail(pRExC_state, ret, lastbr);
2298                         if (flags&HASWIDTH)
2299                             *flagp |= HASWIDTH;
2300                         c = *nextchar(pRExC_state);
2301                     }
2302                     else
2303                         lastbr = NULL;
2304                     if (c != ')')
2305                         vFAIL("Switch (?(condition)... contains too many branches");
2306                     ender = reg_node(pRExC_state, TAIL);
2307                     regtail(pRExC_state, br, ender);
2308                     if (lastbr) {
2309                         regtail(pRExC_state, lastbr, ender);
2310                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2311                     }
2312                     else
2313                         regtail(pRExC_state, ret, ender);
2314                     return ret;
2315                 }
2316                 else {
2317                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2318                 }
2319             }
2320             case 0:
2321                 RExC_parse--; /* for vFAIL to print correctly */
2322                 vFAIL("Sequence (? incomplete");
2323                 break;
2324             default:
2325                 --RExC_parse;
2326               parse_flags:      /* (?i) */
2327                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2328                     /* (?g), (?gc) and (?o) are useless here
2329                        and must be globally applied -- japhy */
2330
2331                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2332                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2333                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2334                             if (! (wastedflags & wflagbit) ) {
2335                                 wastedflags |= wflagbit;
2336                                 vWARN5(
2337                                     RExC_parse + 1,
2338                                     "Useless (%s%c) - %suse /%c modifier",
2339                                     flagsp == &negflags ? "?-" : "?",
2340                                     *RExC_parse,
2341                                     flagsp == &negflags ? "don't " : "",
2342                                     *RExC_parse
2343                                 );
2344                             }
2345                         }
2346                     }
2347                     else if (*RExC_parse == 'c') {
2348                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349                             if (! (wastedflags & wasted_c) ) {
2350                                 wastedflags |= wasted_gc;
2351                                 vWARN3(
2352                                     RExC_parse + 1,
2353                                     "Useless (%sc) - %suse /gc modifier",
2354                                     flagsp == &negflags ? "?-" : "?",
2355                                     flagsp == &negflags ? "don't " : ""
2356                                 );
2357                             }
2358                         }
2359                     }
2360                     else { pmflag(flagsp, *RExC_parse); }
2361
2362                     ++RExC_parse;
2363                 }
2364                 if (*RExC_parse == '-') {
2365                     flagsp = &negflags;
2366                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2367                     ++RExC_parse;
2368                     goto parse_flags;
2369                 }
2370                 RExC_flags16 |= posflags;
2371                 RExC_flags16 &= ~negflags;
2372                 if (*RExC_parse == ':') {
2373                     RExC_parse++;
2374                     paren = ':';
2375                     break;
2376                 }               
2377               unknown:
2378                 if (*RExC_parse != ')') {
2379                     RExC_parse++;
2380                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2381                 }
2382                 nextchar(pRExC_state);
2383                 *flagp = TRYAGAIN;
2384                 return NULL;
2385             }
2386         }
2387         else {                  /* (...) */
2388             parno = RExC_npar;
2389             RExC_npar++;
2390             ret = reganode(pRExC_state, OPEN, parno);
2391             Set_Node_Length(ret, 1); /* MJD */
2392             Set_Node_Offset(ret, RExC_parse); /* MJD */
2393             open = 1;
2394         }
2395     }
2396     else                        /* ! paren */
2397         ret = NULL;
2398
2399     /* Pick up the branches, linking them together. */
2400     parse_start = RExC_parse;   /* MJD */
2401     br = regbranch(pRExC_state, &flags, 1);
2402     /*     branch_len = (paren != 0); */
2403     
2404     if (br == NULL)
2405         return(NULL);
2406     if (*RExC_parse == '|') {
2407         if (!SIZE_ONLY && RExC_extralen) {
2408             reginsert(pRExC_state, BRANCHJ, br);
2409         }
2410         else {                  /* MJD */
2411             reginsert(pRExC_state, BRANCH, br);
2412             Set_Node_Length(br, paren != 0);
2413             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2414         }
2415         have_branch = 1;
2416         if (SIZE_ONLY)
2417             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2418     }
2419     else if (paren == ':') {
2420         *flagp |= flags&SIMPLE;
2421     }
2422     if (open) {                         /* Starts with OPEN. */
2423         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2424     }
2425     else if (paren != '?')              /* Not Conditional */
2426         ret = br;
2427     *flagp |= flags & (SPSTART | HASWIDTH);
2428     lastbr = br;
2429     while (*RExC_parse == '|') {
2430         if (!SIZE_ONLY && RExC_extralen) {
2431             ender = reganode(pRExC_state, LONGJMP,0);
2432             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2433         }
2434         if (SIZE_ONLY)
2435             RExC_extralen += 2;         /* Account for LONGJMP. */
2436         nextchar(pRExC_state);
2437         br = regbranch(pRExC_state, &flags, 0);
2438         
2439         if (br == NULL)
2440             return(NULL);
2441         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2442         lastbr = br;
2443         if (flags&HASWIDTH)
2444             *flagp |= HASWIDTH;
2445         *flagp |= flags&SPSTART;
2446     }
2447
2448     if (have_branch || paren != ':') {
2449         /* Make a closing node, and hook it on the end. */
2450         switch (paren) {
2451         case ':':
2452             ender = reg_node(pRExC_state, TAIL);
2453             break;
2454         case 1:
2455             ender = reganode(pRExC_state, CLOSE, parno);
2456             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2457             Set_Node_Length(ender,1); /* MJD */
2458             break;
2459         case '<':
2460         case ',':
2461         case '=':
2462         case '!':
2463             *flagp &= ~HASWIDTH;
2464             /* FALL THROUGH */
2465         case '>':
2466             ender = reg_node(pRExC_state, SUCCEED);
2467             break;
2468         case 0:
2469             ender = reg_node(pRExC_state, END);
2470             break;
2471         }
2472         regtail(pRExC_state, lastbr, ender);
2473
2474         if (have_branch) {
2475             /* Hook the tails of the branches to the closing node. */
2476             for (br = ret; br != NULL; br = regnext(br)) {
2477                 regoptail(pRExC_state, br, ender);
2478             }
2479         }
2480     }
2481
2482     {
2483         char *p;
2484         static char parens[] = "=!<,>";
2485
2486         if (paren && (p = strchr(parens, paren))) {
2487             int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2488             int flag = (p - parens) > 1;
2489
2490             if (paren == '>')
2491                 node = SUSPEND, flag = 0;
2492             reginsert(pRExC_state, node,ret);
2493             ret->flags = flag;
2494             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2495         }
2496     }
2497
2498     /* Check for proper termination. */
2499     if (paren) {
2500         RExC_flags16 = oregflags;
2501         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2502             RExC_parse = oregcomp_parse;
2503             vFAIL("Unmatched (");
2504         }
2505     }
2506     else if (!paren && RExC_parse < RExC_end) {
2507         if (*RExC_parse == ')') {
2508             RExC_parse++;
2509             vFAIL("Unmatched )");
2510         }
2511         else
2512             FAIL("Junk on end of regexp");      /* "Can't happen". */
2513         /* NOTREACHED */
2514     }
2515
2516     return(ret);
2517 }
2518
2519 /*
2520  - regbranch - one alternative of an | operator
2521  *
2522  * Implements the concatenation operator.
2523  */
2524 STATIC regnode *
2525 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2526 {
2527     register regnode *ret;
2528     register regnode *chain = NULL;
2529     register regnode *latest;
2530     I32 flags = 0, c = 0;
2531
2532     if (first)
2533         ret = NULL;
2534     else {
2535         if (!SIZE_ONLY && RExC_extralen)
2536             ret = reganode(pRExC_state, BRANCHJ,0);
2537         else {
2538             ret = reg_node(pRExC_state, BRANCH);
2539             Set_Node_Length(ret, 1);
2540         }
2541     }
2542         
2543     if (!first && SIZE_ONLY)
2544         RExC_extralen += 1;                     /* BRANCHJ */
2545
2546     *flagp = WORST;                     /* Tentatively. */
2547
2548     RExC_parse--;
2549     nextchar(pRExC_state);
2550     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2551         flags &= ~TRYAGAIN;
2552         latest = regpiece(pRExC_state, &flags);
2553         if (latest == NULL) {
2554             if (flags & TRYAGAIN)
2555                 continue;
2556             return(NULL);
2557         }
2558         else if (ret == NULL)
2559             ret = latest;
2560         *flagp |= flags&HASWIDTH;
2561         if (chain == NULL)      /* First piece. */
2562             *flagp |= flags&SPSTART;
2563         else {
2564             RExC_naughty++;
2565             regtail(pRExC_state, chain, latest);
2566         }
2567         chain = latest;
2568         c++;
2569     }
2570     if (chain == NULL) {        /* Loop ran zero times. */
2571         chain = reg_node(pRExC_state, NOTHING);
2572         if (ret == NULL)
2573             ret = chain;
2574     }
2575     if (c == 1) {
2576         *flagp |= flags&SIMPLE;
2577     }
2578
2579     return(ret);
2580 }
2581
2582 /*
2583  - regpiece - something followed by possible [*+?]
2584  *
2585  * Note that the branching code sequences used for ? and the general cases
2586  * of * and + are somewhat optimized:  they use the same NOTHING node as
2587  * both the endmarker for their branch list and the body of the last branch.
2588  * It might seem that this node could be dispensed with entirely, but the
2589  * endmarker role is not redundant.
2590  */
2591 STATIC regnode *
2592 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2593 {
2594     register regnode *ret;
2595     register char op;
2596     register char *next;
2597     I32 flags;
2598     char *origparse = RExC_parse;
2599     char *maxpos;
2600     I32 min;
2601     I32 max = REG_INFTY;
2602     char *parse_start;
2603
2604     ret = regatom(pRExC_state, &flags);
2605     if (ret == NULL) {
2606         if (flags & TRYAGAIN)
2607             *flagp |= TRYAGAIN;
2608         return(NULL);
2609     }
2610
2611     op = *RExC_parse;
2612
2613     if (op == '{' && regcurly(RExC_parse)) {
2614         parse_start = RExC_parse; /* MJD */
2615         next = RExC_parse + 1;
2616         maxpos = Nullch;
2617         while (isDIGIT(*next) || *next == ',') {
2618             if (*next == ',') {
2619                 if (maxpos)
2620                     break;
2621                 else
2622                     maxpos = next;
2623             }
2624             next++;
2625         }
2626         if (*next == '}') {             /* got one */
2627             if (!maxpos)
2628                 maxpos = next;
2629             RExC_parse++;
2630             min = atoi(RExC_parse);
2631             if (*maxpos == ',')
2632                 maxpos++;
2633             else
2634                 maxpos = RExC_parse;
2635             max = atoi(maxpos);
2636             if (!max && *maxpos != '0')
2637                 max = REG_INFTY;                /* meaning "infinity" */
2638             else if (max >= REG_INFTY)
2639                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2640             RExC_parse = next;
2641             nextchar(pRExC_state);
2642
2643         do_curly:
2644             if ((flags&SIMPLE)) {
2645                 RExC_naughty += 2 + RExC_naughty / 2;
2646                 reginsert(pRExC_state, CURLY, ret);
2647                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2648                 Set_Node_Cur_Length(ret);
2649             }
2650             else {
2651                 regnode *w = reg_node(pRExC_state, WHILEM);
2652
2653                 w->flags = 0;
2654                 regtail(pRExC_state, ret, w);
2655                 if (!SIZE_ONLY && RExC_extralen) {
2656                     reginsert(pRExC_state, LONGJMP,ret);
2657                     reginsert(pRExC_state, NOTHING,ret);
2658                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2659                 }
2660                 reginsert(pRExC_state, CURLYX,ret);
2661                                 /* MJD hk */
2662                 Set_Node_Offset(ret, parse_start+1);
2663                 Set_Node_Length(ret, 
2664                                 op == '{' ? (RExC_parse - parse_start) : 1);
2665                 
2666                 if (!SIZE_ONLY && RExC_extralen)
2667                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2668                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2669                 if (SIZE_ONLY)
2670                     RExC_whilem_seen++, RExC_extralen += 3;
2671                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2672             }
2673             ret->flags = 0;
2674
2675             if (min > 0)
2676                 *flagp = WORST;
2677             if (max > 0)
2678                 *flagp |= HASWIDTH;
2679             if (max && max < min)
2680                 vFAIL("Can't do {n,m} with n > m");
2681             if (!SIZE_ONLY) {
2682                 ARG1_SET(ret, min);
2683                 ARG2_SET(ret, max);
2684             }
2685
2686             goto nest_check;
2687         }
2688     }
2689
2690     if (!ISMULT1(op)) {
2691         *flagp = flags;
2692         return(ret);
2693     }
2694
2695 #if 0                           /* Now runtime fix should be reliable. */
2696
2697     /* if this is reinstated, don't forget to put this back into perldiag:
2698
2699             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2700
2701            (F) The part of the regexp subject to either the * or + quantifier
2702            could match an empty string. The {#} shows in the regular
2703            expression about where the problem was discovered.
2704
2705     */
2706
2707     if (!(flags&HASWIDTH) && op != '?')
2708       vFAIL("Regexp *+ operand could be empty");
2709 #endif
2710
2711     parse_start = RExC_parse;
2712     nextchar(pRExC_state);
2713
2714     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2715
2716     if (op == '*' && (flags&SIMPLE)) {
2717         reginsert(pRExC_state, STAR, ret);
2718         ret->flags = 0;
2719         RExC_naughty += 4;
2720     }
2721     else if (op == '*') {
2722         min = 0;
2723         goto do_curly;
2724     }
2725     else if (op == '+' && (flags&SIMPLE)) {
2726         reginsert(pRExC_state, PLUS, ret);
2727         ret->flags = 0;
2728         RExC_naughty += 3;
2729     }
2730     else if (op == '+') {
2731         min = 1;
2732         goto do_curly;
2733     }
2734     else if (op == '?') {
2735         min = 0; max = 1;
2736         goto do_curly;
2737     }
2738   nest_check:
2739     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2740         vWARN3(RExC_parse,
2741                "%.*s matches null string many times",
2742                RExC_parse - origparse,
2743                origparse);
2744     }
2745
2746     if (*RExC_parse == '?') {
2747         nextchar(pRExC_state);
2748         reginsert(pRExC_state, MINMOD, ret);
2749         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2750     }
2751     if (ISMULT2(RExC_parse)) {
2752         RExC_parse++;
2753         vFAIL("Nested quantifiers");
2754     }
2755
2756     return(ret);
2757 }
2758
2759 /*
2760  - regatom - the lowest level
2761  *
2762  * Optimization:  gobbles an entire sequence of ordinary characters so that
2763  * it can turn them into a single node, which is smaller to store and
2764  * faster to run.  Backslashed characters are exceptions, each becoming a
2765  * separate node; the code is simpler that way and it's not worth fixing.
2766  *
2767  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2768 STATIC regnode *
2769 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2770 {
2771     register regnode *ret = 0;
2772     I32 flags;
2773     char *parse_start = 0;
2774
2775     *flagp = WORST;             /* Tentatively. */
2776
2777 tryagain:
2778     switch (*RExC_parse) {
2779     case '^':
2780         RExC_seen_zerolen++;
2781         nextchar(pRExC_state);
2782         if (RExC_flags16 & PMf_MULTILINE)
2783             ret = reg_node(pRExC_state, MBOL);
2784         else if (RExC_flags16 & PMf_SINGLELINE)
2785             ret = reg_node(pRExC_state, SBOL);
2786         else
2787             ret = reg_node(pRExC_state, BOL);
2788         Set_Node_Length(ret, 1); /* MJD */
2789         break;
2790     case '$':
2791         nextchar(pRExC_state);
2792         if (*RExC_parse)
2793             RExC_seen_zerolen++;
2794         if (RExC_flags16 & PMf_MULTILINE)
2795             ret = reg_node(pRExC_state, MEOL);
2796         else if (RExC_flags16 & PMf_SINGLELINE)
2797             ret = reg_node(pRExC_state, SEOL);
2798         else
2799             ret = reg_node(pRExC_state, EOL);
2800         Set_Node_Length(ret, 1); /* MJD */
2801         break;
2802     case '.':
2803         nextchar(pRExC_state);
2804         if (RExC_flags16 & PMf_SINGLELINE)
2805             ret = reg_node(pRExC_state, SANY);
2806         else
2807             ret = reg_node(pRExC_state, REG_ANY);
2808         *flagp |= HASWIDTH|SIMPLE;
2809         RExC_naughty++;
2810         Set_Node_Length(ret, 1); /* MJD */
2811         break;
2812     case '[':
2813     {
2814         char *oregcomp_parse = ++RExC_parse;
2815         ret = regclass(pRExC_state);
2816         if (*RExC_parse != ']') {
2817             RExC_parse = oregcomp_parse;
2818             vFAIL("Unmatched [");
2819         }
2820         nextchar(pRExC_state);
2821         *flagp |= HASWIDTH|SIMPLE;
2822         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2823         break;
2824     }
2825     case '(':
2826         nextchar(pRExC_state);
2827         ret = reg(pRExC_state, 1, &flags);
2828         if (ret == NULL) {
2829                 if (flags & TRYAGAIN) {
2830                     if (RExC_parse == RExC_end) {
2831                          /* Make parent create an empty node if needed. */
2832                         *flagp |= TRYAGAIN;
2833                         return(NULL);
2834                     }
2835                     goto tryagain;
2836                 }
2837                 return(NULL);
2838         }
2839         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2840         break;
2841     case '|':
2842     case ')':
2843         if (flags & TRYAGAIN) {
2844             *flagp |= TRYAGAIN;
2845             return NULL;
2846         }
2847         vFAIL("Internal urp");
2848                                 /* Supposed to be caught earlier. */
2849         break;
2850     case '{':
2851         if (!regcurly(RExC_parse)) {
2852             RExC_parse++;
2853             goto defchar;
2854         }
2855         /* FALL THROUGH */
2856     case '?':
2857     case '+':
2858     case '*':
2859         RExC_parse++;
2860         vFAIL("Quantifier follows nothing");
2861         break;
2862     case '\\':
2863         switch (*++RExC_parse) {
2864         case 'A':
2865             RExC_seen_zerolen++;
2866             ret = reg_node(pRExC_state, SBOL);
2867             *flagp |= SIMPLE;
2868             nextchar(pRExC_state);
2869             Set_Node_Length(ret, 2); /* MJD */
2870             break;
2871         case 'G':
2872             ret = reg_node(pRExC_state, GPOS);
2873             RExC_seen |= REG_SEEN_GPOS;
2874             *flagp |= SIMPLE;
2875             nextchar(pRExC_state);
2876             Set_Node_Length(ret, 2); /* MJD */
2877             break;
2878         case 'Z':
2879             ret = reg_node(pRExC_state, SEOL);
2880             *flagp |= SIMPLE;
2881             RExC_seen_zerolen++;                /* Do not optimize RE away */
2882             nextchar(pRExC_state);
2883             break;
2884         case 'z':
2885             ret = reg_node(pRExC_state, EOS);
2886             *flagp |= SIMPLE;
2887             RExC_seen_zerolen++;                /* Do not optimize RE away */
2888             nextchar(pRExC_state);
2889             Set_Node_Length(ret, 2); /* MJD */
2890             break;
2891         case 'C':
2892             ret = reg_node(pRExC_state, CANY);
2893             RExC_seen |= REG_SEEN_CANY;
2894             *flagp |= HASWIDTH|SIMPLE;
2895             nextchar(pRExC_state);
2896             Set_Node_Length(ret, 2); /* MJD */
2897             break;
2898         case 'X':
2899             ret = reg_node(pRExC_state, CLUMP);
2900             *flagp |= HASWIDTH;
2901             nextchar(pRExC_state);
2902             Set_Node_Length(ret, 2); /* MJD */
2903             break;
2904         case 'w':
2905             ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
2906             *flagp |= HASWIDTH|SIMPLE;
2907             nextchar(pRExC_state);
2908             Set_Node_Length(ret, 2); /* MJD */
2909             break;
2910         case 'W':
2911             ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
2912             *flagp |= HASWIDTH|SIMPLE;
2913             nextchar(pRExC_state);
2914             Set_Node_Length(ret, 2); /* MJD */
2915             break;
2916         case 'b':
2917             RExC_seen_zerolen++;
2918             RExC_seen |= REG_SEEN_LOOKBEHIND;
2919             ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
2920             *flagp |= SIMPLE;
2921             nextchar(pRExC_state);
2922             Set_Node_Length(ret, 2); /* MJD */
2923             break;
2924         case 'B':
2925             RExC_seen_zerolen++;
2926             RExC_seen |= REG_SEEN_LOOKBEHIND;
2927             ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
2928             *flagp |= SIMPLE;
2929             nextchar(pRExC_state);
2930             Set_Node_Length(ret, 2); /* MJD */
2931             break;
2932         case 's':
2933             ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
2934             *flagp |= HASWIDTH|SIMPLE;
2935             nextchar(pRExC_state);
2936             Set_Node_Length(ret, 2); /* MJD */
2937             break;
2938         case 'S':
2939             ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
2940             *flagp |= HASWIDTH|SIMPLE;
2941             nextchar(pRExC_state);
2942             Set_Node_Length(ret, 2); /* MJD */
2943             break;
2944         case 'd':
2945             ret = reg_node(pRExC_state, DIGIT);
2946             *flagp |= HASWIDTH|SIMPLE;
2947             nextchar(pRExC_state);
2948             Set_Node_Length(ret, 2); /* MJD */
2949             break;
2950         case 'D':
2951             ret = reg_node(pRExC_state, NDIGIT);
2952             *flagp |= HASWIDTH|SIMPLE;
2953             nextchar(pRExC_state);
2954             Set_Node_Length(ret, 2); /* MJD */
2955             break;
2956         case 'p':
2957         case 'P':
2958             {   
2959                 char* oldregxend = RExC_end;
2960                 char* parse_start = RExC_parse;
2961
2962                 if (RExC_parse[1] == '{') {
2963                   /* a lovely hack--pretend we saw [\pX] instead */
2964                     RExC_end = strchr(RExC_parse, '}');
2965                     if (!RExC_end) {
2966                         U8 c = (U8)*RExC_parse;
2967                         RExC_parse += 2;
2968                         RExC_end = oldregxend;
2969                         vFAIL2("Missing right brace on \\%c{}", c);
2970                     }
2971                     RExC_end++;
2972                 }
2973                 else
2974                     RExC_end = RExC_parse + 2;
2975                 RExC_parse--;
2976
2977                 ret = regclass(pRExC_state);
2978
2979                 RExC_end = oldregxend;
2980                 RExC_parse--;
2981                 Set_Node_Cur_Length(ret); /* MJD */
2982                 nextchar(pRExC_state);
2983                 *flagp |= HASWIDTH|SIMPLE;
2984             }
2985             break;
2986         case 'n':
2987         case 'r':
2988         case 't':
2989         case 'f':
2990         case 'e':
2991         case 'a':
2992         case 'x':
2993         case 'c':
2994         case '0':
2995             goto defchar;
2996         case '1': case '2': case '3': case '4':
2997         case '5': case '6': case '7': case '8': case '9':
2998             {
2999                 I32 num = atoi(RExC_parse);
3000
3001                 if (num > 9 && num >= RExC_npar)
3002                     goto defchar;
3003                 else {
3004                     char * parse_start = RExC_parse - 1; /* MJD */
3005                     while (isDIGIT(*RExC_parse))
3006                         RExC_parse++;
3007
3008                     if (!SIZE_ONLY && num > RExC_rx->nparens)
3009                         vFAIL("Reference to nonexistent group");
3010                     RExC_sawback = 1;
3011                     ret = reganode(pRExC_state, FOLD
3012                                    ? (LOC ? REFFL : REFF)
3013                                    : REF, num);
3014                     *flagp |= HASWIDTH;
3015                     
3016                     /* override incorrect value set in reganode MJD */
3017                     Set_Node_Offset(ret, parse_start+1); 
3018                     Set_Node_Cur_Length(ret); /* MJD */
3019                     RExC_parse--;
3020                     nextchar(pRExC_state);
3021                 }
3022             }
3023             break;
3024         case '\0':
3025             if (RExC_parse >= RExC_end)
3026                 FAIL("Trailing \\");
3027             /* FALL THROUGH */
3028         default:
3029             /* Do not generate `unrecognized' warnings here, we fall
3030                back into the quick-grab loop below */
3031             goto defchar;
3032         }
3033         break;
3034
3035     case '#':
3036         if (RExC_flags16 & PMf_EXTENDED) {
3037             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3038             if (RExC_parse < RExC_end)
3039                 goto tryagain;
3040         }
3041         /* FALL THROUGH */
3042
3043     default: {
3044             register STRLEN len;
3045             register UV ender;
3046             register char *p;
3047             char *oldp, *s;
3048             STRLEN numlen;
3049             STRLEN foldlen;
3050             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3051
3052             parse_start = RExC_parse - 1;
3053
3054             RExC_parse++;
3055
3056         defchar:
3057             ender = 0;
3058             ret = reg_node(pRExC_state, FOLD
3059                           ? (LOC ? EXACTFL : EXACTF)
3060                           : EXACT);
3061             s = STRING(ret);
3062             for (len = 0, p = RExC_parse - 1;
3063               len < 127 && p < RExC_end;
3064               len++)
3065             {
3066                 oldp = p;
3067
3068                 if (RExC_flags16 & PMf_EXTENDED)
3069                     p = regwhite(p, RExC_end);
3070                 switch (*p) {
3071                 case '^':
3072                 case '$':
3073                 case '.':
3074                 case '[':
3075                 case '(':
3076                 case ')':
3077                 case '|':
3078                     goto loopdone;
3079                 case '\\':
3080                     switch (*++p) {
3081                     case 'A':
3082                     case 'C':
3083                     case 'X':
3084                     case 'G':
3085                     case 'Z':
3086                     case 'z':
3087                     case 'w':
3088                     case 'W':
3089                     case 'b':
3090                     case 'B':
3091                     case 's':
3092                     case 'S':
3093                     case 'd':
3094                     case 'D':
3095                     case 'p':
3096                     case 'P':
3097                         --p;
3098                         goto loopdone;
3099                     case 'n':
3100                         ender = '\n';
3101                         p++;
3102                         break;
3103                     case 'r':
3104                         ender = '\r';
3105                         p++;
3106                         break;
3107                     case 't':
3108                         ender = '\t';
3109                         p++;
3110                         break;
3111                     case 'f':
3112                         ender = '\f';
3113                         p++;
3114                         break;
3115                     case 'e':
3116                           ender = ASCII_TO_NATIVE('\033');
3117                         p++;
3118                         break;
3119                     case 'a':
3120                           ender = ASCII_TO_NATIVE('\007');
3121                         p++;
3122                         break;
3123                     case 'x':
3124                         if (*++p == '{') {
3125                             char* e = strchr(p, '}');
3126         
3127                             if (!e) {
3128                                 RExC_parse = p + 1;
3129                                 vFAIL("Missing right brace on \\x{}");
3130                             }
3131                             else {
3132                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3133                                     | PERL_SCAN_DISALLOW_PREFIX;
3134                                 numlen = e - p - 1;
3135                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3136                                 if (ender > 0xff)
3137                                     RExC_utf8 = 1;
3138                                 /* numlen is generous */
3139                                 if (numlen + len >= 127) {
3140                                     p--;
3141                                     goto loopdone;
3142                                 }
3143                                 p = e + 1;
3144                             }
3145                         }
3146                         else {
3147                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3148                             numlen = 2;
3149                             ender = grok_hex(p, &numlen, &flags, NULL);
3150                             p += numlen;
3151                         }
3152                         break;
3153                     case 'c':
3154                         p++;
3155                         ender = UCHARAT(p++);
3156                         ender = toCTRL(ender);
3157                         break;
3158                     case '0': case '1': case '2': case '3':case '4':
3159                     case '5': case '6': case '7': case '8':case '9':
3160                         if (*p == '0' ||
3161                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3162                             I32 flags = 0;
3163                             numlen = 3;
3164                             ender = grok_oct(p, &numlen, &flags, NULL);
3165                             p += numlen;
3166                         }
3167                         else {
3168                             --p;
3169                             goto loopdone;
3170                         }
3171                         break;
3172                     case '\0':
3173                         if (p >= RExC_end)
3174                             FAIL("Trailing \\");
3175                         /* FALL THROUGH */
3176                     default:
3177                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3178                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3179                         goto normal_default;
3180                     }
3181                     break;
3182                 default:
3183                   normal_default:
3184                     if (UTF8_IS_START(*p) && UTF) {
3185                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3186                                                &numlen, 0);
3187                         p += numlen;
3188                     }
3189                     else
3190                         ender = *p++;
3191                     break;
3192                 }
3193                 if (RExC_flags16 & PMf_EXTENDED)
3194                     p = regwhite(p, RExC_end);
3195                 if (UTF && FOLD) {
3196                     /* Prime the casefolded buffer. */
3197                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3198                 }
3199                 if (ISMULT2(p)) { /* Back off on ?+*. */
3200                     if (len)
3201                         p = oldp;
3202                     else if (UTF) {
3203                          STRLEN unilen;
3204
3205                          if (FOLD) {
3206                               /* Emit all the Unicode characters. */
3207                               for (foldbuf = tmpbuf;
3208                                    foldlen;
3209                                    foldlen -= numlen) {
3210                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3211                                    if (numlen > 0) {
3212                                         reguni(pRExC_state, ender, s, &unilen);
3213                                         s       += unilen;
3214                                         len     += unilen;
3215                                         /* In EBCDIC the numlen
3216                                          * and unilen can differ. */
3217                                         foldbuf += numlen;
3218                                         if (numlen >= foldlen)
3219                                              break;
3220                                    }
3221                                    else
3222                                         break; /* "Can't happen." */
3223                               }
3224                          }
3225                          else {
3226                               reguni(pRExC_state, ender, s, &unilen);
3227                               if (unilen > 0) {
3228                                    s   += unilen;
3229                                    len += unilen;
3230                               }
3231                          }
3232                     }
3233                     else {
3234                         len++;
3235                         REGC(ender, s++);
3236                     }
3237                     break;
3238                 }
3239                 if (UTF) {
3240                      STRLEN unilen;
3241
3242                      if (FOLD) {
3243                           /* Emit all the Unicode characters. */
3244                           for (foldbuf = tmpbuf;
3245                                foldlen;
3246                                foldlen -= numlen) {
3247                                ender = utf8_to_uvchr(foldbuf, &numlen);
3248                                if (numlen > 0) {
3249                                     reguni(pRExC_state, ender, s, &unilen);
3250                                     len     += unilen;
3251                                     s       += unilen;
3252                                     /* In EBCDIC the numlen
3253                                      * and unilen can differ. */
3254                                     foldbuf += numlen;
3255                                     if (numlen >= foldlen)
3256                                          break;
3257                                }
3258                                else
3259                                     break;
3260                           }
3261                      }
3262                      else {
3263                           reguni(pRExC_state, ender, s, &unilen);
3264                           if (unilen > 0) {
3265                                s   += unilen;
3266                                len += unilen;
3267                           }
3268                      }
3269                      len--;
3270                 }
3271                 else
3272                     REGC(ender, s++);
3273             }
3274         loopdone:
3275             RExC_parse = p - 1;
3276             Set_Node_Cur_Length(ret); /* MJD */
3277             nextchar(pRExC_state);
3278             {
3279                 /* len is STRLEN which is unsigned, need to copy to signed */
3280                 IV iv = len;
3281                 if (iv < 0)
3282                     vFAIL("Internal disaster");
3283             }
3284             if (len > 0)
3285                 *flagp |= HASWIDTH;
3286             if (len == 1)
3287                 *flagp |= SIMPLE;
3288             if (!SIZE_ONLY)
3289                 STR_LEN(ret) = len;
3290             if (SIZE_ONLY)
3291                 RExC_size += STR_SZ(len);
3292             else
3293                 RExC_emit += STR_SZ(len);
3294         }
3295         break;
3296     }
3297
3298     /* If the encoding pragma is in effect recode the text of
3299      * any EXACT-kind nodes. */
3300     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3301          STRLEN oldlen = STR_LEN(ret);
3302          SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3303
3304          if (RExC_utf8)
3305               SvUTF8_on(sv);
3306          if (sv_utf8_downgrade(sv, TRUE)) {
3307               char *s       = sv_recode_to_utf8(sv, PL_encoding);
3308               STRLEN newlen = SvCUR(sv);
3309          
3310               if (!SIZE_ONLY) {
3311                    DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3312                                          (int)oldlen, STRING(ret),
3313                                          (int)newlen, s));
3314                    Copy(s, STRING(ret), newlen, char);
3315                    STR_LEN(ret) += newlen - oldlen;
3316                    RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3317               } else
3318                    RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3319          }
3320     }
3321
3322     return(ret);
3323 }
3324
3325 STATIC char *
3326 S_regwhite(pTHX_ char *p, char *e)
3327 {
3328     while (p < e) {
3329         if (isSPACE(*p))
3330             ++p;
3331         else if (*p == '#') {
3332             do {
3333                 p++;
3334             } while (p < e && *p != '\n');
3335         }
3336         else
3337             break;
3338     }
3339     return p;
3340 }
3341
3342 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3343    Character classes ([:foo:]) can also be negated ([:^foo:]).
3344    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3345    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3346    but trigger failures because they are currently unimplemented. */
3347
3348 #define POSIXCC_DONE(c)   ((c) == ':')
3349 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3350 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3351
3352 STATIC I32
3353 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3354 {
3355     char *posixcc = 0;
3356     I32 namedclass = OOB_NAMEDCLASS;
3357
3358     if (value == '[' && RExC_parse + 1 < RExC_end &&
3359         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3360         POSIXCC(UCHARAT(RExC_parse))) {
3361         char  c = UCHARAT(RExC_parse);
3362         char* s = RExC_parse++;
3363         
3364         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3365             RExC_parse++;
3366         if (RExC_parse == RExC_end)
3367             /* Grandfather lone [:, [=, [. */
3368             RExC_parse = s;
3369         else {
3370             char* t = RExC_parse++; /* skip over the c */
3371
3372             if (UCHARAT(RExC_parse) == ']') {
3373                 RExC_parse++; /* skip over the ending ] */
3374                 posixcc = s + 1;
3375                 if (*s == ':') {
3376                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3377                     I32 skip = 5; /* the most common skip */
3378
3379                     switch (*posixcc) {
3380                     case 'a':
3381                         if (strnEQ(posixcc, "alnum", 5))
3382                             namedclass =
3383                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3384                         else if (strnEQ(posixcc, "alpha", 5))
3385                             namedclass =
3386                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3387                         else if (strnEQ(posixcc, "ascii", 5))
3388                             namedclass =
3389                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3390                         break;
3391                     case 'b':
3392                         if (strnEQ(posixcc, "blank", 5))
3393                             namedclass =
3394                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3395                         break;
3396                     case 'c':
3397                         if (strnEQ(posixcc, "cntrl", 5))
3398                             namedclass =
3399                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3400                         break;
3401                     case 'd':
3402                         if (strnEQ(posixcc, "digit", 5))
3403                             namedclass =
3404                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3405                         break;
3406                     case 'g':
3407                         if (strnEQ(posixcc, "graph", 5))
3408                             namedclass =
3409                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3410                         break;
3411                     case 'l':
3412                         if (strnEQ(posixcc, "lower", 5))
3413                             namedclass =
3414                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3415                         break;
3416                     case 'p':
3417                         if (strnEQ(posixcc, "print", 5))
3418                             namedclass =
3419                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3420                         else if (strnEQ(posixcc, "punct", 5))
3421                             namedclass =
3422                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3423                         break;
3424                     case 's':
3425                         if (strnEQ(posixcc, "space", 5))
3426                             namedclass =
3427                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3428                         break;
3429                     case 'u':
3430                         if (strnEQ(posixcc, "upper", 5))
3431                             namedclass =
3432                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3433                         break;
3434                     case 'w': /* this is not POSIX, this is the Perl \w */
3435                         if (strnEQ(posixcc, "word", 4)) {
3436                             namedclass =
3437                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3438                             skip = 4;
3439                         }
3440                         break;
3441                     case 'x':
3442                         if (strnEQ(posixcc, "xdigit", 6)) {
3443                             namedclass =
3444                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3445                             skip = 6;
3446                         }
3447                         break;
3448                     }
3449                     if (namedclass == OOB_NAMEDCLASS ||
3450                         posixcc[skip] != ':' ||
3451                         posixcc[skip+1] != ']')
3452                     {
3453                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3454                                       t - s - 1, s + 1);
3455                     }
3456                 } else if (!SIZE_ONLY) {
3457                     /* [[=foo=]] and [[.foo.]] are still future. */
3458
3459                     /* adjust RExC_parse so the warning shows after
3460                        the class closes */
3461                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3462                         RExC_parse++;
3463                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3464                 }
3465             } else {
3466                 /* Maternal grandfather:
3467                  * "[:" ending in ":" but not in ":]" */
3468                 RExC_parse = s;
3469             }
3470         }
3471     }
3472
3473     return namedclass;
3474 }
3475
3476 STATIC void
3477 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3478 {
3479     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3480         char *s = RExC_parse;
3481         char  c = *s++;
3482
3483         while(*s && isALNUM(*s))
3484             s++;
3485         if (*s && c == *s && s[1] == ']') {
3486             vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3487
3488             /* [[=foo=]] and [[.foo.]] are still future. */
3489             if (POSIXCC_NOTYET(c)) {
3490                 /* adjust RExC_parse so the error shows after
3491                    the class closes */
3492                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3493                     ;
3494                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3495             }
3496         }
3497     }
3498 }
3499
3500 STATIC regnode *
3501 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3502 {
3503     register UV value;
3504     register UV nextvalue;
3505     register IV prevvalue = OOB_UNICODE;
3506     register IV range = 0;
3507     register regnode *ret;
3508     STRLEN numlen;
3509     IV namedclass;
3510     char *rangebegin = 0;
3511     bool need_class = 0;
3512     SV *listsv = Nullsv;
3513     register char *e;
3514     UV n;
3515     bool optimize_invert   = TRUE;
3516     AV* unicode_alternate  = 0;
3517
3518     ret = reganode(pRExC_state, ANYOF, 0);
3519
3520     if (!SIZE_ONLY)
3521         ANYOF_FLAGS(ret) = 0;
3522
3523     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3524         RExC_naughty++;
3525         RExC_parse++;
3526         if (!SIZE_ONLY)
3527             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3528     }
3529
3530     if (SIZE_ONLY)
3531         RExC_size += ANYOF_SKIP;
3532     else {
3533         RExC_emit += ANYOF_SKIP;
3534         if (FOLD)
3535             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3536         if (LOC)
3537             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3538         ANYOF_BITMAP_ZERO(ret);
3539         listsv = newSVpvn("# comment\n", 10);
3540     }
3541
3542     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3543
3544     if (!SIZE_ONLY && POSIXCC(nextvalue))
3545         checkposixcc(pRExC_state);
3546
3547     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3548     if (UCHARAT(RExC_parse) == ']')
3549         goto charclassloop;
3550
3551     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3552
3553     charclassloop:
3554
3555         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3556
3557         if (!range)
3558             rangebegin = RExC_parse;
3559         if (UTF) {
3560             value = utf8n_to_uvchr((U8*)RExC_parse,
3561                                    RExC_end - RExC_parse,
3562                                    &numlen, 0);
3563             RExC_parse += numlen;
3564         }
3565         else
3566             value = UCHARAT(RExC_parse++);
3567         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3568         if (value == '[' && POSIXCC(nextvalue))
3569             namedclass = regpposixcc(pRExC_state, value);
3570         else if (value == '\\') {
3571             if (UTF) {
3572                 value = utf8n_to_uvchr((U8*)RExC_parse,
3573                                    RExC_end - RExC_parse,
3574                                    &numlen, 0);
3575                 RExC_parse += numlen;
3576             }
3577             else
3578                 value = UCHARAT(RExC_parse++);
3579             /* Some compilers cannot handle switching on 64-bit integer
3580              * values, therefore value cannot be an UV.  Yes, this will
3581              * be a problem later if we want switch on Unicode.
3582              * A similar issue a little bit later when switching on
3583              * namedclass. --jhi */
3584             switch ((I32)value) {
3585             case 'w':   namedclass = ANYOF_ALNUM;       break;
3586             case 'W':   namedclass = ANYOF_NALNUM;      break;
3587             case 's':   namedclass = ANYOF_SPACE;       break;
3588             case 'S':   namedclass = ANYOF_NSPACE;      break;
3589             case 'd':   namedclass = ANYOF_DIGIT;       break;
3590             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3591             case 'p':
3592             case 'P':
3593                 if (*RExC_parse == '{') {
3594                     U8 c = (U8)value;
3595                     e = strchr(RExC_parse++, '}');
3596                     if (!e)
3597                         vFAIL2("Missing right brace on \\%c{}", c);
3598                     while (isSPACE(UCHARAT(RExC_parse)))
3599                         RExC_parse++;
3600                     if (e == RExC_parse)
3601                         vFAIL2("Empty \\%c{}", c);
3602                     n = e - RExC_parse;
3603                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3604                         n--;
3605                 }
3606                 else {
3607                     e = RExC_parse;
3608                     n = 1;
3609                 }
3610                 if (!SIZE_ONLY) {
3611                     if (UCHARAT(RExC_parse) == '^') {
3612                          RExC_parse++;
3613                          n--;
3614                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3615                          while (isSPACE(UCHARAT(RExC_parse))) {
3616                               RExC_parse++;
3617                               n--;
3618                          }
3619                     }
3620                     if (value == 'p')
3621                          Perl_sv_catpvf(aTHX_ listsv,
3622                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3623                     else
3624                          Perl_sv_catpvf(aTHX_ listsv,
3625                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3626                 }
3627                 RExC_parse = e + 1;
3628                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3629                 continue;
3630             case 'n':   value = '\n';                   break;
3631             case 'r':   value = '\r';                   break;
3632             case 't':   value = '\t';                   break;
3633             case 'f':   value = '\f';                   break;
3634             case 'b':   value = '\b';                   break;
3635             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3636             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3637             case 'x':
3638                 if (*RExC_parse == '{') {
3639                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3640                         | PERL_SCAN_DISALLOW_PREFIX;
3641                     e = strchr(RExC_parse++, '}');
3642                     if (!e)
3643                         vFAIL("Missing right brace on \\x{}");
3644
3645                     numlen = e - RExC_parse;
3646                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3647                     RExC_parse = e + 1;
3648                 }
3649                 else {
3650                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3651                     numlen = 2;
3652                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3653                     RExC_parse += numlen;
3654                 }
3655                 break;
3656             case 'c':
3657                 value = UCHARAT(RExC_parse++);
3658                 value = toCTRL(value);
3659                 break;
3660             case '0': case '1': case '2': case '3': case '4':
3661             case '5': case '6': case '7': case '8': case '9':
3662             {
3663                 I32 flags = 0;
3664                 numlen = 3;
3665                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3666                 RExC_parse += numlen;
3667                 break;
3668             }
3669             default:
3670                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3671                     vWARN2(RExC_parse,
3672                            "Unrecognized escape \\%c in character class passed through",
3673                            (int)value);
3674                 break;
3675             }
3676         } /* end of \blah */
3677
3678         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3679
3680             if (!SIZE_ONLY && !need_class)
3681                 ANYOF_CLASS_ZERO(ret);
3682
3683             need_class = 1;
3684
3685             /* a bad range like a-\d, a-[:digit:] ? */
3686             if (range) {
3687                 if (!SIZE_ONLY) {
3688                     if (ckWARN(WARN_REGEXP))
3689                         vWARN4(RExC_parse,
3690                                "False [] range \"%*.*s\"",
3691                                RExC_parse - rangebegin,
3692                                RExC_parse - rangebegin,
3693                                rangebegin);
3694                     if (prevvalue < 256) {
3695                         ANYOF_BITMAP_SET(ret, prevvalue);
3696                         ANYOF_BITMAP_SET(ret, '-');
3697                     }
3698                     else {
3699                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3700                         Perl_sv_catpvf(aTHX_ listsv,
3701                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3702                     }
3703                 }
3704
3705                 range = 0; /* this was not a true range */
3706             }
3707
3708             if (!SIZE_ONLY) {
3709                 if (namedclass > OOB_NAMEDCLASS)
3710                     optimize_invert = FALSE;
3711                 /* Possible truncation here but in some 64-bit environments
3712                  * the compiler gets heartburn about switch on 64-bit values.
3713                  * A similar issue a little earlier when switching on value.
3714                  * --jhi */
3715                 switch ((I32)namedclass) {
3716                 case ANYOF_ALNUM:
3717                     if (LOC)
3718                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3719                     else {
3720                         for (value = 0; value < 256; value++)
3721                             if (isALNUM(value))
3722                                 ANYOF_BITMAP_SET(ret, value);
3723                     }
3724                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3725                     break;
3726                 case ANYOF_NALNUM:
3727                     if (LOC)
3728                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3729                     else {
3730                         for (value = 0; value < 256; value++)
3731                             if (!isALNUM(value))
3732                                 ANYOF_BITMAP_SET(ret, value);
3733                     }
3734                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3735                     break;
3736                 case ANYOF_ALNUMC:
3737                     if (LOC)
3738                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3739                     else {
3740                         for (value = 0; value < 256; value++)
3741                             if (isALNUMC(value))
3742                                 ANYOF_BITMAP_SET(ret, value);
3743                     }
3744                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3745                     break;
3746                 case ANYOF_NALNUMC:
3747                     if (LOC)
3748                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3749                     else {
3750                         for (value = 0; value < 256; value++)
3751                             if (!isALNUMC(value))
3752                                 ANYOF_BITMAP_SET(ret, value);
3753                     }
3754                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3755                     break;
3756                 case ANYOF_ALPHA:
3757                     if (LOC)
3758                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3759                     else {
3760                         for (value = 0; value < 256; value++)
3761                             if (isALPHA(value))
3762                                 ANYOF_BITMAP_SET(ret, value);
3763                     }
3764                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3765                     break;
3766                 case ANYOF_NALPHA:
3767                     if (LOC)
3768                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3769                     else {
3770                         for (value = 0; value < 256; value++)
3771                             if (!isALPHA(value))
3772                                 ANYOF_BITMAP_SET(ret, value);
3773                     }
3774                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3775                     break;
3776                 case ANYOF_ASCII:
3777                     if (LOC)
3778                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3779                     else {
3780 #ifndef EBCDIC
3781                         for (value = 0; value < 128; value++)
3782                             ANYOF_BITMAP_SET(ret, value);
3783 #else  /* EBCDIC */
3784                         for (value = 0; value < 256; value++) {
3785                             if (isASCII(value))
3786                                 ANYOF_BITMAP_SET(ret, value);
3787                         }
3788 #endif /* EBCDIC */
3789                     }
3790                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3791                     break;
3792                 case ANYOF_NASCII:
3793                     if (LOC)
3794                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3795                     else {
3796 #ifndef EBCDIC
3797                         for (value = 128; value < 256; value++)
3798                             ANYOF_BITMAP_SET(ret, value);
3799 #else  /* EBCDIC */
3800                         for (value = 0; value < 256; value++) {
3801                             if (!isASCII(value))
3802                                 ANYOF_BITMAP_SET(ret, value);
3803                         }
3804 #endif /* EBCDIC */
3805                     }
3806                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3807                     break;
3808                 case ANYOF_BLANK:
3809                     if (LOC)
3810                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3811                     else {
3812                         for (value = 0; value < 256; value++)
3813                             if (isBLANK(value))
3814                                 ANYOF_BITMAP_SET(ret, value);
3815                     }
3816                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3817                     break;
3818                 case ANYOF_NBLANK:
3819                     if (LOC)
3820                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3821                     else {
3822                         for (value = 0; value < 256; value++)
3823                             if (!isBLANK(value))
3824                                 ANYOF_BITMAP_SET(ret, value);
3825                     }
3826                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3827                     break;
3828                 case ANYOF_CNTRL:
3829                     if (LOC)
3830                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3831                     else {
3832                         for (value = 0; value < 256; value++)
3833                             if (isCNTRL(value))
3834                                 ANYOF_BITMAP_SET(ret, value);
3835                     }
3836                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3837                     break;
3838                 case ANYOF_NCNTRL:
3839                     if (LOC)
3840                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3841                     else {
3842                         for (value = 0; value < 256; value++)
3843                             if (!isCNTRL(value))
3844                                 ANYOF_BITMAP_SET(ret, value);
3845                     }
3846                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3847                     break;
3848                 case ANYOF_DIGIT:
3849                     if (LOC)
3850                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3851                     else {
3852                         /* consecutive digits assumed */
3853                         for (value = '0'; value <= '9'; value++)
3854                             ANYOF_BITMAP_SET(ret, value);
3855                     }
3856                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3857                     break;
3858                 case ANYOF_NDIGIT:
3859                     if (LOC)
3860                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3861                     else {
3862                         /* consecutive digits assumed */
3863                         for (value = 0; value < '0'; value++)
3864                             ANYOF_BITMAP_SET(ret, value);
3865                         for (value = '9' + 1; value < 256; value++)
3866                             ANYOF_BITMAP_SET(ret, value);
3867                     }
3868                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3869                     break;
3870                 case ANYOF_GRAPH:
3871                     if (LOC)
3872                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3873                     else {
3874                         for (value = 0; value < 256; value++)
3875                             if (isGRAPH(value))
3876                                 ANYOF_BITMAP_SET(ret, value);
3877                     }
3878                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3879                     break;
3880                 case ANYOF_NGRAPH:
3881                     if (LOC)
3882                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3883                     else {
3884                         for (value = 0; value < 256; value++)
3885                             if (!isGRAPH(value))
3886                                 ANYOF_BITMAP_SET(ret, value);
3887                     }
3888                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3889                     break;
3890                 case ANYOF_LOWER:
3891                     if (LOC)
3892                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3893                     else {
3894                         for (value = 0; value < 256; value++)
3895                             if (isLOWER(value))
3896                                 ANYOF_BITMAP_SET(ret, value);
3897                     }
3898                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3899                     break;
3900                 case ANYOF_NLOWER:
3901                     if (LOC)
3902                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3903                     else {
3904                         for (value = 0; value < 256; value++)
3905                             if (!isLOWER(value))
3906                                 ANYOF_BITMAP_SET(ret, value);
3907                     }
3908                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3909                     break;
3910                 case ANYOF_PRINT:
3911                     if (LOC)
3912                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3913                     else {
3914                         for (value = 0; value < 256; value++)
3915                             if (isPRINT(value))
3916                                 ANYOF_BITMAP_SET(ret, value);
3917                     }
3918                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3919                     break;
3920                 case ANYOF_NPRINT:
3921                     if (LOC)
3922                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3923                     else {
3924                         for (value = 0; value < 256; value++)
3925                             if (!isPRINT(value))
3926                                 ANYOF_BITMAP_SET(ret, value);
3927                     }
3928                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3929                     break;
3930                 case ANYOF_PSXSPC:
3931                     if (LOC)
3932                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3933                     else {
3934                         for (value = 0; value < 256; value++)
3935                             if (isPSXSPC(value))
3936                                 ANYOF_BITMAP_SET(ret, value);
3937                     }
3938                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3939                     break;
3940                 case ANYOF_NPSXSPC:
3941                     if (LOC)
3942                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3943                     else {
3944                         for (value = 0; value < 256; value++)
3945                             if (!isPSXSPC(value))
3946                                 ANYOF_BITMAP_SET(ret, value);
3947                     }
3948                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3949                     break;
3950                 case ANYOF_PUNCT:
3951                     if (LOC)
3952                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3953                     else {
3954                         for (value = 0; value < 256; value++)
3955                             if (isPUNCT(value))
3956                                 ANYOF_BITMAP_SET(ret, value);
3957                     }
3958                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3959                     break;
3960                 case ANYOF_NPUNCT:
3961                     if (LOC)
3962                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3963                     else {
3964                         for (value = 0; value < 256; value++)
3965                             if (!isPUNCT(value))
3966                                 ANYOF_BITMAP_SET(ret, value);
3967                     }
3968                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3969                     break;
3970                 case ANYOF_SPACE:
3971                     if (LOC)
3972                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3973                     else {
3974                         for (value = 0; value < 256; value++)
3975                             if (isSPACE(value))
3976                                 ANYOF_BITMAP_SET(ret, value);
3977                     }
3978                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3979                     break;
3980                 case ANYOF_NSPACE:
3981                     if (LOC)
3982                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3983                     else {
3984                         for (value = 0; value < 256; value++)
3985                             if (!isSPACE(value))
3986                                 ANYOF_BITMAP_SET(ret, value);
3987                     }
3988                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3989                     break;
3990                 case ANYOF_UPPER:
3991                     if (LOC)
3992                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3993                     else {
3994                         for (value = 0; value < 256; value++)
3995                             if (isUPPER(value))
3996                                 ANYOF_BITMAP_SET(ret, value);
3997                     }
3998                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
3999                     break;
4000                 case ANYOF_NUPPER:
4001                     if (LOC)
4002                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4003                     else {
4004                         for (value = 0; value < 256; value++)
4005                             if (!isUPPER(value))
4006                                 ANYOF_BITMAP_SET(ret, value);
4007                     }
4008                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4009                     break;
4010                 case ANYOF_XDIGIT:
4011                     if (LOC)
4012                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4013                     else {
4014                         for (value = 0; value < 256; value++)
4015                             if (isXDIGIT(value))
4016                                 ANYOF_BITMAP_SET(ret, value);
4017                     }
4018                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4019                     break;
4020                 case ANYOF_NXDIGIT:
4021                     if (LOC)
4022                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4023                     else {
4024                         for (value = 0; value < 256; value++)
4025                             if (!isXDIGIT(value))
4026                                 ANYOF_BITMAP_SET(ret, value);
4027                     }
4028                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4029                     break;
4030                 default:
4031                     vFAIL("Invalid [::] class");
4032                     break;
4033                 }
4034                 if (LOC)
4035                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4036                 continue;
4037             }
4038         } /* end of namedclass \blah */
4039
4040         if (range) {
4041             if (prevvalue > value) /* b-a */ {
4042                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4043                               RExC_parse - rangebegin,
4044                               RExC_parse - rangebegin,
4045                               rangebegin);
4046                 range = 0; /* not a valid range */
4047             }
4048         }
4049         else {
4050             prevvalue = value; /* save the beginning of the range */
4051             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4052                 RExC_parse[1] != ']') {
4053                 RExC_parse++;
4054
4055                 /* a bad range like \w-, [:word:]- ? */
4056                 if (namedclass > OOB_NAMEDCLASS) {
4057                     if (ckWARN(WARN_REGEXP))
4058                         vWARN4(RExC_parse,
4059                                "False [] range \"%*.*s\"",
4060                                RExC_parse - rangebegin,
4061                                RExC_parse - rangebegin,
4062                                rangebegin);
4063                     if (!SIZE_ONLY)
4064                         ANYOF_BITMAP_SET(ret, '-');
4065                 } else
4066                     range = 1;  /* yeah, it's a range! */
4067                 continue;       /* but do it the next time */
4068             }
4069         }
4070
4071         /* now is the next time */
4072         if (!SIZE_ONLY) {
4073             IV i;
4074
4075             if (prevvalue < 256) {
4076                 IV ceilvalue = value < 256 ? value : 255;
4077
4078 #ifdef EBCDIC
4079                 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4080                     (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4081                 {
4082                     if (isLOWER(prevvalue)) {
4083                         for (i = prevvalue; i <= ceilvalue; i++)
4084                             if (isLOWER(i))
4085                                 ANYOF_BITMAP_SET(ret, i);
4086                     } else {
4087                         for (i = prevvalue; i <= ceilvalue; i++)
4088                             if (isUPPER(i))
4089                                 ANYOF_BITMAP_SET(ret, i);
4090                     }
4091                 }
4092                 else
4093 #endif
4094                       for (i = prevvalue; i <= ceilvalue; i++)
4095                           ANYOF_BITMAP_SET(ret, i);
4096           }
4097           if (value > 255 || UTF) {
4098                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4099                 UV natvalue      = NATIVE_TO_UNI(value);
4100
4101                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4102                 if (prevnatvalue < natvalue) { /* what about > ? */
4103                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4104                                    prevnatvalue, natvalue);
4105                 }
4106                 else if (prevnatvalue == natvalue) {
4107                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4108                     if (FOLD) {
4109                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4110                          STRLEN foldlen;
4111                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4112
4113                          /* If folding and foldable and a single
4114                           * character, insert also the folded version
4115                           * to the charclass. */
4116                          if (f != value) {
4117                               if (foldlen == UNISKIP(f))
4118                                   Perl_sv_catpvf(aTHX_ listsv,
4119                                                  "%04"UVxf"\n", f);
4120                               else {
4121                                   /* Any multicharacter foldings
4122                                    * require the following transform:
4123                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4124                                    * where E folds into "pq" and F folds
4125                                    * into "rst", all other characters
4126                                    * fold to single characters.  We save
4127                                    * away these multicharacter foldings,
4128                                    * to be later saved as part of the
4129                                    * additional "s" data. */
4130                                   SV *sv;
4131
4132                                   if (!unicode_alternate)
4133                                       unicode_alternate = newAV();
4134                                   sv = newSVpvn((char*)foldbuf, foldlen);
4135                                   SvUTF8_on(sv);
4136                                   av_push(unicode_alternate, sv);
4137                               }
4138                          }
4139
4140                          /* If folding and the value is one of the Greek
4141                           * sigmas insert a few more sigmas to make the
4142                           * folding rules of the sigmas to work right.
4143                           * Note that not all the possible combinations
4144                           * are handled here: some of them are handled
4145                           * by the standard folding rules, and some of
4146                           * them (literal or EXACTF cases) are handled
4147                           * during runtime in regexec.c:S_find_byclass(). */
4148                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4149                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4150                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4151                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4152                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4153                          }
4154                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4155                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4156                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4157                     }
4158                 }
4159             }
4160         }
4161
4162         range = 0; /* this range (if it was one) is done now */
4163     }
4164
4165     if (need_class) {
4166         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4167         if (SIZE_ONLY)
4168             RExC_size += ANYOF_CLASS_ADD_SKIP;
4169         else
4170             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4171     }
4172
4173     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4174     if (!SIZE_ONLY &&
4175          /* If the only flag is folding (plus possibly inversion). */
4176         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4177        ) {
4178         for (value = 0; value < 256; ++value) {
4179             if (ANYOF_BITMAP_TEST(ret, value)) {
4180                 IV fold = PL_fold[value];
4181
4182                 if (fold != value)
4183                     ANYOF_BITMAP_SET(ret, fold);
4184             }
4185         }
4186         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4187     }
4188
4189     /* optimize inverted simple patterns (e.g. [^a-z]) */
4190     if (!SIZE_ONLY && optimize_invert &&
4191         /* If the only flag is inversion. */
4192         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4193         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4194             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4195         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4196     }
4197
4198     if (!SIZE_ONLY) {
4199         AV *av = newAV();
4200         SV *rv;
4201
4202         /* The 0th element stores the character class description
4203          * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4204          * to initialize the appropriate swash (which gets stored in
4205          * the 1st element), and also useful for dumping the regnode.
4206          * The 2nd element stores the multicharacter foldings,
4207          * used later (regexec.c:s_reginclasslen()). */
4208         av_store(av, 0, listsv);
4209         av_store(av, 1, NULL);
4210         av_store(av, 2, (SV*)unicode_alternate);
4211         rv = newRV_noinc((SV*)av);
4212         n = add_data(pRExC_state, 1, "s");
4213         RExC_rx->data->data[n] = (void*)rv;
4214         ARG_SET(ret, n);
4215     }
4216
4217     return ret;
4218 }
4219
4220 STATIC char*
4221 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4222 {
4223     char* retval = RExC_parse++;
4224
4225     for (;;) {
4226         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4227                 RExC_parse[2] == '#') {
4228             while (*RExC_parse && *RExC_parse != ')')
4229                 RExC_parse++;
4230             RExC_parse++;
4231             continue;
4232         }
4233         if (RExC_flags16 & PMf_EXTENDED) {
4234             if (isSPACE(*RExC_parse)) {
4235                 RExC_parse++;
4236                 continue;
4237             }
4238             else if (*RExC_parse == '#') {
4239                 while (*RExC_parse && *RExC_parse != '\n')
4240                     RExC_parse++;
4241                 RExC_parse++;
4242                 continue;
4243             }
4244         }
4245         return retval;
4246     }
4247 }
4248
4249 /*
4250 - reg_node - emit a node
4251 */
4252 STATIC regnode *                        /* Location. */
4253 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4254 {
4255     register regnode *ret;
4256     register regnode *ptr;
4257
4258     ret = RExC_emit;
4259     if (SIZE_ONLY) {
4260         SIZE_ALIGN(RExC_size);
4261         RExC_size += 1;
4262         return(ret);
4263     }
4264
4265     NODE_ALIGN_FILL(ret);
4266     ptr = ret;
4267     FILL_ADVANCE_NODE(ptr, op);
4268     if (RExC_offsets) {         /* MJD */
4269       MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4270               "reg_node", __LINE__, 
4271               reg_name[op],
4272               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4273               ? "Overwriting end of array!\n" : "OK",
4274               RExC_emit - RExC_emit_start,
4275               RExC_parse - RExC_start,
4276               RExC_offsets[0])); 
4277       Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4278     }
4279             
4280     RExC_emit = ptr;
4281
4282     return(ret);
4283 }
4284
4285 /*
4286 - reganode - emit a node with an argument
4287 */
4288 STATIC regnode *                        /* Location. */
4289 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4290 {
4291     register regnode *ret;
4292     register regnode *ptr;
4293
4294     ret = RExC_emit;
4295     if (SIZE_ONLY) {
4296         SIZE_ALIGN(RExC_size);
4297         RExC_size += 2;
4298         return(ret);
4299     }
4300
4301     NODE_ALIGN_FILL(ret);
4302     ptr = ret;
4303     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4304     if (RExC_offsets) {         /* MJD */
4305       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4306               "reganode",
4307               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4308               "Overwriting end of array!\n" : "OK",
4309               RExC_emit - RExC_emit_start,
4310               RExC_parse - RExC_start,
4311               RExC_offsets[0])); 
4312       Set_Cur_Node_Offset;
4313     }
4314             
4315     RExC_emit = ptr;
4316
4317     return(ret);
4318 }
4319
4320 /*
4321 - reguni - emit (if appropriate) a Unicode character
4322 */
4323 STATIC void
4324 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4325 {
4326     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4327 }
4328
4329 /*
4330 - reginsert - insert an operator in front of already-emitted operand
4331 *
4332 * Means relocating the operand.
4333 */
4334 STATIC void
4335 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4336 {
4337     register regnode *src;
4338     register regnode *dst;
4339     register regnode *place;
4340     register int offset = regarglen[(U8)op];
4341
4342 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4343
4344     if (SIZE_ONLY) {
4345         RExC_size += NODE_STEP_REGNODE + offset;
4346         return;
4347     }
4348
4349     src = RExC_emit;
4350     RExC_emit += NODE_STEP_REGNODE + offset;
4351     dst = RExC_emit;
4352     while (src > opnd) {
4353         StructCopy(--src, --dst, regnode);
4354         if (RExC_offsets) {     /* MJD 20010112 */
4355           MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", 
4356                   "reg_insert",
4357                   dst - RExC_emit_start > RExC_offsets[0] 
4358                   ? "Overwriting end of array!\n" : "OK",
4359                   src - RExC_emit_start,
4360                   dst - RExC_emit_start,
4361                   RExC_offsets[0])); 
4362           Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4363           Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4364         }
4365     }
4366     
4367
4368     place = opnd;               /* Op node, where operand used to be. */
4369     if (RExC_offsets) {         /* MJD */
4370       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4371               "reginsert",
4372               place - RExC_emit_start > RExC_offsets[0] 
4373               ? "Overwriting end of array!\n" : "OK",
4374               place - RExC_emit_start,
4375               RExC_parse - RExC_start,
4376               RExC_offsets[0])); 
4377       Set_Node_Offset(place, RExC_parse);
4378     }
4379     src = NEXTOPER(place);
4380     FILL_ADVANCE_NODE(place, op);
4381     Zero(src, offset, regnode);
4382 }
4383
4384 /*
4385 - regtail - set the next-pointer at the end of a node chain of p to val.
4386 */
4387 STATIC void
4388 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4389 {
4390     register regnode *scan;
4391     register regnode *temp;
4392
4393     if (SIZE_ONLY)
4394         return;
4395
4396     /* Find last node. */
4397     scan = p;
4398     for (;;) {
4399         temp = regnext(scan);
4400         if (temp == NULL)
4401             break;
4402         scan = temp;
4403     }
4404
4405     if (reg_off_by_arg[OP(scan)]) {
4406         ARG_SET(scan, val - scan);
4407     }
4408     else {
4409         NEXT_OFF(scan) = val - scan;
4410     }
4411 }
4412
4413 /*
4414 - regoptail - regtail on operand of first argument; nop if operandless
4415 */
4416 STATIC void
4417 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4418 {
4419     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4420     if (p == NULL || SIZE_ONLY)
4421         return;
4422     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4423         regtail(pRExC_state, NEXTOPER(p), val);
4424     }
4425     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4426         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4427     }
4428     else
4429         return;
4430 }
4431
4432 /*
4433  - regcurly - a little FSA that accepts {\d+,?\d*}
4434  */
4435 STATIC I32
4436 S_regcurly(pTHX_ register char *s)
4437 {
4438     if (*s++ != '{')
4439         return FALSE;
4440     if (!isDIGIT(*s))
4441         return FALSE;
4442     while (isDIGIT(*s))
4443         s++;
4444     if (*s == ',')
4445         s++;
4446     while (isDIGIT(*s))
4447         s++;
4448     if (*s != '}')
4449         return FALSE;
4450     return TRUE;
4451 }
4452
4453
4454 #ifdef DEBUGGING
4455
4456 STATIC regnode *
4457 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4458 {
4459     register U8 op = EXACT;     /* Arbitrary non-END op. */
4460     register regnode *next;
4461
4462     while (op != END && (!last || node < last)) {
4463         /* While that wasn't END last time... */
4464
4465         NODE_ALIGN(node);
4466         op = OP(node);
4467         if (op == CLOSE)
4468             l--;        
4469         next = regnext(node);
4470         /* Where, what. */
4471         if (OP(node) == OPTIMIZED)
4472             goto after_print;
4473         regprop(sv, node);
4474         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4475                       (int)(2*l + 1), "", SvPVX(sv));
4476         if (next == NULL)               /* Next ptr. */
4477             PerlIO_printf(Perl_debug_log, "(0)");
4478         else
4479             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4480         (void)PerlIO_putc(Perl_debug_log, '\n');
4481       after_print:
4482         if (PL_regkind[(U8)op] == BRANCHJ) {
4483             register regnode *nnode = (OP(next) == LONGJMP
4484                                        ? regnext(next)
4485                                        : next);
4486             if (last && nnode > last)
4487                 nnode = last;
4488             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4489         }
4490         else if (PL_regkind[(U8)op] == BRANCH) {
4491             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4492         }
4493         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4494             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4495                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4496         }
4497         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4498             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4499                              next, sv, l + 1);
4500         }
4501         else if ( op == PLUS || op == STAR) {
4502             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4503         }
4504         else if (op == ANYOF) {
4505             /* arglen 1 + class block */
4506             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4507                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4508             node = NEXTOPER(node);
4509         }
4510         else if (PL_regkind[(U8)op] == EXACT) {
4511             /* Literal string, where present. */
4512             node += NODE_SZ_STR(node) - 1;
4513             node = NEXTOPER(node);
4514         }
4515         else {
4516             node = NEXTOPER(node);
4517             node += regarglen[(U8)op];
4518         }
4519         if (op == CURLYX || op == OPEN)
4520             l++;
4521         else if (op == WHILEM)
4522             l--;
4523     }
4524     return node;
4525 }
4526
4527 #endif  /* DEBUGGING */
4528
4529 /*
4530  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4531  */
4532 void
4533 Perl_regdump(pTHX_ regexp *r)
4534 {
4535 #ifdef DEBUGGING
4536     SV *sv = sv_newmortal();
4537
4538     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4539
4540     /* Header fields of interest. */
4541     if (r->anchored_substr)
4542         PerlIO_printf(Perl_debug_log,
4543                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4544                       PL_colors[0],
4545                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4546                       SvPVX(r->anchored_substr),
4547                       PL_colors[1],
4548                       SvTAIL(r->anchored_substr) ? "$" : "",
4549                       (IV)r->anchored_offset);
4550     else if (r->anchored_utf8)
4551         PerlIO_printf(Perl_debug_log,
4552                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4553                       PL_colors[0],
4554                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4555                       SvPVX(r->anchored_utf8),
4556                       PL_colors[1],
4557                       SvTAIL(r->anchored_utf8) ? "$" : "",
4558                       (IV)r->anchored_offset);
4559     if (r->float_substr)
4560         PerlIO_printf(Perl_debug_log,
4561                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4562                       PL_colors[0],
4563                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4564                       SvPVX(r->float_substr),
4565                       PL_colors[1],
4566                       SvTAIL(r->float_substr) ? "$" : "",
4567                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4568     else if (r->float_utf8)
4569         PerlIO_printf(Perl_debug_log,
4570                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4571                       PL_colors[0],
4572                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4573                       SvPVX(r->float_utf8),
4574                       PL_colors[1],
4575                       SvTAIL(r->float_utf8) ? "$" : "",
4576                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4577     if (r->check_substr || r->check_utf8)
4578         PerlIO_printf(Perl_debug_log,
4579                       r->check_substr == r->float_substr
4580                       && r->check_utf8 == r->float_utf8
4581                       ? "(checking floating" : "(checking anchored");
4582     if (r->reganch & ROPT_NOSCAN)
4583         PerlIO_printf(Perl_debug_log, " noscan");
4584     if (r->reganch & ROPT_CHECK_ALL)
4585         PerlIO_printf(Perl_debug_log, " isall");
4586     if (r->check_substr || r->check_utf8)
4587         PerlIO_printf(Perl_debug_log, ") ");
4588
4589     if (r->regstclass) {
4590         regprop(sv, r->regstclass);
4591         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4592     }
4593     if (r->reganch & ROPT_ANCH) {
4594         PerlIO_printf(Perl_debug_log, "anchored");
4595         if (r->reganch & ROPT_ANCH_BOL)
4596             PerlIO_printf(Perl_debug_log, "(BOL)");
4597         if (r->reganch & ROPT_ANCH_MBOL)
4598             PerlIO_printf(Perl_debug_log, "(MBOL)");
4599         if (r->reganch & ROPT_ANCH_SBOL)
4600             PerlIO_printf(Perl_debug_log, "(SBOL)");
4601         if (r->reganch & ROPT_ANCH_GPOS)
4602             PerlIO_printf(Perl_debug_log, "(GPOS)");
4603         PerlIO_putc(Perl_debug_log, ' ');
4604     }
4605     if (r->reganch & ROPT_GPOS_SEEN)
4606         PerlIO_printf(Perl_debug_log, "GPOS ");
4607     if (r->reganch & ROPT_SKIP)
4608         PerlIO_printf(Perl_debug_log, "plus ");
4609     if (r->reganch & ROPT_IMPLICIT)
4610         PerlIO_printf(Perl_debug_log, "implicit ");
4611     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4612     if (r->reganch & ROPT_EVAL_SEEN)
4613         PerlIO_printf(Perl_debug_log, "with eval ");
4614     PerlIO_printf(Perl_debug_log, "\n");
4615     if (r->offsets) {
4616       U32 i;
4617       U32 len = r->offsets[0];
4618       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4619       for (i = 1; i <= len; i++)
4620         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4621                       (UV)r->offsets[i*2-1], 
4622                       (UV)r->offsets[i*2]);
4623       PerlIO_printf(Perl_debug_log, "\n");
4624     }
4625 #endif  /* DEBUGGING */
4626 }
4627
4628 #ifdef DEBUGGING
4629
4630 STATIC void
4631 S_put_byte(pTHX_ SV *sv, int c)
4632 {
4633     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4634         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4635     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4636         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4637     else
4638         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4639 }
4640
4641 #endif  /* DEBUGGING */
4642
4643 /*
4644 - regprop - printable representation of opcode
4645 */
4646 void
4647 Perl_regprop(pTHX_ SV *sv, regnode *o)
4648 {
4649 #ifdef DEBUGGING
4650     register int k;
4651
4652     sv_setpvn(sv, "", 0);
4653     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4654         /* It would be nice to FAIL() here, but this may be called from
4655            regexec.c, and it would be hard to supply pRExC_state. */
4656         Perl_croak(aTHX_ "Corrupted regexp opcode");
4657     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4658
4659     k = PL_regkind[(U8)OP(o)];
4660
4661     if (k == EXACT) {
4662         SV *dsv = sv_2mortal(newSVpvn("", 0));
4663         /* Using is_utf8_string() is a crude hack but it may
4664          * be the best for now since we have no flag "this EXACTish
4665          * node was UTF-8" --jhi */
4666         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4667         char *s    = do_utf8 ?
4668           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4669                          UNI_DISPLAY_REGEX) :
4670           STRING(o);
4671         int len = do_utf8 ?
4672           strlen(s) :
4673           STR_LEN(o);
4674         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4675                        PL_colors[0],
4676                        len, s,
4677                        PL_colors[1]);
4678     }
4679     else if (k == CURLY) {
4680         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4681             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4682         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4683     }
4684     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4685         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4686     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4687         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4688     else if (k == LOGICAL)
4689         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4690     else if (k == ANYOF) {
4691         int i, rangestart = -1;
4692         U8 flags = ANYOF_FLAGS(o);
4693         const char * const anyofs[] = { /* Should be syncronized with
4694                                          * ANYOF_ #xdefines in regcomp.h */
4695             "\\w",
4696             "\\W",
4697             "\\s",
4698             "\\S",
4699             "\\d",
4700             "\\D",
4701             "[:alnum:]",
4702             "[:^alnum:]",
4703             "[:alpha:]",
4704             "[:^alpha:]",
4705             "[:ascii:]",
4706             "[:^ascii:]",
4707             "[:ctrl:]",
4708             "[:^ctrl:]",
4709             "[:graph:]",
4710             "[:^graph:]",
4711             "[:lower:]",
4712             "[:^lower:]",
4713             "[:print:]",
4714             "[:^print:]",
4715             "[:punct:]",
4716             "[:^punct:]",
4717             "[:upper:]",
4718             "[:^upper:]",
4719             "[:xdigit:]",
4720             "[:^xdigit:]",
4721             "[:space:]",
4722             "[:^space:]",
4723             "[:blank:]",
4724             "[:^blank:]"
4725         };
4726
4727         if (flags & ANYOF_LOCALE)
4728             sv_catpv(sv, "{loc}");
4729         if (flags & ANYOF_FOLD)
4730             sv_catpv(sv, "{i}");
4731         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4732         if (flags & ANYOF_INVERT)
4733             sv_catpv(sv, "^");
4734         for (i = 0; i <= 256; i++) {
4735             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4736                 if (rangestart == -1)
4737                     rangestart = i;
4738             } else if (rangestart != -1) {
4739                 if (i <= rangestart + 3)
4740                     for (; rangestart < i; rangestart++)
4741                         put_byte(sv, rangestart);
4742                 else {
4743                     put_byte(sv, rangestart);
4744                     sv_catpv(sv, "-");
4745                     put_byte(sv, i - 1);
4746                 }
4747                 rangestart = -1;
4748             }
4749         }
4750
4751         if (o->flags & ANYOF_CLASS)
4752             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4753                 if (ANYOF_CLASS_TEST(o,i))
4754                     sv_catpv(sv, anyofs[i]);
4755
4756         if (flags & ANYOF_UNICODE)
4757             sv_catpv(sv, "{unicode}");
4758         else if (flags & ANYOF_UNICODE_ALL)
4759             sv_catpv(sv, "{unicode_all}");
4760
4761         {
4762             SV *lv;
4763             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4764         
4765             if (lv) {
4766                 if (sw) {
4767                     UV i;
4768                     U8 s[UTF8_MAXLEN+1];
4769                 
4770                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4771                         U8 *e = uvchr_to_utf8(s, i);
4772                         
4773                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4774                             if (rangestart == -1)
4775                                 rangestart = i;
4776                         } else if (rangestart != -1) {
4777                             U8 *p;
4778                         
4779                             if (i <= rangestart + 3)
4780                                 for (; rangestart < i; rangestart++) {
4781                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4782                                         put_byte(sv, *p);
4783                                 }
4784                             else {
4785                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4786                                     put_byte(sv, *p);
4787                                 sv_catpv(sv, "-");
4788                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4789                                         put_byte(sv, *p);
4790                                 }
4791                                 rangestart = -1;
4792                             }
4793                         }
4794                         
4795                     sv_catpv(sv, "..."); /* et cetera */
4796                 }
4797
4798                 {
4799                     char *s = savepv(SvPVX(lv));
4800                     char *origs = s;
4801                 
4802                     while(*s && *s != '\n') s++;
4803                 
4804                     if (*s == '\n') {
4805                         char *t = ++s;
4806                         
4807                         while (*s) {
4808                             if (*s == '\n')
4809                                 *s = ' ';
4810                             s++;
4811                         }
4812                         if (s[-1] == ' ')
4813                             s[-1] = 0;
4814                         
4815                         sv_catpv(sv, t);
4816                     }
4817                 
4818                     Safefree(origs);
4819                 }
4820             }
4821         }
4822
4823         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4824     }
4825     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4826         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4827 #endif  /* DEBUGGING */
4828 }
4829
4830 SV *
4831 Perl_re_intuit_string(pTHX_ regexp *prog)
4832 {                               /* Assume that RE_INTUIT is set */
4833     DEBUG_r(
4834         {   STRLEN n_a;
4835             char *s = SvPV(prog->check_substr
4836                       ? prog->check_substr : prog->check_utf8, n_a);
4837
4838             if (!PL_colorset) reginitcolors();
4839             PerlIO_printf(Perl_debug_log,
4840                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4841                       PL_colors[4],
4842                       prog->check_substr ? "" : "utf8 ",
4843                       PL_colors[5],PL_colors[0],
4844                       s,
4845                       PL_colors[1],
4846                       (strlen(s) > 60 ? "..." : ""));
4847         } );
4848
4849     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4850 }
4851
4852 void
4853 Perl_pregfree(pTHX_ struct regexp *r)
4854 {
4855 #ifdef DEBUGGING
4856     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4857 #endif
4858
4859     if (!r || (--r->refcnt > 0))
4860         return;
4861     DEBUG_r({
4862          char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4863                                   UNI_DISPLAY_REGEX);
4864          int len = SvCUR(dsv);
4865          if (!PL_colorset)
4866               reginitcolors();
4867          PerlIO_printf(Perl_debug_log,
4868                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4869                        PL_colors[4],PL_colors[5],PL_colors[0],
4870                        len, len, s,
4871                        PL_colors[1],
4872                        len > 60 ? "..." : "");
4873     });
4874
4875     if (r->precomp)
4876         Safefree(r->precomp);
4877     if (r->offsets)             /* 20010421 MJD */
4878         Safefree(r->offsets);
4879     if (RX_MATCH_COPIED(r))
4880         Safefree(r->subbeg);
4881     if (r->substrs) {
4882         if (r->anchored_substr)
4883             SvREFCNT_dec(r->anchored_substr);
4884         if (r->anchored_utf8)
4885             SvREFCNT_dec(r->anchored_utf8);
4886         if (r->float_substr)
4887             SvREFCNT_dec(r->float_substr);
4888         if (r->float_utf8)
4889             SvREFCNT_dec(r->float_utf8);
4890         Safefree(r->substrs);
4891     }
4892     if (r->data) {
4893         int n = r->data->count;
4894         AV* new_comppad = NULL;
4895         AV* old_comppad;
4896         SV** old_curpad;
4897
4898         while (--n >= 0) {
4899           /* If you add a ->what type here, update the comment in regcomp.h */
4900             switch (r->data->what[n]) {
4901             case 's':
4902                 SvREFCNT_dec((SV*)r->data->data[n]);
4903                 break;
4904             case 'f':
4905                 Safefree(r->data->data[n]);
4906                 break;
4907             case 'p':
4908                 new_comppad = (AV*)r->data->data[n];
4909                 break;
4910             case 'o':
4911                 if (new_comppad == NULL)
4912                     Perl_croak(aTHX_ "panic: pregfree comppad");
4913                 old_comppad = PL_comppad;
4914                 old_curpad = PL_curpad;
4915                 /* Watch out for global destruction's random ordering. */
4916                 if (SvTYPE(new_comppad) == SVt_PVAV) {
4917                     PL_comppad = new_comppad;
4918                     PL_curpad = AvARRAY(new_comppad);
4919                 }
4920                 else
4921                     PL_curpad = NULL;
4922
4923                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4924                     op_free((OP_4tree*)r->data->data[n]);
4925                 }
4926
4927                 PL_comppad = old_comppad;
4928                 PL_curpad = old_curpad;
4929                 SvREFCNT_dec((SV*)new_comppad);
4930                 new_comppad = NULL;
4931                 break;
4932             case 'n':
4933                 break;
4934             default:
4935                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4936             }
4937         }
4938         Safefree(r->data->what);
4939         Safefree(r->data);
4940     }
4941     Safefree(r->startp);
4942     Safefree(r->endp);
4943     Safefree(r);
4944 }
4945
4946 /*
4947  - regnext - dig the "next" pointer out of a node
4948  *
4949  * [Note, when REGALIGN is defined there are two places in regmatch()
4950  * that bypass this code for speed.]
4951  */
4952 regnode *
4953 Perl_regnext(pTHX_ register regnode *p)
4954 {
4955     register I32 offset;
4956
4957     if (p == &PL_regdummy)
4958         return(NULL);
4959
4960     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4961     if (offset == 0)
4962         return(NULL);
4963
4964     return(p+offset);
4965 }
4966
4967 STATIC void     
4968 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4969 {
4970     va_list args;
4971     STRLEN l1 = strlen(pat1);
4972     STRLEN l2 = strlen(pat2);
4973     char buf[512];
4974     SV *msv;
4975     char *message;
4976
4977     if (l1 > 510)
4978         l1 = 510;
4979     if (l1 + l2 > 510)
4980         l2 = 510 - l1;
4981     Copy(pat1, buf, l1 , char);
4982     Copy(pat2, buf + l1, l2 , char);
4983     buf[l1 + l2] = '\n';
4984     buf[l1 + l2 + 1] = '\0';
4985 #ifdef I_STDARG
4986     /* ANSI variant takes additional second argument */
4987     va_start(args, pat2);
4988 #else
4989     va_start(args);
4990 #endif
4991     msv = vmess(buf, &args);
4992     va_end(args);
4993     message = SvPV(msv,l1);
4994     if (l1 > 512)
4995         l1 = 512;
4996     Copy(message, buf, l1 , char);
4997     buf[l1] = '\0';                     /* Overwrite \n */
4998     Perl_croak(aTHX_ "%s", buf);
4999 }
5000
5001 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5002
5003 void
5004 Perl_save_re_context(pTHX)
5005 {
5006 #if 0
5007     SAVEPPTR(RExC_precomp);             /* uncompiled string. */
5008     SAVEI32(RExC_npar);         /* () count. */
5009     SAVEI32(RExC_size);         /* Code size. */
5010     SAVEI16(RExC_flags16);              /* are we folding, multilining? */
5011     SAVEVPTR(RExC_rx);          /* from regcomp.c */
5012     SAVEI32(RExC_seen);         /* from regcomp.c */
5013     SAVEI32(RExC_sawback);              /* Did we see \1, ...? */
5014     SAVEI32(RExC_naughty);              /* How bad is this pattern? */
5015     SAVEVPTR(RExC_emit);                /* Code-emit pointer; &regdummy = don't */
5016     SAVEPPTR(RExC_end);         /* End of input for compile */
5017     SAVEPPTR(RExC_parse);               /* Input-scan pointer. */
5018 #endif
5019
5020     SAVEI32(PL_reg_flags);              /* from regexec.c */
5021     SAVEPPTR(PL_bostr);
5022     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5023     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5024     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5025     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5026     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5027     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5028     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5029     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5030     PL_reg_start_tmp = 0;
5031     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5032     PL_reg_start_tmpl = 0;
5033     SAVEVPTR(PL_regdata);
5034     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5035     SAVEI32(PL_regnarrate);             /* from regexec.c */
5036     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5037     SAVEINT(PL_regindent);              /* from regexec.c */
5038     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5039     SAVEVPTR(PL_curcop);
5040     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5041     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5042     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5043     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5044     SAVEI8(PL_reg_match_utf8);          /* from regexec.c */
5045     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5046     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5047     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5048     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5049     SAVEI32(PL_regnpar);                /* () count. */
5050     SAVEI32(PL_regsize);                /* from regexec.c */
5051 #ifdef DEBUGGING
5052     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5053 #endif
5054 }
5055
5056 static void
5057 clear_re(pTHX_ void *r)
5058 {
5059     ReREFCNT_dec((regexp *)r);
5060 }
5061